1
linux/include/trace/events/sched.h
Thomas Gleixner f2530dc71c kthread: Prevent unpark race which puts threads on the wrong cpu
The smpboot threads rely on the park/unpark mechanism which binds per
cpu threads on a particular core. Though the functionality is racy:

CPU0	       	 	CPU1  	     	    CPU2
unpark(T)				    wake_up_process(T)
  clear(SHOULD_PARK)	T runs
			leave parkme() due to !SHOULD_PARK  
  bind_to(CPU2)		BUG_ON(wrong CPU)						    

We cannot let the tasks move themself to the target CPU as one of
those tasks is actually the migration thread itself, which requires
that it starts running on the target cpu right away.

The solution to this problem is to prevent wakeups in park mode which
are not from unpark(). That way we can guarantee that the association
of the task to the target cpu is working correctly.

Add a new task state (TASK_PARKED) which prevents other wakeups and
use this state explicitly for the unpark wakeup.

Peter noticed: Also, since the task state is visible to userspace and
all the parked tasks are still in the PID space, its a good hint in ps
and friends that these tasks aren't really there for the moment.

The migration thread has another related issue.

CPU0	      	     	 CPU1
Bring up CPU2
create_thread(T)
park(T)
 wait_for_completion()
			 parkme()
			 complete()
sched_set_stop_task()
			 schedule(TASK_PARKED)

The sched_set_stop_task() call is issued while the task is on the
runqueue of CPU1 and that confuses the hell out of the stop_task class
on that cpu. So we need the same synchronizaion before
sched_set_stop_task().

Reported-by: Dave Jones <davej@redhat.com>
Reported-and-tested-by: Dave Hansen <dave@sr71.net>
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Acked-by: Peter Ziljstra <peterz@infradead.org>
Cc: Srivatsa S. Bhat <srivatsa.bhat@linux.vnet.ibm.com>
Cc: dhillf@gmail.com
Cc: Ingo Molnar <mingo@kernel.org>
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304091635430.21884@ionos
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2013-04-12 14:18:43 +02:00

437 lines
9.8 KiB
C

#undef TRACE_SYSTEM
#define TRACE_SYSTEM sched
#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_SCHED_H
#include <linux/sched.h>
#include <linux/tracepoint.h>
#include <linux/binfmts.h>
/*
* Tracepoint for calling kthread_stop, performed to end a kthread:
*/
TRACE_EVENT(sched_kthread_stop,
TP_PROTO(struct task_struct *t),
TP_ARGS(t),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
),
TP_fast_assign(
memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
__entry->pid = t->pid;
),
TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid)
);
/*
* Tracepoint for the return value of the kthread stopping:
*/
TRACE_EVENT(sched_kthread_stop_ret,
TP_PROTO(int ret),
TP_ARGS(ret),
TP_STRUCT__entry(
__field( int, ret )
),
TP_fast_assign(
__entry->ret = ret;
),
TP_printk("ret=%d", __entry->ret)
);
/*
* Tracepoint for waking up a task:
*/
DECLARE_EVENT_CLASS(sched_wakeup_template,
TP_PROTO(struct task_struct *p, int success),
TP_ARGS(p, success),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, prio )
__field( int, success )
__field( int, target_cpu )
),
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->prio = p->prio;
__entry->success = success;
__entry->target_cpu = task_cpu(p);
)
TP_perf_assign(
__perf_task(p);
),
TP_printk("comm=%s pid=%d prio=%d success=%d target_cpu=%03d",
__entry->comm, __entry->pid, __entry->prio,
__entry->success, __entry->target_cpu)
);
DEFINE_EVENT(sched_wakeup_template, sched_wakeup,
TP_PROTO(struct task_struct *p, int success),
TP_ARGS(p, success));
/*
* Tracepoint for waking up a new task:
*/
DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
TP_PROTO(struct task_struct *p, int success),
TP_ARGS(p, success));
#ifdef CREATE_TRACE_POINTS
static inline long __trace_sched_switch_state(struct task_struct *p)
{
long state = p->state;
#ifdef CONFIG_PREEMPT
/*
* For all intents and purposes a preempted task is a running task.
*/
if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
state = TASK_RUNNING | TASK_STATE_MAX;
#endif
return state;
}
#endif
/*
* Tracepoint for task switches, performed by the scheduler:
*/
TRACE_EVENT(sched_switch,
TP_PROTO(struct task_struct *prev,
struct task_struct *next),
TP_ARGS(prev, next),
TP_STRUCT__entry(
__array( char, prev_comm, TASK_COMM_LEN )
__field( pid_t, prev_pid )
__field( int, prev_prio )
__field( long, prev_state )
__array( char, next_comm, TASK_COMM_LEN )
__field( pid_t, next_pid )
__field( int, next_prio )
),
TP_fast_assign(
memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
__entry->prev_pid = prev->pid;
__entry->prev_prio = prev->prio;
__entry->prev_state = __trace_sched_switch_state(prev);
memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
__entry->next_pid = next->pid;
__entry->next_prio = next->prio;
),
TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
__entry->prev_state & (TASK_STATE_MAX-1) ?
__print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
{ 16, "Z" }, { 32, "X" }, { 64, "x" },
{ 128, "K" }, { 256, "W" }, { 512, "P" }) : "R",
__entry->prev_state & TASK_STATE_MAX ? "+" : "",
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);
/*
* Tracepoint for a task being migrated:
*/
TRACE_EVENT(sched_migrate_task,
TP_PROTO(struct task_struct *p, int dest_cpu),
TP_ARGS(p, dest_cpu),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, prio )
__field( int, orig_cpu )
__field( int, dest_cpu )
),
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->prio = p->prio;
__entry->orig_cpu = task_cpu(p);
__entry->dest_cpu = dest_cpu;
),
TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d",
__entry->comm, __entry->pid, __entry->prio,
__entry->orig_cpu, __entry->dest_cpu)
);
DECLARE_EVENT_CLASS(sched_process_template,
TP_PROTO(struct task_struct *p),
TP_ARGS(p),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, prio )
),
TP_fast_assign(
memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
__entry->pid = p->pid;
__entry->prio = p->prio;
),
TP_printk("comm=%s pid=%d prio=%d",
__entry->comm, __entry->pid, __entry->prio)
);
/*
* Tracepoint for freeing a task:
*/
DEFINE_EVENT(sched_process_template, sched_process_free,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
/*
* Tracepoint for a task exiting:
*/
DEFINE_EVENT(sched_process_template, sched_process_exit,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
/*
* Tracepoint for waiting on task to unschedule:
*/
DEFINE_EVENT(sched_process_template, sched_wait_task,
TP_PROTO(struct task_struct *p),
TP_ARGS(p));
/*
* Tracepoint for a waiting task:
*/
TRACE_EVENT(sched_process_wait,
TP_PROTO(struct pid *pid),
TP_ARGS(pid),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, prio )
),
TP_fast_assign(
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
__entry->pid = pid_nr(pid);
__entry->prio = current->prio;
),
TP_printk("comm=%s pid=%d prio=%d",
__entry->comm, __entry->pid, __entry->prio)
);
/*
* Tracepoint for do_fork:
*/
TRACE_EVENT(sched_process_fork,
TP_PROTO(struct task_struct *parent, struct task_struct *child),
TP_ARGS(parent, child),
TP_STRUCT__entry(
__array( char, parent_comm, TASK_COMM_LEN )
__field( pid_t, parent_pid )
__array( char, child_comm, TASK_COMM_LEN )
__field( pid_t, child_pid )
),
TP_fast_assign(
memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
__entry->parent_pid = parent->pid;
memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
__entry->child_pid = child->pid;
),
TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d",
__entry->parent_comm, __entry->parent_pid,
__entry->child_comm, __entry->child_pid)
);
/*
* Tracepoint for exec:
*/
TRACE_EVENT(sched_process_exec,
TP_PROTO(struct task_struct *p, pid_t old_pid,
struct linux_binprm *bprm),
TP_ARGS(p, old_pid, bprm),
TP_STRUCT__entry(
__string( filename, bprm->filename )
__field( pid_t, pid )
__field( pid_t, old_pid )
),
TP_fast_assign(
__assign_str(filename, bprm->filename);
__entry->pid = p->pid;
__entry->old_pid = old_pid;
),
TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename),
__entry->pid, __entry->old_pid)
);
/*
* XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE
* adding sched_stat support to SCHED_FIFO/RR would be welcome.
*/
DECLARE_EVENT_CLASS(sched_stat_template,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, delay )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->delay = delay;
)
TP_perf_assign(
__perf_count(delay);
__perf_task(tsk);
),
TP_printk("comm=%s pid=%d delay=%Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->delay)
);
/*
* Tracepoint for accounting wait time (time the task is runnable
* but not actually running due to scheduler contention).
*/
DEFINE_EVENT(sched_stat_template, sched_stat_wait,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay));
/*
* Tracepoint for accounting sleep time (time the task is not runnable,
* including iowait, see below).
*/
DEFINE_EVENT(sched_stat_template, sched_stat_sleep,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay));
/*
* Tracepoint for accounting iowait time (time the task is not runnable
* due to waiting on IO to complete).
*/
DEFINE_EVENT(sched_stat_template, sched_stat_iowait,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay));
/*
* Tracepoint for accounting blocked time (time the task is in uninterruptible).
*/
DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
TP_PROTO(struct task_struct *tsk, u64 delay),
TP_ARGS(tsk, delay));
/*
* Tracepoint for accounting runtime (time the task is executing
* on a CPU).
*/
TRACE_EVENT(sched_stat_runtime,
TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
TP_ARGS(tsk, runtime, vruntime),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( u64, runtime )
__field( u64, vruntime )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->runtime = runtime;
__entry->vruntime = vruntime;
)
TP_perf_assign(
__perf_count(runtime);
),
TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]",
__entry->comm, __entry->pid,
(unsigned long long)__entry->runtime,
(unsigned long long)__entry->vruntime)
);
/*
* Tracepoint for showing priority inheritance modifying a tasks
* priority.
*/
TRACE_EVENT(sched_pi_setprio,
TP_PROTO(struct task_struct *tsk, int newprio),
TP_ARGS(tsk, newprio),
TP_STRUCT__entry(
__array( char, comm, TASK_COMM_LEN )
__field( pid_t, pid )
__field( int, oldprio )
__field( int, newprio )
),
TP_fast_assign(
memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
__entry->pid = tsk->pid;
__entry->oldprio = tsk->prio;
__entry->newprio = newprio;
),
TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
__entry->comm, __entry->pid,
__entry->oldprio, __entry->newprio)
);
#endif /* _TRACE_SCHED_H */
/* This part must be outside protection */
#include <trace/define_trace.h>