sched/psi: Fix mistaken CPU pressure indication after corrupted task state bug
Since sched_delayed tasks remain queued even after blocking, the load
balancer can migrate them between runqueues while PSI considers them
to be asleep. As a result, it misreads the migration requeue followed
by a wakeup as a double queue:
psi: inconsistent task state! task=... cpu=... psi_flags=4 clear=. set=4
First, call psi_enqueue() after p->sched_class->enqueue_task(). A
wakeup will clear p->se.sched_delayed while a migration will not, so
psi can use that flag to tell them apart.
Then teach psi to migrate any "sleep" state when delayed-dequeue tasks
are being migrated.
Delayed-dequeue tasks can be revived by ttwu_runnable(), which will
call down with a new ENQUEUE_DELAYED. Instead of further complicating
the wakeup conditional in enqueue_task(), identify migration contexts
instead and default to wakeup handling for all other cases.
It's not just the warning in dmesg, the task state corruption causes a
permanent CPU pressure indication, which messes with workload/machine
health monitoring.
Debugged-by-and-original-fix-by: K Prateek Nayak <kprateek.nayak@amd.com>
Fixes: 152e11f6df
("sched/fair: Implement delayed dequeue")
Closes: https://lore.kernel.org/lkml/20240830123458.3557-1-spasswolf@web.de/
Closes: https://lore.kernel.org/all/cd67fbcd-d659-4822-bb90-7e8fbb40a856@molgen.mpg.de/
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Tested-by: K Prateek Nayak <kprateek.nayak@amd.com>
Link: https://lkml.kernel.org/r/20241010193712.GC181795@cmpxchg.org
This commit is contained in:
parent
f5aaff7bfa
commit
c650812419
@ -2012,11 +2012,6 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
if (!(flags & ENQUEUE_NOCLOCK))
|
||||
update_rq_clock(rq);
|
||||
|
||||
if (!(flags & ENQUEUE_RESTORE)) {
|
||||
sched_info_enqueue(rq, p);
|
||||
psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED));
|
||||
}
|
||||
|
||||
p->sched_class->enqueue_task(rq, p, flags);
|
||||
/*
|
||||
* Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear
|
||||
@ -2024,6 +2019,11 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
*/
|
||||
uclamp_rq_inc(rq, p);
|
||||
|
||||
if (!(flags & ENQUEUE_RESTORE)) {
|
||||
sched_info_enqueue(rq, p);
|
||||
psi_enqueue(p, flags & ENQUEUE_MIGRATED);
|
||||
}
|
||||
|
||||
if (sched_core_enabled(rq))
|
||||
sched_core_enqueue(rq, p);
|
||||
}
|
||||
@ -2041,7 +2041,7 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
|
||||
|
||||
if (!(flags & DEQUEUE_SAVE)) {
|
||||
sched_info_dequeue(rq, p);
|
||||
psi_dequeue(p, flags & DEQUEUE_SLEEP);
|
||||
psi_dequeue(p, !(flags & DEQUEUE_SLEEP));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -119,45 +119,63 @@ static inline void psi_account_irqtime(struct rq *rq, struct task_struct *curr,
|
||||
/*
|
||||
* PSI tracks state that persists across sleeps, such as iowaits and
|
||||
* memory stalls. As a result, it has to distinguish between sleeps,
|
||||
* where a task's runnable state changes, and requeues, where a task
|
||||
* and its state are being moved between CPUs and runqueues.
|
||||
* where a task's runnable state changes, and migrations, where a task
|
||||
* and its runnable state are being moved between CPUs and runqueues.
|
||||
*
|
||||
* A notable case is a task whose dequeue is delayed. PSI considers
|
||||
* those sleeping, but because they are still on the runqueue they can
|
||||
* go through migration requeues. In this case, *sleeping* states need
|
||||
* to be transferred.
|
||||
*/
|
||||
static inline void psi_enqueue(struct task_struct *p, bool wakeup)
|
||||
static inline void psi_enqueue(struct task_struct *p, bool migrate)
|
||||
{
|
||||
int clear = 0, set = TSK_RUNNING;
|
||||
int clear = 0, set = 0;
|
||||
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL_RUNNING;
|
||||
|
||||
if (!wakeup) {
|
||||
if (p->se.sched_delayed) {
|
||||
/* CPU migration of "sleeping" task */
|
||||
SCHED_WARN_ON(!migrate);
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL;
|
||||
if (p->in_iowait)
|
||||
set |= TSK_IOWAIT;
|
||||
} else if (migrate) {
|
||||
/* CPU migration of runnable task */
|
||||
set = TSK_RUNNING;
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL | TSK_MEMSTALL_RUNNING;
|
||||
} else {
|
||||
/* Wakeup of new or sleeping task */
|
||||
if (p->in_iowait)
|
||||
clear |= TSK_IOWAIT;
|
||||
set = TSK_RUNNING;
|
||||
if (p->in_memstall)
|
||||
set |= TSK_MEMSTALL_RUNNING;
|
||||
}
|
||||
|
||||
psi_task_change(p, clear, set);
|
||||
}
|
||||
|
||||
static inline void psi_dequeue(struct task_struct *p, bool sleep)
|
||||
static inline void psi_dequeue(struct task_struct *p, bool migrate)
|
||||
{
|
||||
if (static_branch_likely(&psi_disabled))
|
||||
return;
|
||||
|
||||
/*
|
||||
* When migrating a task to another CPU, clear all psi
|
||||
* state. The enqueue callback above will work it out.
|
||||
*/
|
||||
if (migrate)
|
||||
psi_task_change(p, p->psi_flags, 0);
|
||||
|
||||
/*
|
||||
* A voluntary sleep is a dequeue followed by a task switch. To
|
||||
* avoid walking all ancestors twice, psi_task_switch() handles
|
||||
* TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
|
||||
* Do nothing here.
|
||||
*/
|
||||
if (sleep)
|
||||
return;
|
||||
|
||||
psi_task_change(p, p->psi_flags, 0);
|
||||
}
|
||||
|
||||
static inline void psi_ttwu_dequeue(struct task_struct *p)
|
||||
@ -190,8 +208,8 @@ static inline void psi_sched_switch(struct task_struct *prev,
|
||||
}
|
||||
|
||||
#else /* CONFIG_PSI */
|
||||
static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
|
||||
static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
|
||||
static inline void psi_enqueue(struct task_struct *p, bool migrate) {}
|
||||
static inline void psi_dequeue(struct task_struct *p, bool migrate) {}
|
||||
static inline void psi_ttwu_dequeue(struct task_struct *p) {}
|
||||
static inline void psi_sched_switch(struct task_struct *prev,
|
||||
struct task_struct *next,
|
||||
|
Loading…
Reference in New Issue
Block a user