1

sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()

sched_ext dispatches tasks from the BPF scheduler from balance_scx() and
thus every pick_task_scx() call must be preceded by balance_scx(). While
this usually holds, due to a bug, there are cases where the fair class's
balance() returns true indicating that it has tasks to run on the CPU and
thus terminating balance() calls but fails to actually find the next task to
run when pick_task() is called. In such cases, pick_task_scx() can be called
without preceding balance_scx().

Detect this condition using SCX_RQ_BAL_PENDING flags. If detected, keep
running the previous task if possible and avoid stalling from entering idle
without balancing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/Ztj_h5c2LYsdXYbA@slm.duckdns.org
This commit is contained in:
Tejun Heo 2024-11-09 10:43:55 -10:00
parent a759bf0dfc
commit a6250aa251
3 changed files with 42 additions and 20 deletions

View File

@ -5914,12 +5914,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
#ifdef CONFIG_SCHED_CLASS_EXT #ifdef CONFIG_SCHED_CLASS_EXT
/* /*
* SCX requires a balance() call before every pick_next_task() including * SCX requires a balance() call before every pick_task() including when
* when waking up from SCHED_IDLE. If @start_class is below SCX, start * waking up from SCHED_IDLE. If @start_class is below SCX, start from
* from SCX instead. * SCX instead. Also, set a flag to detect missing balance() call.
*/ */
if (scx_enabled() && sched_class_above(&ext_sched_class, start_class)) if (scx_enabled()) {
start_class = &ext_sched_class; rq->scx.flags |= SCX_RQ_BAL_PENDING;
if (sched_class_above(&ext_sched_class, start_class))
start_class = &ext_sched_class;
}
#endif #endif
/* /*

View File

@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
lockdep_assert_rq_held(rq); lockdep_assert_rq_held(rq);
rq->scx.flags |= SCX_RQ_IN_BALANCE; rq->scx.flags |= SCX_RQ_IN_BALANCE;
rq->scx.flags &= ~SCX_RQ_BAL_KEEP; rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
if (static_branch_unlikely(&scx_ops_cpu_preempt) && if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
unlikely(rq->scx.cpu_released)) { unlikely(rq->scx.cpu_released)) {
@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
{ {
struct task_struct *prev = rq->curr; struct task_struct *prev = rq->curr;
struct task_struct *p; struct task_struct *p;
bool prev_on_scx = prev->sched_class == &ext_sched_class;
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
bool kick_idle = false;
/* /*
* If balance_scx() is telling us to keep running @prev, replenish slice
* if necessary and keep running @prev. Otherwise, pop the first one
* from the local DSQ.
*
* WORKAROUND: * WORKAROUND:
* *
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just * %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
* which then ends up calling pick_task_scx() without preceding * which then ends up calling pick_task_scx() without preceding
* balance_scx(). * balance_scx().
* *
* For now, ignore cases where $prev is not on SCX. This isn't great and * Keep running @prev if possible and avoid stalling from entering idle
* can theoretically lead to stalls. However, for switch_all cases, this * without balancing.
* happens only while a BPF scheduler is being loaded or unloaded, and,
* for partial cases, fair will likely keep triggering this CPU.
* *
* Once fair is fixed, restore WARN_ON_ONCE(). * Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
* if pick_task_scx() is called without preceding balance_scx().
*/ */
if ((rq->scx.flags & SCX_RQ_BAL_KEEP) && if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
prev->sched_class == &ext_sched_class) { if (prev_on_scx) {
keep_prev = true;
} else {
keep_prev = false;
kick_idle = true;
}
} else if (unlikely(keep_prev && !prev_on_scx)) {
/* only allowed during transitions */
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
keep_prev = false;
}
/*
* If balance_scx() is telling us to keep running @prev, replenish slice
* if necessary and keep running @prev. Otherwise, pop the first one
* from the local DSQ.
*/
if (keep_prev) {
p = prev; p = prev;
if (!p->scx.slice) if (!p->scx.slice)
p->scx.slice = SCX_SLICE_DFL; p->scx.slice = SCX_SLICE_DFL;
} else { } else {
p = first_local_task(rq); p = first_local_task(rq);
if (!p) if (!p) {
if (kick_idle)
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
return NULL; return NULL;
}
if (unlikely(!p->scx.slice)) { if (unlikely(!p->scx.slice)) {
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) { if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {

View File

@ -751,8 +751,9 @@ enum scx_rq_flags {
*/ */
SCX_RQ_ONLINE = 1 << 0, SCX_RQ_ONLINE = 1 << 0,
SCX_RQ_CAN_STOP_TICK = 1 << 1, SCX_RQ_CAN_STOP_TICK = 1 << 1,
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */ SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
SCX_RQ_BYPASSING = 1 << 3, SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
SCX_RQ_BYPASSING = 1 << 4,
SCX_RQ_IN_WAKEUP = 1 << 16, SCX_RQ_IN_WAKEUP = 1 << 16,
SCX_RQ_IN_BALANCE = 1 << 17, SCX_RQ_IN_BALANCE = 1 << 17,