sched_ext: Fixes for v6.12-rc7
- The fair sched class currently has a bug where its balance() returns true, telling the sched core that it has tasks to run, but then returns NULL from pick_task(). This makes the sched core call sched_ext's pick_task() without a preceding balance(), which can lead to stalls in partial mode. For now, work around by detecting the condition and forcing the CPU to go through another scheduling cycle.
- Add a missing newline to an error message and fix the drgn introspection tool, which went out of sync.

-----BEGIN PGP SIGNATURE-----
iIQEABYKACwWIQTfIjM1kS57o3GsC/uxYfJx3gVYGQUCZzI8sw4cdGpAa2VybmVs
Lm9yZwAKCRCxYfJx3gVYGb5KAP40b/o6TyAFDG+Hn6GxyxQT7rcAUMXsdB2bcEpg
/IjmzQEAwbHU5KP5vQXV6XHv+2V7Rs7u6ZqFtDnL88N0A9hf3wk=
=7hL8
-----END PGP SIGNATURE-----

Merge tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext

Pull sched_ext fixes from Tejun Heo:
- The fair sched class currently has a bug where its balance() returns true, telling the sched core that it has tasks to run, but then returns NULL from pick_task(). This makes the sched core call sched_ext's pick_task() without a preceding balance(), which can lead to stalls in partial mode. For now, work around by detecting the condition and forcing the CPU to go through another scheduling cycle.
- Add a missing newline to an error message and fix the drgn introspection tool, which went out of sync.

* tag 'sched_ext-for-6.12-rc7-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext:
  sched_ext: Handle cases where pick_task_scx() is called without preceding balance_scx()
  sched_ext: Update scx_show_state.py to match scx_ops_bypass_depth's new type
  sched_ext: Add a missing newline at the end of an error message
This commit is contained in:
commit
3022e9d00e
@ -5920,12 +5920,15 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
|
||||
|
||||
#ifdef CONFIG_SCHED_CLASS_EXT
|
||||
/*
|
||||
* SCX requires a balance() call before every pick_next_task() including
|
||||
* when waking up from SCHED_IDLE. If @start_class is below SCX, start
|
||||
* from SCX instead.
|
||||
* SCX requires a balance() call before every pick_task() including when
|
||||
* waking up from SCHED_IDLE. If @start_class is below SCX, start from
|
||||
* SCX instead. Also, set a flag to detect missing balance() call.
|
||||
*/
|
||||
if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
|
||||
start_class = &ext_sched_class;
|
||||
if (scx_enabled()) {
|
||||
rq->scx.flags |= SCX_RQ_BAL_PENDING;
|
||||
if (sched_class_above(&ext_sched_class, start_class))
|
||||
start_class = &ext_sched_class;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
|
@ -2634,7 +2634,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
|
||||
|
||||
lockdep_assert_rq_held(rq);
|
||||
rq->scx.flags |= SCX_RQ_IN_BALANCE;
|
||||
rq->scx.flags &= ~SCX_RQ_BAL_KEEP;
|
||||
rq->scx.flags &= ~(SCX_RQ_BAL_PENDING | SCX_RQ_BAL_KEEP);
|
||||
|
||||
if (static_branch_unlikely(&scx_ops_cpu_preempt) &&
|
||||
unlikely(rq->scx.cpu_released)) {
|
||||
@ -2948,12 +2948,11 @@ static struct task_struct *pick_task_scx(struct rq *rq)
|
||||
{
|
||||
struct task_struct *prev = rq->curr;
|
||||
struct task_struct *p;
|
||||
bool prev_on_scx = prev->sched_class == &ext_sched_class;
|
||||
bool keep_prev = rq->scx.flags & SCX_RQ_BAL_KEEP;
|
||||
bool kick_idle = false;
|
||||
|
||||
/*
|
||||
* If balance_scx() is telling us to keep running @prev, replenish slice
|
||||
* if necessary and keep running @prev. Otherwise, pop the first one
|
||||
* from the local DSQ.
|
||||
*
|
||||
* WORKAROUND:
|
||||
*
|
||||
* %SCX_RQ_BAL_KEEP should be set iff $prev is on SCX as it must just
|
||||
@ -2962,22 +2961,41 @@ static struct task_struct *pick_task_scx(struct rq *rq)
|
||||
* which then ends up calling pick_task_scx() without preceding
|
||||
* balance_scx().
|
||||
*
|
||||
* For now, ignore cases where $prev is not on SCX. This isn't great and
|
||||
* can theoretically lead to stalls. However, for switch_all cases, this
|
||||
* happens only while a BPF scheduler is being loaded or unloaded, and,
|
||||
* for partial cases, fair will likely keep triggering this CPU.
|
||||
* Keep running @prev if possible and avoid stalling from entering idle
|
||||
* without balancing.
|
||||
*
|
||||
* Once fair is fixed, restore WARN_ON_ONCE().
|
||||
* Once fair is fixed, remove the workaround and trigger WARN_ON_ONCE()
|
||||
* if pick_task_scx() is called without preceding balance_scx().
|
||||
*/
|
||||
if ((rq->scx.flags & SCX_RQ_BAL_KEEP) &&
|
||||
prev->sched_class == &ext_sched_class) {
|
||||
if (unlikely(rq->scx.flags & SCX_RQ_BAL_PENDING)) {
|
||||
if (prev_on_scx) {
|
||||
keep_prev = true;
|
||||
} else {
|
||||
keep_prev = false;
|
||||
kick_idle = true;
|
||||
}
|
||||
} else if (unlikely(keep_prev && !prev_on_scx)) {
|
||||
/* only allowed during transitions */
|
||||
WARN_ON_ONCE(scx_ops_enable_state() == SCX_OPS_ENABLED);
|
||||
keep_prev = false;
|
||||
}
|
||||
|
||||
/*
|
||||
* If balance_scx() is telling us to keep running @prev, replenish slice
|
||||
* if necessary and keep running @prev. Otherwise, pop the first one
|
||||
* from the local DSQ.
|
||||
*/
|
||||
if (keep_prev) {
|
||||
p = prev;
|
||||
if (!p->scx.slice)
|
||||
p->scx.slice = SCX_SLICE_DFL;
|
||||
} else {
|
||||
p = first_local_task(rq);
|
||||
if (!p)
|
||||
if (!p) {
|
||||
if (kick_idle)
|
||||
scx_bpf_kick_cpu(cpu_of(rq), SCX_KICK_IDLE);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (unlikely(!p->scx.slice)) {
|
||||
if (!scx_rq_bypassing(rq) && !scx_warned_zero_slice) {
|
||||
@ -4979,7 +4997,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
|
||||
|
||||
if (!cpumask_equal(housekeeping_cpumask(HK_TYPE_DOMAIN),
|
||||
cpu_possible_mask)) {
|
||||
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation");
|
||||
pr_err("sched_ext: Not compatible with \"isolcpus=\" domain isolation\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
@ -751,8 +751,9 @@ enum scx_rq_flags {
|
||||
*/
|
||||
SCX_RQ_ONLINE = 1 << 0,
|
||||
SCX_RQ_CAN_STOP_TICK = 1 << 1,
|
||||
SCX_RQ_BAL_KEEP = 1 << 2, /* balance decided to keep current */
|
||||
SCX_RQ_BYPASSING = 1 << 3,
|
||||
SCX_RQ_BAL_PENDING = 1 << 2, /* balance hasn't run yet */
|
||||
SCX_RQ_BAL_KEEP = 1 << 3, /* balance decided to keep current */
|
||||
SCX_RQ_BYPASSING = 1 << 4,
|
||||
|
||||
SCX_RQ_IN_WAKEUP = 1 << 16,
|
||||
SCX_RQ_IN_BALANCE = 1 << 17,
|
||||
|
@ -35,6 +35,6 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}')
|
||||
print(f'switching_all : {read_int("scx_switching_all")}')
|
||||
print(f'switched_all : {read_static_key("__scx_switched_all")}')
|
||||
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
|
||||
print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}')
|
||||
print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
|
||||
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
|
||||
print(f'enable_seq : {read_atomic("scx_enable_seq")}')
|
||||
|
Loading…
Reference in New Issue
Block a user