Merge tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Thomas Gleixner:

 - Plug a race between pick_next_task_fair() and try_to_wake_up() where
   both try to write to the same task, even though both paths hold a
   runqueue lock, but obviously from different runqueues.

   The problem is that the store to task::on_rq in __block_task() is
   visible to try_to_wake_up(), which assumes that the task is not
   queued. Both sides then operate on the same task.

   Cure it by rearranging __block_task() so that the store to
   task::on_rq is the last operation on the task. (A simplified sketch
   of this ordering follows the shortlog below.)

 - Prevent a potential NULL pointer dereference in task_numa_work()

   task_numa_work() iterates the VMAs of a process. A concurrent unmap
   of the address space can result in a NULL pointer return from
   vma_next(), which is unchecked.

   Add the missing NULL pointer check to prevent this. (A toy
   illustration of the loop-shape hazard follows the task_numa_work()
   hunks in the diff below.)

 - Operate on the correct scheduler policy in task_should_scx()

   task_should_scx() returns true when a task should be handled by
   sched EXT. It checks the task's scheduling policy. This fails when
   the check is done before a policy has been set.

   Cure it by handing the policy into task_should_scx() so it operates
   on the requested value.

 - Add the missing handling of sched EXT in the delayed dequeue
   mechanism. This was simply forgotten.
* tag 'sched-urgent-2024-11-03' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/ext: Fix scx vs sched_delayed
  sched: Pass correct scheduling policy to __setscheduler_class
  sched/numa: Fix the potential null pointer dereference in task_numa_work()
  sched: Fix pick_next_task_fair() vs try_to_wake_up() race
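The ordering in the first fix can be pictured with a small, self-contained
userspace sketch using C11 atomics and pthreads. This is an illustration only,
not kernel code: the fake_task structure, its fields and the thread functions
are invented for the sketch. It shows the pattern the fix enforces -- the
blocking side makes the release store of on_rq = 0 its last access to the
task, and a waker that observes that store with acquire semantics may
immediately take the task over.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the task fields both sides touch; purely illustrative. */
struct fake_task {
        atomic_int on_rq;       /* 1: still queued, 0: blocked, up for grabs */
        int state;              /* some field the waker reads afterwards */
};

static struct fake_task task = { .on_rq = 1, .state = 0 };

/* The __schedule()/__block_task() side: finish all writes, then publish. */
static void *blocker(void *arg)
{
        (void)arg;
        task.state = 42;        /* last real write to the task */
        /* Publish; must not touch the task after this store. */
        atomic_store_explicit(&task.on_rq, 0, memory_order_release);
        return NULL;
}

/* The try_to_wake_up() side: once on_rq == 0 is observed, the task is ours. */
static void *waker(void *arg)
{
        (void)arg;
        while (atomic_load_explicit(&task.on_rq, memory_order_acquire))
                ;       /* spin until the blocker lets go */
        printf("waker sees state=%d\n", task.state);    /* always 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, blocker, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

Built with something like "cc -pthread sketch.c", the waker always prints 42:
the acquire load that sees on_rq == 0 orders after everything the blocker wrote
before the release store, which is why __block_task() must not touch the task
after that store.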
commit 33e83ffe4c
@@ -4711,7 +4711,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         if (rt_prio(p->prio)) {
                 p->sched_class = &rt_sched_class;
 #ifdef CONFIG_SCHED_CLASS_EXT
-        } else if (task_should_scx(p)) {
+        } else if (task_should_scx(p->policy)) {
                 p->sched_class = &ext_sched_class;
 #endif
         } else {
@@ -7025,7 +7025,7 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(int policy, int prio)
 {
         if (dl_prio(prio))
                 return &dl_sched_class;
@@ -7034,7 +7034,7 @@ const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
                 return &rt_sched_class;
 
 #ifdef CONFIG_SCHED_CLASS_EXT
-        if (task_should_scx(p))
+        if (task_should_scx(policy))
                 return &ext_sched_class;
 #endif
 
@@ -7142,7 +7142,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
                 queue_flag &= ~DEQUEUE_MOVE;
 
         prev_class = p->sched_class;
-        next_class = __setscheduler_class(p, prio);
+        next_class = __setscheduler_class(p->policy, prio);
 
         if (prev_class != next_class && p->se.sched_delayed)
                 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
@@ -4257,14 +4257,14 @@ static const struct kset_uevent_ops scx_uevent_ops = {
  * Used by sched_fork() and __setscheduler_prio() to pick the matching
  * sched_class. dl/rt are already handled.
  */
-bool task_should_scx(struct task_struct *p)
+bool task_should_scx(int policy)
 {
         if (!scx_enabled() ||
             unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
                 return false;
         if (READ_ONCE(scx_switching_all))
                 return true;
-        return p->policy == SCHED_EXT;
+        return policy == SCHED_EXT;
 }
 
 /**
@@ -4494,11 +4494,16 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
         scx_task_iter_start(&sti);
         while ((p = scx_task_iter_next_locked(&sti))) {
                 const struct sched_class *old_class = p->sched_class;
+                const struct sched_class *new_class =
+                        __setscheduler_class(p->policy, p->prio);
                 struct sched_enq_and_set_ctx ctx;
 
+                if (old_class != new_class && p->se.sched_delayed)
+                        dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-                p->sched_class = __setscheduler_class(p, p->prio);
+                p->sched_class = new_class;
                 check_class_changing(task_rq(p), p, old_class);
 
                 sched_enq_and_set_task(&ctx);
@@ -5204,12 +5209,17 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
         scx_task_iter_start(&sti);
         while ((p = scx_task_iter_next_locked(&sti))) {
                 const struct sched_class *old_class = p->sched_class;
+                const struct sched_class *new_class =
+                        __setscheduler_class(p->policy, p->prio);
                 struct sched_enq_and_set_ctx ctx;
 
+                if (old_class != new_class && p->se.sched_delayed)
+                        dequeue_task(task_rq(p), p, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
+
                 sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
                 p->scx.slice = SCX_SLICE_DFL;
-                p->sched_class = __setscheduler_class(p, p->prio);
+                p->sched_class = new_class;
                 check_class_changing(task_rq(p), p, old_class);
 
                 sched_enq_and_set_task(&ctx);
@@ -18,7 +18,7 @@ bool scx_can_stop_tick(struct rq *rq);
 void scx_rq_activate(struct rq *rq);
 void scx_rq_deactivate(struct rq *rq);
 int scx_check_setscheduler(struct task_struct *p, int policy);
-bool task_should_scx(struct task_struct *p);
+bool task_should_scx(int policy);
 void init_sched_ext_class(void);
 
 static inline u32 scx_cpuperf_target(s32 cpu)
@@ -3369,7 +3369,7 @@ retry_pids:
                 vma = vma_next(&vmi);
         }
 
-        do {
+        for (; vma; vma = vma_next(&vmi)) {
                 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
                         is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
                         trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE);
@@ -3491,7 +3491,7 @@ retry_pids:
                  */
                 if (vma_pids_forced)
                         break;
-        } for_each_vma(vmi, vma);
+        }
 
         /*
          * If no VMAs are remaining and VMAs were skipped due to the PID
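The two hunks above are the task_numa_work() change from the second item in
the merge description. A toy illustration, not kernel code (the node and
next_node names are invented for this sketch), of why the loop shape matters:
the old "do { } for_each_vma()" form ran the body once before any check, so
when the iterator had already returned NULL -- as vma_next() can after a
concurrent unmap -- the body dereferenced it. The guarded "for (; vma; ...)"
form tests before the first iteration.

#include <stdio.h>

struct node {
        int val;
        struct node *next;
};

static struct node *next_node(struct node *n)
{
        return n ? n->next : NULL;      /* like an iterator that can yield NULL */
}

/* Mirrors the old do { ... } while-style shape: body first, check later. */
static void walk_unsafe(struct node *n)
{
        do {
                printf("%d\n", n->val); /* crashes if n is already NULL */
        } while ((n = next_node(n)));
}

/* Mirrors the fixed for (; vma; vma = vma_next(&vmi)) shape: check first. */
static void walk_safe(struct node *n)
{
        for (; n; n = next_node(n))
                printf("%d\n", n->val);
}

int main(void)
{
        struct node b = { 2, NULL };
        struct node a = { 1, &b };

        walk_safe(&a);          /* prints 1 and 2 */
        walk_safe(NULL);        /* prints nothing -- the empty case is handled */
        walk_unsafe(&a);        /* prints 1 and 2 */
        /* walk_unsafe(NULL) would dereference NULL, like the old loop shape. */
        return 0;
}

walk_safe(NULL) simply falls through, which mirrors how the fixed loop behaves
when no VMAs remain.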
@@ -5625,8 +5625,9 @@ pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq)
         struct sched_entity *se = pick_eevdf(cfs_rq);
         if (se->sched_delayed) {
                 dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED);
-                SCHED_WARN_ON(se->sched_delayed);
-                SCHED_WARN_ON(se->on_rq);
+                /*
+                 * Must not reference @se again, see __block_task().
+                 */
                 return NULL;
         }
         return se;
@@ -7176,7 +7177,11 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
         /* Fix-up what dequeue_task_fair() skipped */
         hrtick_update(rq);
 
-        /* Fix-up what block_task() skipped. */
+        /*
+         * Fix-up what block_task() skipped.
+         *
+         * Must be last, @p might not be valid after this.
+         */
         __block_task(rq, p);
 }
 
@@ -7193,12 +7198,14 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE))))
                 util_est_dequeue(&rq->cfs, p);
 
-        if (dequeue_entities(rq, &p->se, flags) < 0) {
-                util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
-                return false;
-        }
-
+        util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
+        if (dequeue_entities(rq, &p->se, flags) < 0)
+                return false;
+
+        /*
+         * Must not reference @p after dequeue_entities(DEQUEUE_DELAYED).
+         */
 
         hrtick_update(rq);
         return true;
 }
@@ -2769,8 +2769,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
 
 static inline void __block_task(struct rq *rq, struct task_struct *p)
 {
-        WRITE_ONCE(p->on_rq, 0);
-        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
         if (p->sched_contributes_to_load)
                 rq->nr_uninterruptible++;
 
@@ -2778,6 +2776,38 @@ static inline void __block_task(struct rq *rq, struct task_struct *p)
                 atomic_inc(&rq->nr_iowait);
                 delayacct_blkio_start();
         }
+
+        ASSERT_EXCLUSIVE_WRITER(p->on_rq);
+
+        /*
+         * The moment this write goes through, ttwu() can swoop in and migrate
+         * this task, rendering our rq->__lock ineffective.
+         *
+         * __schedule()                        try_to_wake_up()
+         *   LOCK rq->__lock                     LOCK p->pi_lock
+         *   pick_next_task()
+         *     pick_next_task_fair()
+         *       pick_next_entity()
+         *         dequeue_entities()
+         *           __block_task()
+         *             RELEASE p->on_rq = 0      if (p->on_rq && ...)
+         *                                         break;
+         *
+         *                                       ACQUIRE (after ctrl-dep)
+         *
+         *                                       cpu = select_task_rq();
+         *                                       set_task_cpu(p, cpu);
+         *                                       ttwu_queue()
+         *                                         ttwu_do_activate()
+         *                                           LOCK rq->__lock
+         *                                           activate_task()
+         *                                             STORE p->on_rq = 1
+         *                                       UNLOCK rq->__lock
+         *
+         * Callers must ensure to not reference @p after this -- we no longer
+         * own it.
+         */
+        smp_store_release(&p->on_rq, 0);
 }
 
 extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -3800,7 +3830,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(int policy, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
@@ -707,7 +707,7 @@ change:
         }
 
         prev_class = p->sched_class;
-        next_class = __setscheduler_class(p, newprio);
+        next_class = __setscheduler_class(policy, newprio);
 
         if (prev_class != next_class && p->se.sched_delayed)
                 dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);