diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index bbaeace9d1e3..0f2df67f710b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2428,7 +2428,6 @@ again:
 		else
 			p = dl_se->server_pick_next(dl_se);
 		if (!p) {
-			WARN_ON_ONCE(1);
 			dl_se->dl_yielded = 1;
 			update_curr_dl_se(rq, dl_se, 0);
 			goto again;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 25b14dffeb37..da5065a226ee 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5379,19 +5379,38 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
 
-static void
+static bool
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
-	int action = UPDATE_TG;
+	update_curr(cfs_rq);
 
+	if (flags & DEQUEUE_DELAYED) {
+		SCHED_WARN_ON(!se->sched_delayed);
+	} else {
+		bool sleep = flags & DEQUEUE_SLEEP;
+
+		/*
+		 * DELAY_DEQUEUE relies on spurious wakeups, special task
+		 * states must not suffer spurious wakeups, exempt them.
+		 */
+		if (flags & DEQUEUE_SPECIAL)
+			sleep = false;
+
+		SCHED_WARN_ON(sleep && se->sched_delayed);
+
+		if (sched_feat(DELAY_DEQUEUE) && sleep &&
+		    !entity_eligible(cfs_rq, se)) {
+			if (cfs_rq->next == se)
+				cfs_rq->next = NULL;
+			se->sched_delayed = 1;
+			return false;
+		}
+	}
+
+	int action = UPDATE_TG;
 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)))
 		action |= DO_DETACH;
 
-	/*
-	 * Update run-time statistics of the 'current'.
-	 */
-	update_curr(cfs_rq);
-
 	/*
 	 * When dequeuing a sched_entity, we must:
 	 *   - Update loads to have both entity and cfs_rq synced with now.
@@ -5428,8 +5447,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
 		update_min_vruntime(cfs_rq);
 
+	if (flags & DEQUEUE_DELAYED)
+		se->sched_delayed = 0;
+
 	if (cfs_rq->nr_running == 0)
 		update_idle_cfs_rq_clock_pelt(cfs_rq);
+
+	return true;
 }
 
 static void
@@ -5828,11 +5852,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	idle_task_delta = cfs_rq->idle_h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		int flags;
+
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			goto done;
 
-		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		/*
+		 * Abuse SPECIAL to avoid delayed dequeue in this instance.
+		 * This avoids teaching dequeue_entities() about throttled
+		 * entities and keeps things relatively simple.
+		 */
+		flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL;
+		if (se->sched_delayed)
+			flags |= DEQUEUE_DELAYED;
+		dequeue_entity(qcfs_rq, se, flags);
 
 		if (cfs_rq_is_idle(group_cfs_rq(se)))
 			idle_task_delta = cfs_rq->h_nr_running;
@@ -6918,6 +6952,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	bool was_sched_idle = sched_idle_rq(rq);
 	int rq_h_nr_running = rq->cfs.h_nr_running;
 	bool task_sleep = flags & DEQUEUE_SLEEP;
+	bool task_delayed = flags & DEQUEUE_DELAYED;
 	struct task_struct *p = NULL;
 	int idle_h_nr_running = 0;
 	int h_nr_running = 0;
@@ -6931,7 +6966,13 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
-		dequeue_entity(cfs_rq, se, flags);
+
+		if (!dequeue_entity(cfs_rq, se, flags)) {
+			if (p && &p->se == se)
+				return -1;
+
+			break;
+		}
 
 		cfs_rq->h_nr_running -= h_nr_running;
 		cfs_rq->idle_h_nr_running -= idle_h_nr_running;
@@ -6956,6 +6997,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 			break;
 		}
 		flags |= DEQUEUE_SLEEP;
+		flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL);
 	}
 
 	for_each_sched_entity(se) {
@@ -6985,6 +7027,17 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 	if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
 		rq->next_balance = jiffies;
 
+	if (p && task_delayed) {
+		SCHED_WARN_ON(!task_sleep);
+		SCHED_WARN_ON(p->on_rq != 1);
+
+		/* Fix-up what dequeue_task_fair() skipped */
+		hrtick_update(rq);
+
+		/* Fix-up what block_task() skipped. */
+		__block_task(rq, p);
+	}
+
 	return 1;
 }
 
@@ -6997,8 +7050,10 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	util_est_dequeue(&rq->cfs, p);
 
-	if (dequeue_entities(rq, &p->se, flags) < 0)
+	if (dequeue_entities(rq, &p->se, flags) < 0) {
+		util_est_update(&rq->cfs, p, DEQUEUE_SLEEP);
 		return false;
+	}
 
 	util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP);
 	hrtick_update(rq);
@@ -12971,6 +13026,11 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
 		/* ensure bandwidth has been allocated on our new cfs_rq */
 		account_cfs_rq_runtime(cfs_rq, 0);
 	}
+
+	if (!first)
+		return;
+
+	SCHED_WARN_ON(se->sched_delayed);
 }
 
 void init_cfs_rq(struct cfs_rq *cfs_rq)
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 97fb2d492089..1feaa7bbc278 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -28,6 +28,15 @@ SCHED_FEAT(NEXT_BUDDY, false)
  */
 SCHED_FEAT(CACHE_HOT_BUDDY, true)
 
+/*
+ * Delay dequeueing tasks until they get selected or woken.
+ *
+ * By delaying the dequeue for non-eligible tasks, they remain in the
+ * competition and can burn off their negative lag. When they get selected
+ * they'll have positive lag by definition.
+ */
+SCHED_FEAT(DELAY_DEQUEUE, true)
+
 /*
  * Allow wakeup-time preemption of the current task:
  */
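
Note (not part of the patch): the sketch below is a stand-alone, user-space illustration of the decision the new dequeue_entity() makes, for readers following the DELAY_DEQUEUE comment above. The toy_* names, the struct layouts, and the "vruntime vs. avg_vruntime" eligibility model are invented for illustration; only the flag names and the shape of the check mirror the hunks above, and the real code additionally clears cfs_rq->next, updates load/PELT and fixes up block_task() later.

#include <stdbool.h>
#include <stdio.h>

#define DEQUEUE_SLEEP	0x01
#define DEQUEUE_SPECIAL	0x02
#define DEQUEUE_DELAYED	0x04

struct toy_entity {
	long long vruntime;	/* stand-in for se->vruntime */
	bool sched_delayed;
};

struct toy_cfs_rq {
	long long avg_vruntime;	/* stand-in for the queue's zero-lag point */
	int nr_queued;
};

/* Mirrors the idea of entity_eligible(): eligible means lag is not negative. */
static bool toy_entity_eligible(const struct toy_cfs_rq *cfs_rq,
				const struct toy_entity *se)
{
	return se->vruntime <= cfs_rq->avg_vruntime;
}

/* Returns true if the entity really left the queue, false if the dequeue was delayed. */
static bool toy_dequeue_entity(struct toy_cfs_rq *cfs_rq,
			       struct toy_entity *se, int flags)
{
	if (!(flags & DEQUEUE_DELAYED)) {
		bool sleep = flags & DEQUEUE_SLEEP;

		/* Special task states must not see spurious wakeups. */
		if (flags & DEQUEUE_SPECIAL)
			sleep = false;

		if (sleep && !toy_entity_eligible(cfs_rq, se)) {
			se->sched_delayed = true;	/* stay queued, keep competing */
			return false;
		}
	}

	se->sched_delayed = false;
	cfs_rq->nr_queued--;			/* the actual dequeue */
	return true;
}

int main(void)
{
	struct toy_cfs_rq cfs_rq = { .avg_vruntime = 100, .nr_queued = 2 };
	struct toy_entity se = { .vruntime = 130, .sched_delayed = false };

	/* Sleep with negative lag: the dequeue is delayed. */
	bool gone = toy_dequeue_entity(&cfs_rq, &se, DEQUEUE_SLEEP);
	printf("dequeued=%d delayed=%d queued=%d\n",
	       gone, se.sched_delayed, cfs_rq.nr_queued);

	/* Later, once picked or woken, the delayed dequeue completes. */
	gone = toy_dequeue_entity(&cfs_rq, &se,
				  DEQUEUE_SLEEP | DEQUEUE_DELAYED);
	printf("dequeued=%d delayed=%d queued=%d\n",
	       gone, se.sched_delayed, cfs_rq.nr_queued);

	return 0;
}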