Scheduler changes for v6.11:

 - Update Daniel Bristot de Oliveira's entry in MAINTAINERS, and credit him in CREDITS.
 - Harmonize the lock-yielding behavior on dynamically selected preemption models with static ones.
 - Reorganize the code a bit: split out sched/syscalls.c to reduce the size of sched/core.c.
 - Micro-optimize psi_group_change().
 - Fix set_load_weight() for SCHED_IDLE tasks.
 - Misc cleanups & fixes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
-----BEGIN PGP SIGNATURE-----

iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmaVtVARHG1pbmdvQGtl
cm5lbC5vcmcACgkQEnMQ0APhK1iqTQ/9GLNzNBnl0oBWCiybeQjyWsZ6BiZi48R0
C1g9/RKy++OyGOjn/yqYK0Kg8cdfoGzHGioMMAucHFW1nXZwVw17xAJK127N0apF
83up7AnFJw/JGr1bI0FwuozqHAs4Z5KzHTv2KBxhYuO77lyYna6/t0liRUbF8ZUZ
I/nqav7wDB8RBIB5hEJ/uYLDX7qWdUlyFB+mcvV4ANA99yr++OgipCp6Ob3Rz3cP
O676nKJY4vpNbZ/B6bpKg8ezULRP8re2qD3GJRf2huS63uu/Z5ct7ouLVZ1DwN53
mFDBTYUMI2ToV0pseikuqwnmrjxAKcEajTyZpD3vckafd2TlWIopkQZoQ9XLLlIZ
DxO+KoekaHTSVy8FWlO8O+iE3IAdUUgECEpNveX45Pb7nFP+5dtFqqnVIdNqCq5e
zEuQvizaa5m+A1POZhZKya+z9jbLXXx+gtPCbbADTBWtuyl8azUIh3vjn0bykmv4
IVV/wvUm+BPEIhnKusZZOgB0vLtxUdntBBfUSxqoSOad9L+0/UtSKoKI6wvW00q8
ZkW+85yS3YFiN9W61276RLis2j7OAjE0eDJ96wfhooma2JRDJU4Wmg5oWg8x3WuA
JRmK0s63Qik5gpwG5rHQsR5jNqYWTj5Lp7So+M1kRfFsOM/RXQ/AneSXZu/P7d65
LnYWzbKu76c=
=lLab
-----END PGP SIGNATURE-----

Merge tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Update Daniel Bristot de Oliveira's entry in MAINTAINERS, and credit
   him in CREDITS

 - Harmonize the lock-yielding behavior on dynamically selected
   preemption models with static ones

 - Reorganize the code a bit: split out sched/syscalls.c to reduce the
   size of sched/core.c

 - Micro-optimize psi_group_change()

 - Fix set_load_weight() for SCHED_IDLE tasks

 - Misc cleanups & fixes

* tag 'sched-core-2024-07-16' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Update MAINTAINERS and CREDITS
  sched/fair: set_load_weight() must also call reweight_task() for SCHED_IDLE tasks
  sched/psi: Optimise psi_group_change a bit
  sched/core: Drop spinlocks on contention iff kernel is preemptible
  sched/core: Move preempt_model_*() helpers from sched.h to preempt.h
  sched/balance: Skip unnecessary updates to idle load balancer's flags
  idle: Remove stale RCU comment
  sched/headers: Move struct pre-declarations to the beginning of the header
  sched/core: Clean up kernel/sched/sched.h a bit
  sched/core: Simplify prefetch_curr_exec_start()
  sched: Fix spelling in comments
  sched/syscalls: Split out kernel/sched/syscalls.c from kernel/sched/core.c
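The "harmonize the lock-yielding behavior" item is the user-visible core of this pull: a CONFIG_PREEMPT_DYNAMIC kernel is built with CONFIG_PREEMPTION=y, so lock-contention checks gated on that build-time symbol kept firing even when the machine was booted with preempt=none or preempt=voluntary. The sketch below condenses the include/linux/spinlock.h change from the diff further down; the _old/_new suffixes are added here purely for side-by-side illustration and are not in the tree.

/* Condensed from the spinlock.h hunks below -- illustrative only. */

/* Before: keyed off the build-time config, which is always =y for
 * PREEMPT_DYNAMIC kernels regardless of the boot-selected model. */
static inline int spin_needbreak_old(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPTION
        return spin_is_contended(lock);
#else
        return 0;
#endif
}

/* After: keyed off the preemption model actually in effect, so
 * preempt=none/voluntary boots behave like their static counterparts. */
static inline int spin_needbreak_new(spinlock_t *lock)
{
        if (!preempt_model_preemptible())
                return 0;

        return spin_is_contended(lock);
}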
commit 4a996d90b9

CREDITS | 3 +++
@@ -271,6 +271,9 @@ D: Driver for WaveFront soundcards (Turtle Beach Maui, Tropez, Tropez+)
 D: Various bugfixes and changes to sound drivers
 S: USA

+N: Daniel Bristot de Oliveira
+D: Scheduler contributions, notably: SCHED_DEADLINE
+
 N: Carlos Henrique Bauer
 E: chbauer@acm.org
 E: bauer@atlas.unisinos.br
@@ -4728,7 +4728,9 @@
 none - Limited to cond_resched() calls
 voluntary - Limited to cond_resched() and might_sleep() calls
 full - Any section that isn't explicitly preempt disabled
-       can be preempted anytime.
+       can be preempted anytime. Tasks will also yield
+       contended spinlocks (if the critical section isn't
+       explicitly preempt disabled beyond the lock itself).

 print-fatal-signals=
 [KNL] debug: print fatal signals
@@ -20047,7 +20047,6 @@ R: Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
 R: Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
 R: Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
 R: Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
-R: Daniel Bristot de Oliveira <bristot@redhat.com> (SCHED_DEADLINE)
 R: Valentin Schneider <vschneid@redhat.com> (TOPOLOGY)
 L: linux-kernel@vger.kernel.org
 S: Maintained
@@ -481,4 +481,45 @@ DEFINE_LOCK_GUARD_0(preempt, preempt_disable(), preempt_enable())
 DEFINE_LOCK_GUARD_0(preempt_notrace, preempt_disable_notrace(), preempt_enable_notrace())
 DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable())

+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+extern bool preempt_model_none(void);
+extern bool preempt_model_voluntary(void);
+extern bool preempt_model_full(void);
+
+#else
+
+static inline bool preempt_model_none(void)
+{
+        return IS_ENABLED(CONFIG_PREEMPT_NONE);
+}
+static inline bool preempt_model_voluntary(void)
+{
+        return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
+}
+static inline bool preempt_model_full(void)
+{
+        return IS_ENABLED(CONFIG_PREEMPT);
+}
+
+#endif
+
+static inline bool preempt_model_rt(void)
+{
+        return IS_ENABLED(CONFIG_PREEMPT_RT);
+}
+
+/*
+ * Does the preemption model allow non-cooperative preemption?
+ *
+ * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
+ * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
+ * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
+ * PREEMPT_NONE model.
+ */
+static inline bool preempt_model_preemptible(void)
+{
+        return preempt_model_full() || preempt_model_rt();
+}
+
 #endif /* __LINUX_PREEMPT_H */
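The block above is the preempt.h side of the move: under CONFIG_PREEMPT_DYNAMIC the preempt_model_*() helpers are real functions that report the boot-selected model, otherwise they fold to IS_ENABLED() constants. A hedged usage sketch follows; the function name and the batch sizes are hypothetical, only the helper itself comes from the header above.

/* Hypothetical caller -- not from this series. */
static unsigned int my_scan_batch(void)
{
        /*
         * On a preemptible model (full/RT), involuntary preemption bounds
         * latency, so larger batches are fine; otherwise keep each
         * non-preemptible chunk of work short.
         */
        if (preempt_model_preemptible())
                return 256;

        return 32;
}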
@@ -2064,47 +2064,6 @@ extern int __cond_resched_rwlock_write(rwlock_t *lock);
         __cond_resched_rwlock_write(lock); \
 })

-#ifdef CONFIG_PREEMPT_DYNAMIC
-
-extern bool preempt_model_none(void);
-extern bool preempt_model_voluntary(void);
-extern bool preempt_model_full(void);
-
-#else
-
-static inline bool preempt_model_none(void)
-{
-        return IS_ENABLED(CONFIG_PREEMPT_NONE);
-}
-static inline bool preempt_model_voluntary(void)
-{
-        return IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY);
-}
-static inline bool preempt_model_full(void)
-{
-        return IS_ENABLED(CONFIG_PREEMPT);
-}
-
-#endif
-
-static inline bool preempt_model_rt(void)
-{
-        return IS_ENABLED(CONFIG_PREEMPT_RT);
-}
-
-/*
- * Does the preemption model allow non-cooperative preemption?
- *
- * For !CONFIG_PREEMPT_DYNAMIC kernels this is an exact match with
- * CONFIG_PREEMPTION; for CONFIG_PREEMPT_DYNAMIC this doesn't work as the
- * kernel is *built* with CONFIG_PREEMPTION=y but may run with e.g. the
- * PREEMPT_NONE model.
- */
-static inline bool preempt_model_preemptible(void)
-{
-        return preempt_model_full() || preempt_model_rt();
-}
-
 static __always_inline bool need_resched(void)
 {
         return unlikely(tif_need_resched());
@@ -462,11 +462,10 @@ static __always_inline int spin_is_contended(spinlock_t *lock)
  */
 static inline int spin_needbreak(spinlock_t *lock)
 {
-#ifdef CONFIG_PREEMPTION
+        if (!preempt_model_preemptible())
+                return 0;
+
         return spin_is_contended(lock);
-#else
-        return 0;
-#endif
 }

 /*
@@ -479,11 +478,10 @@ static inline int spin_needbreak(spinlock_t *lock)
  */
 static inline int rwlock_needbreak(rwlock_t *lock)
 {
-#ifdef CONFIG_PREEMPTION
+        if (!preempt_model_preemptible())
+                return 0;
+
         return rwlock_is_contended(lock);
-#else
-        return 0;
-#endif
 }

 /*
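With the two hunks above, spin_needbreak()/rwlock_needbreak() only report contention when the running preemption model is preemptible. A hedged sketch of a typical lock-breaking consumer follows; every my_* name below is made up for illustration, only the locking primitives are real kernel API.

struct my_entry;
static void my_process(struct my_entry *e);

struct my_table {
        spinlock_t lock;
        unsigned int nr;
        struct my_entry *slots;
};

static void my_scan(struct my_table *tbl)
{
        unsigned int i;

        spin_lock(&tbl->lock);
        for (i = 0; i < tbl->nr; i++) {
                my_process(&tbl->slots[i]);
                /* Yield only if the model is preemptible and another
                 * CPU is actually spinning on this lock. */
                if (spin_needbreak(&tbl->lock)) {
                        spin_unlock(&tbl->lock);
                        cpu_relax();
                        spin_lock(&tbl->lock);
                }
        }
        spin_unlock(&tbl->lock);
}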
@@ -52,3 +52,4 @@
 #include "cputime.c"
 #include "deadline.c"

+#include "syscalls.c"
@@ -340,7 +340,7 @@ again:
         this_clock = sched_clock_local(my_scd);
         /*
          * We must enforce atomic readout on 32-bit, otherwise the
-         * update on the remote CPU can hit inbetween the readout of
+         * update on the remote CPU can hit in between the readout of
          * the low 32-bit and the high 32-bit portion.
          */
         remote_clock = cmpxchg64(&scd->clock, 0, 0);
@@ -444,7 +444,7 @@ notrace void sched_clock_tick_stable(void)
 }

 /*
- * We are going deep-idle (irqs are disabled):
+ * We are going deep-idle (IRQs are disabled):
  */
 notrace void sched_clock_idle_sleep_event(void)
 {
kernel/sched/core.c: 1874 lines changed (diff suppressed because it is too large)
@@ -279,7 +279,7 @@ void __sched_core_account_forceidle(struct rq *rq)
                 continue;

         /*
-         * Note: this will account forceidle to the current cpu, even
+         * Note: this will account forceidle to the current CPU, even
          * if it comes from our SMT sibling.
          */
         __account_forceidle_time(p, delta);
@@ -14,11 +14,11 @@
  * They are only modified in vtime_account, on corresponding CPU
  * with interrupts disabled. So, writes are safe.
  * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
+ * This may result in other CPU reading this CPU's IRQ time and can
  * race with irq/vtime_account on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of IRQ time to wrong
+ * task when IRQ is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each IRQ in account_system_time.
  */
 DEFINE_PER_CPU(struct irqtime, cpu_irqtime);

@@ -269,7 +269,7 @@ static __always_inline u64 steal_account_process_time(u64 maxtime)
 }

 /*
- * Account how much elapsed time was spent in steal, irq, or softirq time.
+ * Account how much elapsed time was spent in steal, IRQ, or softirq time.
  */
 static inline u64 account_other_time(u64 max)
 {
@@ -370,7 +370,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
  * Check for hardirq is done both for system and user time as there is
  * no timer going off while we are on hardirq and hence we may never get an
  * opportunity to update it solely in system time.
- * p->stime and friends are only updated on system time and not on irq
+ * p->stime and friends are only updated on system time and not on IRQ
  * softirq as those do not count in task exec_runtime any more.
  */
 static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
@@ -380,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

         /*
          * When returning from idle, many ticks can get accounted at
-         * once, including some ticks of steal, irq, and softirq time.
+         * once, including some ticks of steal, IRQ, and softirq time.
          * Subtract those ticks from the amount of time accounted to
          * idle, or potentially user or system time. Due to rounding,
          * other time can exceed ticks occasionally.
@@ -708,7 +708,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
         }

         /*
-         * And we finally need to fixup root_domain(s) bandwidth accounting,
+         * And we finally need to fix up root_domain(s) bandwidth accounting,
          * since p is still hanging out in the old (now moved to default) root
          * domain.
          */
@@ -992,7 +992,7 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se)
 * is detected, the runtime and deadline need to be updated.
 *
 * If the task has an implicit deadline, i.e., deadline == period, the Original
- * CBS is applied. the runtime is replenished and a new absolute deadline is
+ * CBS is applied. The runtime is replenished and a new absolute deadline is
 * set, as in the previous cases.
 *
 * However, the Original CBS does not work properly for tasks with
@@ -1294,7 +1294,7 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
 * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations multiplied
 * by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
 * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT, dl_bw
- * is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
 * Since delta is a 64 bit variable, to have an overflow its value should be
 * larger than 2^(64 - 20 - 8), which is more than 64 seconds. So, overflow is
 * not an issue here.
@@ -2493,7 +2493,7 @@ static void pull_dl_task(struct rq *this_rq)
         src_rq = cpu_rq(cpu);

         /*
-         * It looks racy, abd it is! However, as in sched_rt.c,
+         * It looks racy, and it is! However, as in sched_rt.c,
          * we are fine with this.
          */
         if (this_rq->dl.dl_nr_running &&
@@ -61,7 +61,7 @@
 * Options are:
 *
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
+ * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -3835,15 +3835,14 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
         }
 }

-void reweight_task(struct task_struct *p, int prio)
+void reweight_task(struct task_struct *p, const struct load_weight *lw)
 {
         struct sched_entity *se = &p->se;
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct load_weight *load = &se->load;
-        unsigned long weight = scale_load(sched_prio_to_weight[prio]);

-        reweight_entity(cfs_rq, se, weight);
-        load->inv_weight = sched_prio_to_wmult[prio];
+        reweight_entity(cfs_rq, se, lw->weight);
+        load->inv_weight = lw->inv_weight;
 }

 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
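The reweight_task() hunk above is the fair.c half of the SCHED_IDLE fix; its caller lives in kernel/sched/core.c, whose diff is suppressed on this page. Below is a sketch of the intended caller shape, assuming kernel-internal symbols such as task_has_idle_policy(), WEIGHT_IDLEPRIO and WMULT_IDLEPRIO; treat it as an approximation of the suppressed change, not the verbatim patch.

static void set_load_weight(struct task_struct *p, bool update_load)
{
        int prio = p->static_prio - MAX_RT_PRIO;
        struct load_weight lw;

        if (task_has_idle_policy(p)) {
                lw.weight = scale_load(WEIGHT_IDLEPRIO);
                lw.inv_weight = WMULT_IDLEPRIO;
        } else {
                lw.weight = scale_load(sched_prio_to_weight[prio]);
                lw.inv_weight = sched_prio_to_wmult[prio];
        }

        /*
         * SCHED_IDLE tasks now go through reweight_task() as well, instead
         * of only having p->se.load assigned directly, so a policy change
         * to or from SCHED_IDLE updates the queued load consistently.
         */
        if (update_load && p->sched_class == &fair_sched_class)
                reweight_task(p, &lw);
        else
                p->se.load = lw;
}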
@@ -8719,7 +8718,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p)
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
 * tree to only the first of the previous level and we decrease the frequency
- * of load-balance at each level inv. proportional to the number of CPUs in
+ * of load-balance at each level inversely proportional to the number of CPUs in
 * the groups.
 *
 * This yields:
@@ -11885,6 +11884,13 @@ static void kick_ilb(unsigned int flags)
         if (ilb_cpu < 0)
                 return;

+        /*
+         * Don't bother if no new NOHZ balance work items for ilb_cpu,
+         * i.e. all bits in flags are already set in ilb_cpu.
+         */
+        if ((atomic_read(nohz_flags(ilb_cpu)) & flags) == flags)
+                return;
+
         /*
          * Access to rq::nohz_csd is serialized by NOHZ_KICK_MASK; he who sets
          * the first flag owns it; cleared by nohz_csd_func().
@@ -172,19 +172,13 @@ static void cpuidle_idle_call(void)

         /*
          * Check if the idle task must be rescheduled. If it is the
-         * case, exit the function after re-enabling the local irq.
+         * case, exit the function after re-enabling the local IRQ.
          */
         if (need_resched()) {
                 local_irq_enable();
                 return;
         }

-        /*
-         * The RCU framework needs to be told that we are entering an idle
-         * section, so no more rcu read side critical sections and one more
-         * step to the grace period
-         */
-
         if (cpuidle_not_available(drv, dev)) {
                 tick_nohz_idle_stop_tick();

@@ -244,7 +238,7 @@ exit_idle:
         __current_set_polling();

         /*
-         * It is up to the idle functions to reenable local interrupts
+         * It is up to the idle functions to re-enable local interrupts
          */
         if (WARN_ON_ONCE(irqs_disabled()))
                 local_irq_enable();
@@ -320,7 +314,7 @@ static void do_idle(void)
         rcu_nocb_flush_deferred_wakeup();

         /*
-         * In poll mode we reenable interrupts and spin. Also if we
+         * In poll mode we re-enable interrupts and spin. Also if we
          * detected in the wakeup from idle path that the tick
          * broadcast device expired for us, we don't want to go deep
          * idle as we know that the IPI is going to arrive right away.
@@ -45,7 +45,7 @@
 * again, being late doesn't loose the delta, just wrecks the sample.
 *
 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
- *   this would add another cross-CPU cacheline miss and atomic operation
+ *   this would add another cross-CPU cache-line miss and atomic operation
 *   to the wakeup path. Instead we increment on whatever CPU the task ran
 *   when it went into uninterruptible state and decrement on whatever CPU
 *   did the wakeup. This means that only the sum of nr_uninterruptible over
@@ -62,7 +62,7 @@ EXPORT_SYMBOL(avenrun); /* should be removed */

 /**
 * get_avenrun - get the load average array
- * @loads: pointer to dest load array
+ * @loads: pointer to destination load array
 * @offset: offset to add
 * @shift: shift count to shift the result left
 *
@@ -417,7 +417,7 @@ int update_hw_load_avg(u64 now, struct rq *rq, u64 capacity)

 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
 /*
- * irq:
+ * IRQ:
 *
 *   util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
 *   util_sum = cpu_scale * load_sum
@@ -432,7 +432,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
         int ret = 0;

         /*
-         * We can't use clock_pelt because irq time is not accounted in
+         * We can't use clock_pelt because IRQ time is not accounted in
          * clock_task. Instead we directly scale the running time to
          * reflect the real amount of computation
          */
@@ -41,7 +41,7 @@
 * What it means for a task to be productive is defined differently
 * for each resource. For IO, productive means a running task. For
 * memory, productive means a running task that isn't a reclaimer. For
- * CPU, productive means an oncpu task.
+ * CPU, productive means an on-CPU task.
 *
 * Naturally, the FULL state doesn't exist for the CPU resource at the
 * system level, but exist at the cgroup level. At the cgroup level,
@@ -49,7 +49,7 @@
 * resource which is being used by others outside of the cgroup or
 * throttled by the cgroup cpu.max configuration.
 *
- * The percentage of wallclock time spent in those compound stall
+ * The percentage of wall clock time spent in those compound stall
 * states gives pressure numbers between 0 and 100 for each resource,
 * where the SOME percentage indicates workload slowdowns and the FULL
 * percentage indicates reduced CPU utilization:
@@ -218,28 +218,32 @@ void __init psi_init(void)
         group_init(&psi_system);
 }

-static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
+static u32 test_states(unsigned int *tasks, u32 state_mask)
 {
-        switch (state) {
-        case PSI_IO_SOME:
-                return unlikely(tasks[NR_IOWAIT]);
-        case PSI_IO_FULL:
-                return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
-        case PSI_MEM_SOME:
-                return unlikely(tasks[NR_MEMSTALL]);
-        case PSI_MEM_FULL:
-                return unlikely(tasks[NR_MEMSTALL] &&
-                        tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
-        case PSI_CPU_SOME:
-                return unlikely(tasks[NR_RUNNING] > oncpu);
-        case PSI_CPU_FULL:
-                return unlikely(tasks[NR_RUNNING] && !oncpu);
-        case PSI_NONIDLE:
-                return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
-                        tasks[NR_RUNNING];
-        default:
-                return false;
-        }
+        const bool oncpu = state_mask & PSI_ONCPU;
+
+        if (tasks[NR_IOWAIT]) {
+                state_mask |= BIT(PSI_IO_SOME);
+                if (!tasks[NR_RUNNING])
+                        state_mask |= BIT(PSI_IO_FULL);
+        }
+
+        if (tasks[NR_MEMSTALL]) {
+                state_mask |= BIT(PSI_MEM_SOME);
+                if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
+                        state_mask |= BIT(PSI_MEM_FULL);
+        }
+
+        if (tasks[NR_RUNNING] > oncpu)
+                state_mask |= BIT(PSI_CPU_SOME);
+
+        if (tasks[NR_RUNNING] && !oncpu)
+                state_mask |= BIT(PSI_CPU_FULL);
+
+        if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
+                state_mask |= BIT(PSI_NONIDLE);
+
+        return state_mask;
 }

 static void get_recent_times(struct psi_group *group, int cpu,
@@ -345,7 +349,7 @@ static void collect_percpu_times(struct psi_group *group,

         /*
          * Collect the per-cpu time buckets and average them into a
-         * single time sample that is normalized to wallclock time.
+         * single time sample that is normalized to wall clock time.
          *
          * For averaging, each CPU is weighted by its non-idle time in
          * the sampling period. This eliminates artifacts from uneven
@@ -770,7 +774,6 @@ static void psi_group_change(struct psi_group *group, int cpu,
 {
         struct psi_group_cpu *groupc;
         unsigned int t, m;
-        enum psi_states s;
         u32 state_mask;

         lockdep_assert_rq_held(cpu_rq(cpu));
@@ -842,10 +845,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 return;
         }

-        for (s = 0; s < NR_PSI_STATES; s++) {
-                if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
-                        state_mask |= (1 << s);
-        }
+        state_mask = test_states(groupc->tasks, state_mask);

         /*
          * Since we care about lost potential, a memstall is FULL
@@ -1205,7 +1205,7 @@ void psi_cgroup_restart(struct psi_group *group)
         /*
          * After we disable psi_group->enabled, we don't actually
          * stop percpu tasks accounting in each psi_group_cpu,
-         * instead only stop test_state() loop, record_times()
+         * instead only stop test_states() loop, record_times()
          * and averaging worker, see psi_group_change() for details.
          *
          * When disable cgroup PSI, this function has nothing to sync
@@ -1213,7 +1213,7 @@ void psi_cgroup_restart(struct psi_group *group)
          * would see !psi_group->enabled and only do task accounting.
          *
          * When re-enable cgroup PSI, this function use psi_group_change()
-         * to get correct state mask from test_state() loop on tasks[],
+         * to get correct state mask from test_states() loop on tasks[],
          * and restart groupc->state_start from now, use .clear = .set = 0
          * here since no task status really changed.
          */
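The psi_group_change() micro-optimization above replaces NR_PSI_STATES trips through a per-state switch with a single pass that ORs all state bits into one mask. Below is a stand-alone, user-space rendition of the same shape; everything here is a simplified mock of the kernel's enums and counters, kept only to show the single-pass idea.

#include <stdio.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING, NR_COUNTS };
enum { PSI_IO_SOME, PSI_IO_FULL, PSI_MEM_SOME, PSI_MEM_FULL,
       PSI_CPU_SOME, PSI_CPU_FULL, PSI_NONIDLE };
#define BIT(nr) (1u << (nr))

/* One pass over the per-CPU task counts builds the whole state mask. */
static unsigned int test_states(const unsigned int *tasks, unsigned int oncpu)
{
        unsigned int mask = 0;

        if (tasks[NR_IOWAIT]) {
                mask |= BIT(PSI_IO_SOME);
                if (!tasks[NR_RUNNING])
                        mask |= BIT(PSI_IO_FULL);
        }
        if (tasks[NR_MEMSTALL]) {
                mask |= BIT(PSI_MEM_SOME);
                if (tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING])
                        mask |= BIT(PSI_MEM_FULL);
        }
        if (tasks[NR_RUNNING] > oncpu)
                mask |= BIT(PSI_CPU_SOME);
        if (tasks[NR_RUNNING] && !oncpu)
                mask |= BIT(PSI_CPU_FULL);
        if (tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING])
                mask |= BIT(PSI_NONIDLE);

        return mask;
}

int main(void)
{
        /* One task blocked on IO, nothing runnable: IO is SOME and FULL. */
        unsigned int tasks[NR_COUNTS] = { 1, 0, 0, 0 };

        printf("state mask: %#x\n", test_states(tasks, 0));
        return 0;
}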
@@ -140,7 +140,7 @@ void init_rt_rq(struct rt_rq *rt_rq)
                 INIT_LIST_HEAD(array->queue + i);
                 __clear_bit(i, array->bitmap);
         }
-        /* delimiter for bitsearch: */
+        /* delimiter for bit-search: */
         __set_bit(MAX_RT_PRIO, array->bitmap);

 #if defined CONFIG_SMP
@@ -1135,7 +1135,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)

         /*
          * This may have been our highest task, and therefore
-          * we may have some recomputation to do
+          * we may have some re-computation to do
          */
         if (prio == prev_prio) {
                 struct rt_prio_array *array = &rt_rq->active;
@@ -1571,7 +1571,7 @@ select_task_rq_rt(struct task_struct *p, int cpu, int flags)
         *
         * For equal prio tasks, we just let the scheduler sort it out.
         *
-         * Otherwise, just let it ride on the affined RQ and the
+         * Otherwise, just let it ride on the affine RQ and the
         * post-schedule router will push the preempted task away
         *
         * This test is optimistic, if we get it wrong the load-balancer
@@ -2147,14 +2147,14 @@ static void push_rt_tasks(struct rq *rq)
 * if its the only CPU with multiple RT tasks queued, and a large number
 * of CPUs scheduling a lower priority task at the same time.
 *
- * Each root domain has its own irq work function that can iterate over
+ * Each root domain has its own IRQ work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * task must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
+ * their priority, there's a single IRQ work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
@@ -2162,7 +2162,7 @@ static void push_rt_tasks(struct rq *rq)
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
+ * rt_loop_next variable. This will make sure that the IRQ work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * the rt_loop_next will cause the iterator to perform another scan.
@@ -2242,7 +2242,7 @@ static void tell_cpu_to_push(struct rq *rq)
         * The rto_cpu is updated under the lock, if it has a valid CPU
         * then the IPI is still running and will continue due to the
         * update to loop_next, and nothing needs to be done here.
-         * Otherwise it is finishing up and an ipi needs to be sent.
+         * Otherwise it is finishing up and an IPI needs to be sent.
         */
         if (rq->rd->rto_cpu < 0)
                 cpu = rto_next_cpu(rq->rd);
@@ -2594,7 +2594,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
         watchdog(rq, p);

         /*
-         * RR tasks need a special form of timeslice management.
+         * RR tasks need a special form of time-slice management.
          * FIFO tasks have no timeslices.
          */
         if (p->policy != SCHED_RR)
@@ -2900,7 +2900,7 @@ static int sched_rt_global_constraints(void)

 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 {
-        /* Don't accept realtime tasks when there is no way for them to run */
+        /* Don't accept real-time tasks when there is no way for them to run */
         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
                 return 0;

@@ -3001,7 +3001,7 @@ static int sched_rr_handler(struct ctl_table *table, int write, void *buffer,
         ret = proc_dointvec(table, write, buffer, lenp, ppos);
         /*
          * Make sure that internally we keep jiffies.
-          * Also, writing zero resets the timeslice to default:
+          * Also, writing zero resets the time-slice to default:
          */
         if (!ret && write) {
                 sched_rr_timeslice =
(diff of one additional file suppressed because it is too large)
@@ -224,7 +224,7 @@ static inline void sched_info_dequeue(struct rq *rq, struct task_struct *t)
 /*
 * Called when a task finally hits the CPU. We can now calculate how
 * long it was waiting to run. We also note when it began so that we
- * can keep stats on how long its timeslice is.
+ * can keep stats on how long its time-slice is.
 */
 static void sched_info_arrive(struct rq *rq, struct task_struct *t)
 {
kernel/sched/syscalls.c: new file, 1699 lines (diff suppressed because it is too large)
@@ -501,7 +501,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
                 cpumask_clear_cpu(rq->cpu, old_rd->span);

                 /*
-                 * If we dont want to free the old_rd yet then
+                 * If we don't want to free the old_rd yet then
                  * set old_rd to NULL to skip the freeing later
                  * in this function:
                  */
@@ -1176,7 +1176,7 @@ fail:
 * uniquely identify each group (for a given domain):
 *
 *  - The first is the balance_cpu (see should_we_balance() and the
- *    load-balance blub in fair.c); for each group we only want 1 CPU to
+ *    load-balance blurb in fair.c); for each group we only want 1 CPU to
 *    continue balancing at a higher domain.
 *
 *  - The second is the sched_group_capacity; we want all identical groups
@@ -1388,7 +1388,7 @@ static inline void asym_cpu_capacity_update_data(int cpu)

         /*
          * Search if capacity already exits. If not, track which the entry
-          * where we should insert to keep the list ordered descendingly.
+          * where we should insert to keep the list ordered descending.
          */
         list_for_each_entry(entry, &asym_cap_list, link) {
                 if (capacity == entry->capacity)
@@ -1853,7 +1853,7 @@ void sched_init_numa(int offline_node)
         struct cpumask ***masks;

         /*
-         * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+         * O(nr_nodes^2) de-duplicating selection sort -- in order to find the
          * unique distances in the node_distance() table.
          */
         distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
@@ -2750,7 +2750,7 @@ match2:
         }

 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
-        /* Build perf. domains: */
+        /* Build perf domains: */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < n && !sched_energy_update; j++) {
                         if (cpumask_equal(doms_new[i], doms_cur[j]) &&
@@ -2759,7 +2759,7 @@ match2:
                                 goto match3;
                         }
                 }
-                /* No match - add perf. domains for a new rd */
+                /* No match - add perf domains for a new rd */
                 has_eas |= build_perf_domains(doms_new[i]);
 match3:
                 ;
@@ -33,7 +33,7 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
 EXPORT_SYMBOL(wake_bit_function);

 /*
- * To allow interruptible waiting and asynchronous (i.e. nonblocking)
+ * To allow interruptible waiting and asynchronous (i.e. non-blocking)
 * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are
 * permitted return codes. Nonzero return codes halt waiting and return.
 */
@@ -133,7 +133,7 @@ EXPORT_SYMBOL(__wake_up_bit);
 * @bit: the bit of the word being waited on
 *
 * There is a standard hashed waitqueue table for generic use. This
- * is the part of the hashtable's accessor API that wakes up waiters
+ * is the part of the hash-table's accessor API that wakes up waiters
 * on a bit. For instance, if one were to have waiters on a bitflag,
 * one would call wake_up_bit() after clearing the bit.
 *