0edd1b1784
This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. For large systems the state machine is driven out of RCU's force-quiescent-state logic, which provides good scalability at the price of millisecond-scale latencies on the transition to full-system-idle state. This is not so good for battery-powered systems, which are usually small enough that they don't need to care about scalability, but which do care deeply about energy efficiency. Small systems therefore drive the state machine directly out of the idle-entry code. The number of CPUs in a "small" system is defined by a new NO_HZ_FULL_SYSIDLE_SMALL Kconfig parameter, which defaults to 8. Note that this is a build-time definition. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ] Reviewed-by: Josh Triplett <josh@joshtriplett.org> [ paulmck: Simplify logic and provide better comments for memory barriers, based on review comments and questions by Lai Jiangshan. ]
206 lines
6.3 KiB
Plaintext
206 lines
6.3 KiB
Plaintext
#
|
|
# Timer subsystem related configuration options
|
|
#
|
|
|
|
# Options selectable by arch Kconfig
|
|
|
|
# Watchdog function for clocksources to detect instabilities
|
|
config CLOCKSOURCE_WATCHDOG
|
|
bool
|
|
|
|
# Architecture has extra clocksource data
|
|
config ARCH_CLOCKSOURCE_DATA
|
|
bool
|
|
|
|
# Timekeeping vsyscall support
|
|
config GENERIC_TIME_VSYSCALL
|
|
bool
|
|
|
|
# Old-style timekeeping vsyscall support
|
|
config GENERIC_TIME_VSYSCALL_OLD
|
|
bool
|
|
|
|
# ktime_t scalar 64bit nsec representation
|
|
config KTIME_SCALAR
|
|
bool
|
|
|
|
# Old style timekeeping
|
|
config ARCH_USES_GETTIMEOFFSET
|
|
bool
|
|
|
|
# The generic clock events infrastructure
|
|
config GENERIC_CLOCKEVENTS
|
|
bool
|
|
|
|
# Migration helper. Builds, but does not invoke
|
|
config GENERIC_CLOCKEVENTS_BUILD
|
|
bool
|
|
default y
|
|
depends on GENERIC_CLOCKEVENTS
|
|
|
|
# Architecture can handle broadcast in a driver-agnostic way
|
|
config ARCH_HAS_TICK_BROADCAST
|
|
bool
|
|
|
|
# Clockevents broadcasting infrastructure
|
|
config GENERIC_CLOCKEVENTS_BROADCAST
|
|
bool
|
|
depends on GENERIC_CLOCKEVENTS
|
|
|
|
# Automatically adjust the min. reprogramming time for
|
|
# clock event device
|
|
config GENERIC_CLOCKEVENTS_MIN_ADJUST
|
|
bool
|
|
|
|
# Generic update of CMOS clock
|
|
config GENERIC_CMOS_UPDATE
|
|
bool
|
|
|
|
if GENERIC_CLOCKEVENTS
|
|
menu "Timers subsystem"
|
|
|
|
# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is
|
|
# only related to the tick functionality. Oneshot clockevent devices
|
|
# are supported independently of this.
|
|
config TICK_ONESHOT
|
|
bool
|
|
|
|
config NO_HZ_COMMON
|
|
bool
|
|
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
|
|
select TICK_ONESHOT
|
|
|
|
choice
|
|
prompt "Timer tick handling"
|
|
default NO_HZ_IDLE if NO_HZ
|
|
|
|
config HZ_PERIODIC
|
|
bool "Periodic timer ticks (constant rate, no dynticks)"
|
|
help
|
|
This option keeps the tick running periodically at a constant
|
|
rate, even when the CPU doesn't need it.
|
|
|
|
config NO_HZ_IDLE
|
|
bool "Idle dynticks system (tickless idle)"
|
|
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
|
|
select NO_HZ_COMMON
|
|
help
|
|
This option enables a tickless idle system: timer interrupts
|
|
will only trigger on an as-needed basis when the system is idle.
|
|
This is usually interesting for energy saving.
|
|
|
|
Most of the time you want to say Y here.
|
|
|
|
config NO_HZ_FULL
|
|
bool "Full dynticks system (tickless)"
|
|
# NO_HZ_COMMON dependency
|
|
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
|
|
# We need at least one periodic CPU for timekeeping
|
|
depends on SMP
|
|
# RCU_USER_QS dependency
|
|
depends on HAVE_CONTEXT_TRACKING
|
|
# VIRT_CPU_ACCOUNTING_GEN dependency
|
|
depends on 64BIT
|
|
select NO_HZ_COMMON
|
|
select RCU_USER_QS
|
|
select RCU_NOCB_CPU
|
|
select VIRT_CPU_ACCOUNTING_GEN
|
|
select CONTEXT_TRACKING_FORCE
|
|
select IRQ_WORK
|
|
help
|
|
Adaptively try to shut down the tick whenever possible, even when
|
|
the CPU is running tasks. Typically this requires running a single
|
|
task on the CPU. Chances for running tickless are maximized when
|
|
the task mostly runs in userspace and has little kernel activity.
|
|
|
|
You need to fill up the nohz_full boot parameter with the
|
|
desired range of dynticks CPUs.
|
|
|
|
This is implemented at the expense of some overhead in user <-> kernel
|
|
transitions: syscalls, exceptions and interrupts, even when it's
|
|
dynamically off.
|
|
|
|
Say N.
|
|
|
|
endchoice
|
|
|
|
config NO_HZ_FULL_ALL
|
|
bool "Full dynticks system on all CPUs by default"
|
|
depends on NO_HZ_FULL
|
|
help
|
|
If the user doesn't pass the nohz_full boot option to
|
|
define the range of full dynticks CPUs, consider that all
|
|
CPUs in the system are full dynticks by default.
|
|
Note that the boot CPU will still be kept outside the range to
|
|
handle the timekeeping duty.
|
|
|
|
config NO_HZ_FULL_SYSIDLE
|
|
bool "Detect full-system idle state for full dynticks system"
|
|
depends on NO_HZ_FULL
|
|
default n
|
|
help
|
|
At least one CPU must keep the scheduling-clock tick running for
|
|
timekeeping purposes whenever there is a non-idle CPU, where
|
|
"non-idle" also includes dynticks CPUs as long as they are
|
|
running non-idle tasks. Because the underlying adaptive-tick
|
|
support cannot distinguish between all CPUs being idle and
|
|
all CPUs each running a single task in dynticks mode, the
|
|
underlying support simply ensures that there is always a CPU
|
|
handling the scheduling-clock tick, whether or not all CPUs
|
|
are idle. This Kconfig option enables scalable detection of
|
|
the all-CPUs-idle state, thus allowing the scheduling-clock
|
|
tick to be disabled when all CPUs are idle. Note that scalable
|
|
detection of the all-CPUs-idle state means that larger systems
|
|
will be slower to declare the all-CPUs-idle state.
|
|
|
|
Say Y if you would like to help debug all-CPUs-idle detection.
|
|
|
|
Say N if you are unsure.
|
|
|
|
config NO_HZ_FULL_SYSIDLE_SMALL
|
|
int "Number of CPUs above which large-system approach is used"
|
|
depends on NO_HZ_FULL_SYSIDLE
|
|
range 1 NR_CPUS
|
|
default 8
|
|
help
|
|
The full-system idle detection mechanism takes a lazy approach
|
|
on large systems, as is required to attain decent scalability.
|
|
However, on smaller systems, scalability is not anywhere near as
|
|
large a concern as is energy efficiency. The sysidle subsystem
|
|
therefore uses a fast but non-scalable algorithm for small
|
|
systems and a lazier but scalable algorithm for large systems.
|
|
This Kconfig parameter defines the number of CPUs in the largest
|
|
system that will be considered to be "small".
|
|
|
|
The default value will be fine in most cases. Battery-powered
|
|
systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger
|
|
numbers of CPUs, and (3) are suffering from battery-lifetime
|
|
problems due to long sysidle latencies might wish to experiment
|
|
with larger values for this Kconfig parameter. On the other
|
|
hand, they might be even better served by disabling NO_HZ_FULL
|
|
entirely, given that NO_HZ_FULL is intended for HPC and
|
|
real-time workloads that at present do not tend to be run on
|
|
battery-powered systems.
|
|
|
|
Take the default if you are unsure.
|
|
|
|
config NO_HZ
|
|
bool "Old Idle dynticks config"
|
|
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
|
|
help
|
|
This is the old config entry that enables dynticks idle.
|
|
We keep it around for a little while to enforce backward
|
|
compatibility with older config files.
|
|
|
|
config HIGH_RES_TIMERS
|
|
bool "High Resolution Timer Support"
|
|
depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
|
|
select TICK_ONESHOT
|
|
help
|
|
This option enables high resolution timer support. If your
|
|
hardware is not capable then this option only increases
|
|
the size of the kernel image.
|
|
|
|
endmenu
|
|
endif
|