1
linux/kernel/power/main.c
Rafael J. Wysocki 32bdfac546 PM: Do not hold dpm_list_mtx while disabling/enabling nonboot CPUs
We shouldn't hold dpm_list_mtx while executing
[disable|enable]_nonboot_cpus(), because theoretically this may lead
to a deadlock as shown by the following example (provided by Johannes
Berg):

CPU 3       CPU 2                     CPU 1
                                      suspend/hibernate
            something:
            rtnl_lock()               device_pm_lock()
                                       -> mutex_lock(&dpm_list_mtx)

            mutex_lock(&dpm_list_mtx)

linkwatch_work
 -> rtnl_lock()
                                      disable_nonboot_cpus()
                                       -> flush CPU 3 workqueue

Fortunately, device drivers are supposed to stop any activities that
might lead to the registration of new device objects way before
disable_nonboot_cpus() is called, so it shouldn't be necessary to
hold dpm_list_mtx over the entire late part of device suspend and
early part of device resume.

Thus, during the late suspend and the early resume of devices acquire
dpm_list_mtx only when dpm_list is going to be traversed and release
it right after that.

This patch is reported to fix the regressions tracked as
http://bugzilla.kernel.org/show_bug.cgi?id=13245.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Reported-by: Miles Lane <miles.lane@gmail.com>
Tested-by: Ming Lei <tom.leiming@gmail.com>
2009-05-24 21:15:07 +02:00

750 lines
16 KiB
C

/*
* kernel/power/main.c - PM subsystem core functionality.
*
* Copyright (c) 2003 Patrick Mochel
* Copyright (c) 2003 Open Source Development Lab
*
* This file is released under the GPLv2
*
*/
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/cpu.h>
#include <linux/resume-trace.h>
#include <linux/freezer.h>
#include <linux/vmstat.h>
#include <linux/syscalls.h>
#include "power.h"
DEFINE_MUTEX(pm_mutex);
unsigned int pm_flags;
EXPORT_SYMBOL(pm_flags);
#ifdef CONFIG_PM_SLEEP
/* Routines for PM-transition notifications */
static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
int register_pm_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&pm_chain_head, nb);
}
EXPORT_SYMBOL_GPL(register_pm_notifier);
int unregister_pm_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&pm_chain_head, nb);
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
int pm_notifier_call_chain(unsigned long val)
{
return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
== NOTIFY_BAD) ? -EINVAL : 0;
}
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
static const char * const pm_tests[__TEST_AFTER_LAST] = {
[TEST_NONE] = "none",
[TEST_CORE] = "core",
[TEST_CPUS] = "processors",
[TEST_PLATFORM] = "platform",
[TEST_DEVICES] = "devices",
[TEST_FREEZER] = "freezer",
};
static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
int level;
for (level = TEST_FIRST; level <= TEST_MAX; level++)
if (pm_tests[level]) {
if (level == pm_test_level)
s += sprintf(s, "[%s] ", pm_tests[level]);
else
s += sprintf(s, "%s ", pm_tests[level]);
}
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
return (s - buf);
}
static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
const char * const *s;
int level;
char *p;
int len;
int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
mutex_lock(&pm_mutex);
level = TEST_FIRST;
for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
pm_test_level = level;
error = 0;
break;
}
mutex_unlock(&pm_mutex);
return error ? error : n;
}
power_attr(pm_test);
#endif /* CONFIG_PM_DEBUG */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_SUSPEND
static int suspend_test(int level)
{
#ifdef CONFIG_PM_DEBUG
if (pm_test_level == level) {
printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
mdelay(5000);
return 1;
}
#endif /* !CONFIG_PM_DEBUG */
return 0;
}
#ifdef CONFIG_PM_TEST_SUSPEND
/*
* We test the system suspend code by setting an RTC wakealarm a short
* time in the future, then suspending. Suspending the devices won't
* normally take long ... some systems only need a few milliseconds.
*
* The time it takes is system-specific though, so when we test this
* during system bootup we allow a LOT of time.
*/
#define TEST_SUSPEND_SECONDS 5
static unsigned long suspend_test_start_time;
static void suspend_test_start(void)
{
/* FIXME Use better timebase than "jiffies", ideally a clocksource.
* What we want is a hardware counter that will work correctly even
* during the irqs-are-off stages of the suspend/resume cycle...
*/
suspend_test_start_time = jiffies;
}
static void suspend_test_finish(const char *label)
{
long nj = jiffies - suspend_test_start_time;
unsigned msec;
msec = jiffies_to_msecs(abs(nj));
pr_info("PM: %s took %d.%03d seconds\n", label,
msec / 1000, msec % 1000);
/* Warning on suspend means the RTC alarm period needs to be
* larger -- the system was sooo slooowwww to suspend that the
* alarm (should have) fired before the system went to sleep!
*
* Warning on either suspend or resume also means the system
* has some performance issues. The stack dump of a WARN_ON
* is more likely to get the right attention than a printk...
*/
WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
}
#else
static void suspend_test_start(void)
{
}
static void suspend_test_finish(const char *label)
{
}
#endif
/* This is just an arbitrary number */
#define FREE_PAGE_NUMBER (100)
static struct platform_suspend_ops *suspend_ops;
/**
* suspend_set_ops - Set the global suspend method table.
* @ops: Pointer to ops structure.
*/
void suspend_set_ops(struct platform_suspend_ops *ops)
{
mutex_lock(&pm_mutex);
suspend_ops = ops;
mutex_unlock(&pm_mutex);
}
/**
* suspend_valid_only_mem - generic memory-only valid callback
*
* Platform drivers that implement mem suspend only and only need
* to check for that in their .valid callback can use this instead
* of rolling their own .valid callback.
*/
int suspend_valid_only_mem(suspend_state_t state)
{
return state == PM_SUSPEND_MEM;
}
/**
* suspend_prepare - Do prep work before entering low-power state.
*
* This is common code that is called for each state that we're entering.
* Run suspend notifiers, allocate a console and stop all processes.
*/
static int suspend_prepare(void)
{
int error;
unsigned int free_pages;
if (!suspend_ops || !suspend_ops->enter)
return -EPERM;
pm_prepare_console();
error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
if (error)
goto Finish;
error = usermodehelper_disable();
if (error)
goto Finish;
if (suspend_freeze_processes()) {
error = -EAGAIN;
goto Thaw;
}
free_pages = global_page_state(NR_FREE_PAGES);
if (free_pages < FREE_PAGE_NUMBER) {
pr_debug("PM: free some memory\n");
shrink_all_memory(FREE_PAGE_NUMBER - free_pages);
if (nr_free_pages() < FREE_PAGE_NUMBER) {
error = -ENOMEM;
printk(KERN_ERR "PM: No enough memory\n");
}
}
if (!error)
return 0;
Thaw:
suspend_thaw_processes();
usermodehelper_enable();
Finish:
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
return error;
}
/* default implementation */
void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
{
local_irq_disable();
}
/* default implementation */
void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
{
local_irq_enable();
}
/**
* suspend_enter - enter the desired system sleep state.
* @state: state to enter
*
* This function should be called after devices have been suspended.
*/
static int suspend_enter(suspend_state_t state)
{
int error;
if (suspend_ops->prepare) {
error = suspend_ops->prepare();
if (error)
return error;
}
error = device_power_down(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down\n");
goto Platfrom_finish;
}
if (suspend_ops->prepare_late) {
error = suspend_ops->prepare_late();
if (error)
goto Power_up_devices;
}
if (suspend_test(TEST_PLATFORM))
goto Platform_wake;
error = disable_nonboot_cpus();
if (error || suspend_test(TEST_CPUS))
goto Enable_cpus;
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
error = sysdev_suspend(PMSG_SUSPEND);
if (!error) {
if (!suspend_test(TEST_CORE))
error = suspend_ops->enter(state);
sysdev_resume();
}
arch_suspend_enable_irqs();
BUG_ON(irqs_disabled());
Enable_cpus:
enable_nonboot_cpus();
Platform_wake:
if (suspend_ops->wake)
suspend_ops->wake();
Power_up_devices:
device_power_up(PMSG_RESUME);
Platfrom_finish:
if (suspend_ops->finish)
suspend_ops->finish();
return error;
}
/**
* suspend_devices_and_enter - suspend devices and enter the desired system
* sleep state.
* @state: state to enter
*/
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
if (!suspend_ops)
return -ENOSYS;
if (suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
goto Close;
}
suspend_console();
suspend_test_start();
error = device_suspend(PMSG_SUSPEND);
if (error) {
printk(KERN_ERR "PM: Some devices failed to suspend\n");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
if (suspend_test(TEST_DEVICES))
goto Recover_platform;
suspend_enter(state);
Resume_devices:
suspend_test_start();
device_resume(PMSG_RESUME);
suspend_test_finish("resume devices");
resume_console();
Close:
if (suspend_ops->end)
suspend_ops->end();
return error;
Recover_platform:
if (suspend_ops->recover)
suspend_ops->recover();
goto Resume_devices;
}
/**
* suspend_finish - Do final work before exiting suspend sequence.
*
* Call platform code to clean up, restart processes, and free the
* console that we've allocated. This is not called for suspend-to-disk.
*/
static void suspend_finish(void)
{
suspend_thaw_processes();
usermodehelper_enable();
pm_notifier_call_chain(PM_POST_SUSPEND);
pm_restore_console();
}
static const char * const pm_states[PM_SUSPEND_MAX] = {
[PM_SUSPEND_STANDBY] = "standby",
[PM_SUSPEND_MEM] = "mem",
};
static inline int valid_state(suspend_state_t state)
{
/* All states need lowlevel support and need to be valid
* to the lowlevel implementation, no valid callback
* implies that none are valid. */
if (!suspend_ops || !suspend_ops->valid || !suspend_ops->valid(state))
return 0;
return 1;
}
/**
* enter_state - Do common work of entering low-power state.
* @state: pm_state structure for state we're entering.
*
* Make sure we're the only ones trying to enter a sleep state. Fail
* if someone has beat us to it, since we don't want anything weird to
* happen when we wake up.
* Then, do the setup for suspend, enter the state, and cleaup (after
* we've woken up).
*/
static int enter_state(suspend_state_t state)
{
int error;
if (!valid_state(state))
return -ENODEV;
if (!mutex_trylock(&pm_mutex))
return -EBUSY;
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
error = suspend_prepare();
if (error)
goto Unlock;
if (suspend_test(TEST_FREEZER))
goto Finish;
pr_debug("PM: Entering %s sleep\n", pm_states[state]);
error = suspend_devices_and_enter(state);
Finish:
pr_debug("PM: Finishing wakeup.\n");
suspend_finish();
Unlock:
mutex_unlock(&pm_mutex);
return error;
}
/**
* pm_suspend - Externally visible function for suspending system.
* @state: Enumerated value of state to enter.
*
* Determine whether or not value is within range, get state
* structure, and enter (above).
*/
int pm_suspend(suspend_state_t state)
{
if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX)
return enter_state(state);
return -EINVAL;
}
EXPORT_SYMBOL(pm_suspend);
#endif /* CONFIG_SUSPEND */
struct kobject *power_kobj;
/**
* state - control system power state.
*
* show() returns what states are supported, which is hard-coded to
* 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and
* 'disk' (Suspend-to-Disk).
*
* store() accepts one of those strings, translates it into the
* proper enumerated value, and initiates a suspend transition.
*/
static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
char *s = buf;
#ifdef CONFIG_SUSPEND
int i;
for (i = 0; i < PM_SUSPEND_MAX; i++) {
if (pm_states[i] && valid_state(i))
s += sprintf(s,"%s ", pm_states[i]);
}
#endif
#ifdef CONFIG_HIBERNATION
s += sprintf(s, "%s\n", "disk");
#else
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
#endif
return (s - buf);
}
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_STANDBY;
const char * const *s;
#endif
char *p;
int len;
int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
/* First, check if we are requested to hibernate */
if (len == 4 && !strncmp(buf, "disk", len)) {
error = hibernate();
goto Exit;
}
#ifdef CONFIG_SUSPEND
for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
break;
}
if (state < PM_SUSPEND_MAX && *s)
error = enter_state(state);
#endif
Exit:
return error ? error : n;
}
power_attr(state);
#ifdef CONFIG_PM_TRACE
int pm_trace_enabled;
static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
return sprintf(buf, "%d\n", pm_trace_enabled);
}
static ssize_t
pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
int val;
if (sscanf(buf, "%d", &val) == 1) {
pm_trace_enabled = !!val;
return n;
}
return -EINVAL;
}
power_attr(pm_trace);
#endif /* CONFIG_PM_TRACE */
static struct attribute * g[] = {
&state_attr.attr,
#ifdef CONFIG_PM_TRACE
&pm_trace_attr.attr,
#endif
#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
&pm_test_attr.attr,
#endif
NULL,
};
static struct attribute_group attr_group = {
.attrs = g,
};
static int __init pm_init(void)
{
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
return sysfs_create_group(power_kobj, &attr_group);
}
core_initcall(pm_init);
#ifdef CONFIG_PM_TEST_SUSPEND
#include <linux/rtc.h>
/*
* To test system suspend, we need a hands-off mechanism to resume the
* system. RTCs wake alarms are a common self-contained mechanism.
*/
static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
{
static char err_readtime[] __initdata =
KERN_ERR "PM: can't read %s time, err %d\n";
static char err_wakealarm [] __initdata =
KERN_ERR "PM: can't set %s wakealarm, err %d\n";
static char err_suspend[] __initdata =
KERN_ERR "PM: suspend test failed, error %d\n";
static char info_test[] __initdata =
KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
unsigned long now;
struct rtc_wkalrm alm;
int status;
/* this may fail if the RTC hasn't been initialized */
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
return;
}
rtc_tm_to_time(&alm.time, &now);
memset(&alm, 0, sizeof alm);
rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
alm.enabled = true;
status = rtc_set_alarm(rtc, &alm);
if (status < 0) {
printk(err_wakealarm, dev_name(&rtc->dev), status);
return;
}
if (state == PM_SUSPEND_MEM) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
if (status == -ENODEV)
state = PM_SUSPEND_STANDBY;
}
if (state == PM_SUSPEND_STANDBY) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
}
if (status < 0)
printk(err_suspend, status);
/* Some platforms can't detect that the alarm triggered the
* wakeup, or (accordingly) disable it after it afterwards.
* It's supposed to give oneshot behavior; cope.
*/
alm.enabled = false;
rtc_set_alarm(rtc, &alm);
}
static int __init has_wakealarm(struct device *dev, void *name_ptr)
{
struct rtc_device *candidate = to_rtc_device(dev);
if (!candidate->ops->set_alarm)
return 0;
if (!device_may_wakeup(candidate->dev.parent))
return 0;
*(const char **)name_ptr = dev_name(dev);
return 1;
}
/*
* Kernel options like "test_suspend=mem" force suspend/resume sanity tests
* at startup time. They're normally disabled, for faster boot and because
* we can't know which states really work on this particular system.
*/
static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
static char warn_bad_state[] __initdata =
KERN_WARNING "PM: can't test '%s' suspend state\n";
static int __init setup_test_suspend(char *value)
{
unsigned i;
/* "=mem" ==> "mem" */
value++;
for (i = 0; i < PM_SUSPEND_MAX; i++) {
if (!pm_states[i])
continue;
if (strcmp(pm_states[i], value) != 0)
continue;
test_state = (__force suspend_state_t) i;
return 0;
}
printk(warn_bad_state, value);
return 0;
}
__setup("test_suspend", setup_test_suspend);
static int __init test_suspend(void)
{
static char warn_no_rtc[] __initdata =
KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
char *pony = NULL;
struct rtc_device *rtc = NULL;
/* PM is initialized by now; is that state testable? */
if (test_state == PM_SUSPEND_ON)
goto done;
if (!valid_state(test_state)) {
printk(warn_bad_state, pm_states[test_state]);
goto done;
}
/* RTCs have initialized by now too ... can we use one? */
class_find_device(rtc_class, NULL, &pony, has_wakealarm);
if (pony)
rtc = rtc_class_open(pony);
if (!rtc) {
printk(warn_no_rtc);
goto done;
}
/* go for it */
test_wakealarm(rtc, test_state);
rtc_class_close(rtc);
done:
return 0;
}
late_initcall(test_suspend);
#endif /* CONFIG_PM_TEST_SUSPEND */