1
linux/mm/compaction.c

798 lines
21 KiB
C
Raw Normal View History

/*
* linux/mm/compaction.c
*
* Memory compaction for the reduction of external fragmentation. Note that
* this heavily depends upon page migration to do all the real heavy
* lifting
*
* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
*/
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
/*
* compact_control is used to track pages being migrated and the free pages
* they are being migrated to during memory compaction. The free_pfn starts
* at the end of a zone and migrate_pfn begins at the start. Movable pages
* are moved to the end of a zone during a compaction run and the run
* completes when free_pfn <= migrate_pfn
*/
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
unsigned long nr_freepages; /* Number of isolated free pages */
unsigned long nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
};
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
unsigned long count = 0;
list_for_each_entry_safe(page, next, freelist, lru) {
list_del(&page->lru);
__free_page(page);
count++;
}
return count;
}
/* Isolate free pages onto a private freelist. Must hold zone->lock */
static unsigned long isolate_freepages_block(struct zone *zone,
unsigned long blockpfn,
struct list_head *freelist)
{
unsigned long zone_end_pfn, end_pfn;
int nr_scanned = 0, total_isolated = 0;
struct page *cursor;
/* Get the last PFN we should scan for free pages at */
zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
/* Find the first usable PFN in the block to initialse page cursor */
for (; blockpfn < end_pfn; blockpfn++) {
if (pfn_valid_within(blockpfn))
break;
}
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. This assumes the block is valid */
for (; blockpfn < end_pfn; blockpfn++, cursor++) {
int isolated, i;
struct page *page = cursor;
if (!pfn_valid_within(blockpfn))
continue;
nr_scanned++;
if (!PageBuddy(page))
continue;
/* Found a free page, break it into order-0 pages */
isolated = split_free_page(page);
total_isolated += isolated;
for (i = 0; i < isolated; i++) {
list_add(&page->lru, freelist);
page++;
}
/* If a page was split, advance to the end of it */
if (isolated) {
blockpfn += isolated - 1;
cursor += isolated - 1;
}
}
trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
return total_isolated;
}
/* Returns true if the page is within a block suitable for migration to */
static bool suitable_migration_target(struct page *page)
{
int migratetype = get_pageblock_migratetype(page);
/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE)
return false;
/* If the page is a large free page, then allow migration */
if (PageBuddy(page) && page_order(page) >= pageblock_order)
return true;
/* If the block is MIGRATE_MOVABLE, allow migration */
if (migratetype == MIGRATE_MOVABLE)
return true;
/* Otherwise skip the block */
return false;
}
/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
static void isolate_freepages(struct zone *zone,
struct compact_control *cc)
{
struct page *page;
unsigned long high_pfn, low_pfn, pfn;
unsigned long flags;
int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
* Initialise the free scanner. The starting point is where we last
* scanned from (or the end of the zone if starting). The low point
* is the end of the pageblock the migration scanner is using.
*/
pfn = cc->free_pfn;
low_pfn = cc->migrate_pfn + pageblock_nr_pages;
/*
* Take care that if the migration scanner is at the end of the zone
* that the free scanner does not accidentally move to the next zone
* in the next isolation cycle.
*/
high_pfn = min(low_pfn, pfn);
/*
* Isolate free pages until enough are available to migrate the
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
pfn -= pageblock_nr_pages) {
unsigned long isolated;
if (!pfn_valid(pfn))
continue;
/*
* Check for overlapping nodes/zones. It's possible on some
* configurations to have a setup like
* node0 node1 node0
* i.e. it's possible that all pages within a zones range of
* pages do not belong to a single zone.
*/
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
continue;
/* Check the block is suitable for migration */
if (!suitable_migration_target(page))
continue;
mm: compaction: minimise the time IRQs are disabled while isolating free pages compaction_alloc() isolates free pages to be used as migration targets. While its scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Analysis showed that IRQs were in fact being disabled for substantial time. A simple test was run using large anonymous mappings with transparent hugepage support enabled to trigger frequent compactions. A monitor sampled what the worst IRQ-off latencies were and a post-processing tool found the following; Total sampled time IRQs off (not real total time): 22355 Event compaction_alloc..compaction_alloc 8409 us count 1 Event compaction_alloc..compaction_alloc 7341 us count 1 Event compaction_alloc..compaction_alloc 2463 us count 1 Event compaction_alloc..compaction_alloc 2054 us count 1 Event shrink_inactive_list..shrink_zone 1864 us count 1 Event shrink_inactive_list..shrink_zone 88 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __make_request..__blk_run_queue 24 us count 1 Event __alloc_pages_nodemask..__alloc_pages_nodemask 6 us count 1 i.e. compaction is disabled IRQs for a prolonged period of time - 8ms in one instance. The full report generated by the tool can be found at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-vanilla-micro.report This patch reduces the time IRQs are disabled by simply disabling IRQs at the last possible minute. An updated IRQs-off summary report then looks like; Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 A full report is again available at http://www.csn.ul.ie/~mel/postings/minfree-20110225/irqsoff-minimiseirq-free-v1r4-micro.report As should be obvious, IRQ disabled latencies due to compaction are almost elimimnated for this particular test. [aarcange@redhat.com: Fix initialisation of isolated] Signed-off-by: Mel Gorman <mel@csn.ul.ie> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujisu.com> Reviewed-by: Minchan Kim <minchan.kim@gmail.com> Acked-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Arthur Marsh <arthur.marsh@internode.on.net> Cc: Clemens Ladisch <cladisch@googlemail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:08 -07:00
/*
* Found a block suitable for isolating free pages from. Now
* we disabled interrupts, double check things are ok and
* isolate the pages. This is to minimise the time IRQs
* are disabled
*/
isolated = 0;
spin_lock_irqsave(&zone->lock, flags);
if (suitable_migration_target(page)) {
isolated = isolate_freepages_block(zone, pfn, freelist);
nr_freepages += isolated;
}
spin_unlock_irqrestore(&zone->lock, flags);
/*
* Record the highest PFN we isolated pages from. When next
* looking for free pages, the search will restart here as
* page migration may have returned some pages to the allocator
*/
if (isolated)
high_pfn = max(high_pfn, pfn);
}
/* split_free_page does not map the pages */
list_for_each_entry(page, freelist, lru) {
arch_alloc_page(page, 0);
kernel_map_pages(page, 1, 1);
}
cc->free_pfn = high_pfn;
cc->nr_freepages = nr_freepages;
}
/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
unsigned int count[2] = { 0, };
list_for_each_entry(page, &cc->migratepages, lru)
count[!!page_is_file_cache(page)]++;
__mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
__mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
unsigned long active, inactive, isolated;
inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_ANON);
active = zone_page_state(zone, NR_ACTIVE_FILE) +
zone_page_state(zone, NR_ACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
zone_page_state(zone, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
}
/* possible outcome of isolate_migratepages */
typedef enum {
ISOLATE_ABORT, /* Abort compaction now */
ISOLATE_NONE, /* No pages isolated, continue scanning */
ISOLATE_SUCCESS, /* Pages isolated, migrate */
} isolate_migrate_t;
/*
* Isolate all pages that can be migrated from the block pointed to by
* the migrate scanner within compact_control.
*/
static isolate_migrate_t isolate_migratepages(struct zone *zone,
struct compact_control *cc)
{
unsigned long low_pfn, end_pfn;
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
/* Only scan within a pageblock boundary */
end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
/* Do not cross the free scanner or scan within a memory hole */
if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
cc->migrate_pfn = end_pfn;
return ISOLATE_NONE;
}
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
while (unlikely(too_many_isolated(zone))) {
/* async migration should just abort */
if (!cc->sync)
return ISOLATE_ABORT;
congestion_wait(BLK_RW_ASYNC, HZ/10);
if (fatal_signal_pending(current))
return ISOLATE_ABORT;
}
/* Time to isolate some pages for migration */
mm: compaction: minimise the time IRQs are disabled while isolating pages for migration compaction_alloc() isolates pages for migration in isolate_migratepages. While it's scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Tests show this to be true for the most part but contention times on the LRU lock can be increased. Before this patch, the IRQ disabled times for a simple test looked like Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 This patch reduces the worst-case IRQs-disabled latencies by releasing the lock every SWAP_CLUSTER_MAX pages that are scanned and releasing the CPU if necessary. The cost of this is that the processing performing compaction will be slower but IRQs being disabled for too long a time has worse consequences as the following report shows; Total sampled time IRQs off (not real total time): 4367 Event shrink_inactive_list..shrink_zone 881 us count 1 Event shrink_inactive_list..shrink_zone 875 us count 1 Event shrink_inactive_list..shrink_zone 868 us count 1 Event shrink_inactive_list..shrink_zone 555 us count 1 Event split_huge_page..add_to_swap 495 us count 1 Event compact_zone..compact_zone_order 269 us count 1 Event split_huge_page..add_to_swap 266 us count 1 Event shrink_inactive_list..shrink_zone 85 us count 1 Event save_args..call_softirq 36 us count 2 Event __wake_up..__wake_up 1 us count 1 [akpm@linux-foundation.org: simplify with s/unlocked/locked/] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Arthur Marsh <arthur.marsh@internode.on.net> Cc: Clemens Ladisch <cladisch@googlemail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:10 -07:00
cond_resched();
spin_lock_irq(&zone->lru_lock);
for (; low_pfn < end_pfn; low_pfn++) {
struct page *page;
mm: compaction: minimise the time IRQs are disabled while isolating pages for migration compaction_alloc() isolates pages for migration in isolate_migratepages. While it's scanning, IRQs are disabled on the mistaken assumption the scanning should be short. Tests show this to be true for the most part but contention times on the LRU lock can be increased. Before this patch, the IRQ disabled times for a simple test looked like Total sampled time IRQs off (not real total time): 5493 Event shrink_inactive_list..shrink_zone 1596 us count 1 Event shrink_inactive_list..shrink_zone 1530 us count 1 Event shrink_inactive_list..shrink_zone 956 us count 1 Event shrink_inactive_list..shrink_zone 541 us count 1 Event shrink_inactive_list..shrink_zone 531 us count 1 Event split_huge_page..add_to_swap 232 us count 1 Event save_args..call_softirq 36 us count 1 Event save_args..call_softirq 35 us count 2 Event __wake_up..__wake_up 1 us count 1 This patch reduces the worst-case IRQs-disabled latencies by releasing the lock every SWAP_CLUSTER_MAX pages that are scanned and releasing the CPU if necessary. The cost of this is that the processing performing compaction will be slower but IRQs being disabled for too long a time has worse consequences as the following report shows; Total sampled time IRQs off (not real total time): 4367 Event shrink_inactive_list..shrink_zone 881 us count 1 Event shrink_inactive_list..shrink_zone 875 us count 1 Event shrink_inactive_list..shrink_zone 868 us count 1 Event shrink_inactive_list..shrink_zone 555 us count 1 Event split_huge_page..add_to_swap 495 us count 1 Event compact_zone..compact_zone_order 269 us count 1 Event split_huge_page..add_to_swap 266 us count 1 Event shrink_inactive_list..shrink_zone 85 us count 1 Event save_args..call_softirq 36 us count 2 Event __wake_up..__wake_up 1 us count 1 [akpm@linux-foundation.org: simplify with s/unlocked/locked/] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Arthur Marsh <arthur.marsh@internode.on.net> Cc: Clemens Ladisch <cladisch@googlemail.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:33:10 -07:00
bool locked = true;
/* give a chance to irqs before checking need_resched() */
if (!((low_pfn+1) % SWAP_CLUSTER_MAX)) {
spin_unlock_irq(&zone->lru_lock);
locked = false;
}
if (need_resched() || spin_is_contended(&zone->lru_lock)) {
if (locked)
spin_unlock_irq(&zone->lru_lock);
cond_resched();
spin_lock_irq(&zone->lru_lock);
if (fatal_signal_pending(current))
break;
} else if (!locked)
spin_lock_irq(&zone->lru_lock);
mm: compaction: check pfn_valid when entering a new MAX_ORDER_NR_PAGES block during isolation for migration When isolating for migration, migration starts at the start of a zone which is not necessarily pageblock aligned. Further, it stops isolating when COMPACT_CLUSTER_MAX pages are isolated so migrate_pfn is generally not aligned. This allows isolate_migratepages() to call pfn_to_page() on an invalid PFN which can result in a crash. This was originally reported against a 3.0-based kernel with the following trace in a crash dump. PID: 9902 TASK: d47aecd0 CPU: 0 COMMAND: "memcg_process_s" #0 [d72d3ad0] crash_kexec at c028cfdb #1 [d72d3b24] oops_end at c05c5322 #2 [d72d3b38] __bad_area_nosemaphore at c0227e60 #3 [d72d3bec] bad_area at c0227fb6 #4 [d72d3c00] do_page_fault at c05c72ec #5 [d72d3c80] error_code (via page_fault) at c05c47a4 EAX: 00000000 EBX: 000c0000 ECX: 00000001 EDX: 00000807 EBP: 000c0000 DS: 007b ESI: 00000001 ES: 007b EDI: f3000a80 GS: 6f50 CS: 0060 EIP: c030b15a ERR: ffffffff EFLAGS: 00010002 #6 [d72d3cb4] isolate_migratepages at c030b15a #7 [d72d3d14] zone_watermark_ok at c02d26cb #8 [d72d3d2c] compact_zone at c030b8de #9 [d72d3d68] compact_zone_order at c030bba1 #10 [d72d3db4] try_to_compact_pages at c030bc84 #11 [d72d3ddc] __alloc_pages_direct_compact at c02d61e7 #12 [d72d3e08] __alloc_pages_slowpath at c02d66c7 #13 [d72d3e78] __alloc_pages_nodemask at c02d6a97 #14 [d72d3eb8] alloc_pages_vma at c030a845 #15 [d72d3ed4] do_huge_pmd_anonymous_page at c03178eb #16 [d72d3f00] handle_mm_fault at c02f36c6 #17 [d72d3f30] do_page_fault at c05c70ed #18 [d72d3fb0] error_code (via page_fault) at c05c47a4 EAX: b71ff000 EBX: 00000001 ECX: 00001600 EDX: 00000431 DS: 007b ESI: 08048950 ES: 007b EDI: bfaa3788 SS: 007b ESP: bfaa36e0 EBP: bfaa3828 GS: 6f50 CS: 0073 EIP: 080487c8 ERR: ffffffff EFLAGS: 00010202 It was also reported by Herbert van den Bergh against 3.1-based kernel with the following snippet from the console log. BUG: unable to handle kernel paging request at 01c00008 IP: [<c0522399>] isolate_migratepages+0x119/0x390 *pdpt = 000000002f7ce001 *pde = 0000000000000000 It is expected that it also affects 3.2.x and current mainline. The problem is that pfn_valid is only called on the first PFN being checked and that PFN is not necessarily aligned. Lets say we have a case like this H = MAX_ORDER_NR_PAGES boundary | = pageblock boundary m = cc->migrate_pfn f = cc->free_pfn o = memory hole H------|------H------|----m-Hoooooo|ooooooH-f----|------H The migrate_pfn is just below a memory hole and the free scanner is beyond the hole. When isolate_migratepages started, it scans from migrate_pfn to migrate_pfn+pageblock_nr_pages which is now in a memory hole. It checks pfn_valid() on the first PFN but then scans into the hole where there are not necessarily valid struct pages. This patch ensures that isolate_migratepages calls pfn_valid when necessary. Reported-by: Herbert van den Bergh <herbert.van.den.bergh@oracle.com> Tested-by: Herbert van den Bergh <herbert.van.den.bergh@oracle.com> Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Michal Nazarewicz <mina86@mina86.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-02-03 16:37:18 -07:00
/*
* migrate_pfn does not necessarily start aligned to a
* pageblock. Ensure that pfn_valid is called when moving
* into a new MAX_ORDER_NR_PAGES range in case of large
* memory holes within the zone
*/
if ((low_pfn & (MAX_ORDER_NR_PAGES - 1)) == 0) {
if (!pfn_valid(low_pfn)) {
low_pfn += MAX_ORDER_NR_PAGES - 1;
continue;
}
}
if (!pfn_valid_within(low_pfn))
continue;
nr_scanned++;
mm: compaction: check for overlapping nodes during isolation for migration When isolating pages for migration, migration starts at the start of a zone while the free scanner starts at the end of the zone. Migration avoids entering a new zone by never going beyond the free scanned. Unfortunately, in very rare cases nodes can overlap. When this happens, migration isolates pages without the LRU lock held, corrupting lists which will trigger errors in reclaim or during page free such as in the following oops BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [<ffffffff810f795c>] free_pcppages_bulk+0xcc/0x450 PGD 1dda554067 PUD 1e1cb58067 PMD 0 Oops: 0000 [#1] SMP CPU 37 Pid: 17088, comm: memcg_process_s Tainted: G X RIP: free_pcppages_bulk+0xcc/0x450 Process memcg_process_s (pid: 17088, threadinfo ffff881c2926e000, task ffff881c2926c0c0) Call Trace: free_hot_cold_page+0x17e/0x1f0 __pagevec_free+0x90/0xb0 release_pages+0x22a/0x260 pagevec_lru_move_fn+0xf3/0x110 putback_lru_page+0x66/0xe0 unmap_and_move+0x156/0x180 migrate_pages+0x9e/0x1b0 compact_zone+0x1f3/0x2f0 compact_zone_order+0xa2/0xe0 try_to_compact_pages+0xdf/0x110 __alloc_pages_direct_compact+0xee/0x1c0 __alloc_pages_slowpath+0x370/0x830 __alloc_pages_nodemask+0x1b1/0x1c0 alloc_pages_vma+0x9b/0x160 do_huge_pmd_anonymous_page+0x160/0x270 do_page_fault+0x207/0x4c0 page_fault+0x25/0x30 The "X" in the taint flag means that external modules were loaded but but is unrelated to the bug triggering. The real problem was because the PFN layout looks like this Zone PFN ranges: DMA 0x00000010 -> 0x00001000 DMA32 0x00001000 -> 0x00100000 Normal 0x00100000 -> 0x01e80000 Movable zone start PFN for each node early_node_map[14] active PFN ranges 0: 0x00000010 -> 0x0000009b 0: 0x00000100 -> 0x0007a1ec 0: 0x0007a354 -> 0x0007a379 0: 0x0007f7ff -> 0x0007f800 0: 0x00100000 -> 0x00680000 1: 0x00680000 -> 0x00e80000 0: 0x00e80000 -> 0x01080000 1: 0x01080000 -> 0x01280000 0: 0x01280000 -> 0x01480000 1: 0x01480000 -> 0x01680000 0: 0x01680000 -> 0x01880000 1: 0x01880000 -> 0x01a80000 0: 0x01a80000 -> 0x01c80000 1: 0x01c80000 -> 0x01e80000 The fix is straight-forward. isolate_migratepages() has to make a similar check to isolate_freepage to ensure that it never isolates pages from a zone it does not hold the LRU lock for. This was discovered in a 3.0-based kernel but it affects 3.1.x, 3.2.x and current mainline. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Michal Nazarewicz <mina86@mina86.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-02-08 18:13:38 -07:00
/*
* Get the page and ensure the page is within the same zone.
* See the comment in isolate_freepages about overlapping
* nodes. It is deliberate that the new zone lock is not taken
* as memory compaction should not move pages between nodes.
*/
page = pfn_to_page(low_pfn);
mm: compaction: check for overlapping nodes during isolation for migration When isolating pages for migration, migration starts at the start of a zone while the free scanner starts at the end of the zone. Migration avoids entering a new zone by never going beyond the free scanned. Unfortunately, in very rare cases nodes can overlap. When this happens, migration isolates pages without the LRU lock held, corrupting lists which will trigger errors in reclaim or during page free such as in the following oops BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: [<ffffffff810f795c>] free_pcppages_bulk+0xcc/0x450 PGD 1dda554067 PUD 1e1cb58067 PMD 0 Oops: 0000 [#1] SMP CPU 37 Pid: 17088, comm: memcg_process_s Tainted: G X RIP: free_pcppages_bulk+0xcc/0x450 Process memcg_process_s (pid: 17088, threadinfo ffff881c2926e000, task ffff881c2926c0c0) Call Trace: free_hot_cold_page+0x17e/0x1f0 __pagevec_free+0x90/0xb0 release_pages+0x22a/0x260 pagevec_lru_move_fn+0xf3/0x110 putback_lru_page+0x66/0xe0 unmap_and_move+0x156/0x180 migrate_pages+0x9e/0x1b0 compact_zone+0x1f3/0x2f0 compact_zone_order+0xa2/0xe0 try_to_compact_pages+0xdf/0x110 __alloc_pages_direct_compact+0xee/0x1c0 __alloc_pages_slowpath+0x370/0x830 __alloc_pages_nodemask+0x1b1/0x1c0 alloc_pages_vma+0x9b/0x160 do_huge_pmd_anonymous_page+0x160/0x270 do_page_fault+0x207/0x4c0 page_fault+0x25/0x30 The "X" in the taint flag means that external modules were loaded but but is unrelated to the bug triggering. The real problem was because the PFN layout looks like this Zone PFN ranges: DMA 0x00000010 -> 0x00001000 DMA32 0x00001000 -> 0x00100000 Normal 0x00100000 -> 0x01e80000 Movable zone start PFN for each node early_node_map[14] active PFN ranges 0: 0x00000010 -> 0x0000009b 0: 0x00000100 -> 0x0007a1ec 0: 0x0007a354 -> 0x0007a379 0: 0x0007f7ff -> 0x0007f800 0: 0x00100000 -> 0x00680000 1: 0x00680000 -> 0x00e80000 0: 0x00e80000 -> 0x01080000 1: 0x01080000 -> 0x01280000 0: 0x01280000 -> 0x01480000 1: 0x01480000 -> 0x01680000 0: 0x01680000 -> 0x01880000 1: 0x01880000 -> 0x01a80000 0: 0x01a80000 -> 0x01c80000 1: 0x01c80000 -> 0x01e80000 The fix is straight-forward. isolate_migratepages() has to make a similar check to isolate_freepage to ensure that it never isolates pages from a zone it does not hold the LRU lock for. This was discovered in a 3.0-based kernel but it affects 3.1.x, 3.2.x and current mainline. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Michal Nazarewicz <mina86@mina86.com> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-02-08 18:13:38 -07:00
if (page_zone(page) != zone)
continue;
/* Skip if free */
if (PageBuddy(page))
continue;
/*
* For async migration, also only scan in MOVABLE blocks. Async
* migration is optimistic to see if the minimum amount of work
* satisfies the allocation
*/
pageblock_nr = low_pfn >> pageblock_order;
if (!cc->sync && last_pageblock_nr != pageblock_nr &&
get_pageblock_migratetype(page) != MIGRATE_MOVABLE) {
low_pfn += pageblock_nr_pages;
low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;
last_pageblock_nr = pageblock_nr;
continue;
}
if (!PageLRU(page))
continue;
/*
* PageLRU is set, and lru_lock excludes isolation,
* splitting and collapsing (collapsing has already
* happened if PageLRU is set).
*/
if (PageTransHuge(page)) {
low_pfn += (1 << compound_order(page)) - 1;
continue;
}
if (!cc->sync)
mode |= ISOLATE_ASYNC_MIGRATE;
/* Try isolate the page */
if (__isolate_lru_page(page, mode, 0) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
/* Successfully isolated */
del_page_from_lru_list(zone, page, page_lru(page));
list_add(&page->lru, migratelist);
cc->nr_migratepages++;
nr_isolated++;
/* Avoid isolating too much */
if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
++low_pfn;
break;
}
}
acct_isolated(zone, cc);
spin_unlock_irq(&zone->lru_lock);
cc->migrate_pfn = low_pfn;
trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
return ISOLATE_SUCCESS;
}
/*
* This is a migrate-callback that "allocates" freepages by taking pages
* from the isolated freelists in the block we are migrating to.
*/
static struct page *compaction_alloc(struct page *migratepage,
unsigned long data,
int **result)
{
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
/* Isolate free pages if necessary */
if (list_empty(&cc->freepages)) {
isolate_freepages(cc->zone, cc);
if (list_empty(&cc->freepages))
return NULL;
}
freepage = list_entry(cc->freepages.next, struct page, lru);
list_del(&freepage->lru);
cc->nr_freepages--;
return freepage;
}
/*
* We cannot control nr_migratepages and nr_freepages fully when migration is
* running as migrate_pages() has no knowledge of compact_control. When
* migration is complete, we count the number of pages on the lists by hand.
*/
static void update_nr_listpages(struct compact_control *cc)
{
int nr_migratepages = 0;
int nr_freepages = 0;
struct page *page;
list_for_each_entry(page, &cc->migratepages, lru)
nr_migratepages++;
list_for_each_entry(page, &cc->freepages, lru)
nr_freepages++;
cc->nr_migratepages = nr_migratepages;
cc->nr_freepages = nr_freepages;
}
static int compact_finished(struct zone *zone,
struct compact_control *cc)
{
unsigned int order;
unsigned long watermark;
if (fatal_signal_pending(current))
return COMPACT_PARTIAL;
/* Compaction run completes if the migrate and free scanner meet */
if (cc->free_pfn <= cc->migrate_pfn)
return COMPACT_COMPLETE;
/*
* order == -1 is expected when compacting via
* /proc/sys/vm/compact_memory
*/
if (cc->order == -1)
return COMPACT_CONTINUE;
/* Compaction run is not finished if the watermark is not met */
watermark = low_wmark_pages(zone);
watermark += (1 << cc->order);
if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
for (order = cc->order; order < MAX_ORDER; order++) {
/* Job done if page is free of the right migratetype */
if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
if (order >= pageblock_order && zone->free_area[order].nr_free)
return COMPACT_PARTIAL;
}
return COMPACT_CONTINUE;
}
/*
* compaction_suitable: Is this suitable to run compaction on this zone now?
* Returns
* COMPACT_SKIPPED - If there are too few free pages for compaction
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
unsigned long compaction_suitable(struct zone *zone, int order)
{
int fragindex;
unsigned long watermark;
/*
* order == -1 is expected when compacting via
* /proc/sys/vm/compact_memory
*/
if (order == -1)
return COMPACT_CONTINUE;
/*
* Watermarks for order-0 must be met for compaction. Note the 2UL.
* This is because during migration, copies of pages need to be
* allocated and for a short time, the footprint is higher
*/
watermark = low_wmark_pages(zone) + (2UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
return COMPACT_SKIPPED;
/*
* fragmentation index determines if allocation failures are due to
* low memory or external fragmentation
*
* index of -1000 implies allocations might succeed depending on
* watermarks
* index towards 0 implies failure is due to lack of memory
* index towards 1000 implies failure is due to fragmentation
*
* Only compact if a failure would be due to fragmentation.
*/
fragindex = fragmentation_index(zone, order);
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
return COMPACT_SKIPPED;
if (fragindex == -1000 && zone_watermark_ok(zone, order, watermark,
0, 0))
return COMPACT_PARTIAL;
return COMPACT_CONTINUE;
}
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
ret = compaction_suitable(zone, cc->order);
switch (ret) {
case COMPACT_PARTIAL:
case COMPACT_SKIPPED:
/* Compaction is likely to fail */
return ret;
case COMPACT_CONTINUE:
/* Fall through to compaction */
;
}
/* Setup to move all movable pages to the end of the zone */
cc->migrate_pfn = zone->zone_start_pfn;
cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
cc->free_pfn &= ~(pageblock_nr_pages-1);
migrate_prep_local();
while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
unsigned long nr_migrate, nr_remaining;
int err;
switch (isolate_migratepages(zone, cc)) {
case ISOLATE_ABORT:
ret = COMPACT_PARTIAL;
goto out;
case ISOLATE_NONE:
continue;
case ISOLATE_SUCCESS:
;
}
nr_migrate = cc->nr_migratepages;
err = migrate_pages(&cc->migratepages, compaction_alloc,
(unsigned long)cc, false,
cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC);
update_nr_listpages(cc);
nr_remaining = cc->nr_migratepages;
count_vm_event(COMPACTBLOCKS);
count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
if (nr_remaining)
count_vm_events(COMPACTPAGEFAILED, nr_remaining);
trace_mm_compaction_migratepages(nr_migrate - nr_remaining,
nr_remaining);
/* Release LRU pages not migrated */
if (err) {
putback_lru_pages(&cc->migratepages);
cc->nr_migratepages = 0;
}
}
out:
/* Release free pages and check accounting */
cc->nr_freepages -= release_freepages(&cc->freepages);
VM_BUG_ON(cc->nr_freepages != 0);
return ret;
}
static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
mm: compaction: prevent kswapd compacting memory to reduce CPU usage This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC order > 0") due to reports stating that kswapd CPU usage was higher and IRQs were being disabled more frequently. This was reported at http://www.spinics.net/linux/fedora/alsa-user/msg09885.html. Without this patch applied, CPU usage by kswapd hovers around the 20% mark according to the tester (Arthur Marsh: http://www.spinics.net/linux/fedora/alsa-user/msg09899.html). With this patch applied, it's around 2%. The problem is not related to THP which specifies __GFP_NO_KSWAPD but is triggered by high-order allocations hitting the low watermark for their order and waking kswapd on kernels with CONFIG_COMPACTION set. The most common trigger for this is network cards configured for jumbo frames but it's also possible it'll be triggered by fork-heavy workloads (order-1) and some wireless cards which depend on order-1 allocations. The symptoms for the user will be high CPU usage by kswapd in low-memory situations which could be confused with another writeback problem. While a patch like 5a03b051 may be reintroduced in the future, this patch plays it safe for now and reverts it. [mel@csn.ul.ie: Beefed up the changelog] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reported-by: Arthur Marsh <arthur.marsh@internode.on.net> Tested-by: Arthur Marsh <arthur.marsh@internode.on.net> Cc: <stable@kernel.org> [2.6.38.1] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:30:38 -07:00
bool sync)
{
struct compact_control cc = {
.nr_freepages = 0,
.nr_migratepages = 0,
.order = order,
.migratetype = allocflags_to_migratetype(gfp_mask),
.zone = zone,
.sync = sync,
};
INIT_LIST_HEAD(&cc.freepages);
INIT_LIST_HEAD(&cc.migratepages);
return compact_zone(zone, &cc);
}
int sysctl_extfrag_threshold = 500;
/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
* @zonelist: The zonelist used for the current allocation
* @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
* @nodemask: The allowed nodes to allocate from
* @sync: Whether migration is synchronous or not
*
* This is the main entry point for direct page compaction.
*/
unsigned long try_to_compact_pages(struct zonelist *zonelist,
int order, gfp_t gfp_mask, nodemask_t *nodemask,
bool sync)
{
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
struct zone *zone;
int rc = COMPACT_SKIPPED;
/*
* Check whether it is worth even starting compaction. The order check is
* made because an assumption is made that the page allocator can satisfy
* the "cheaper" orders without taking special steps
*/
if (!order || !may_enter_fs || !may_perform_io)
return rc;
count_vm_event(COMPACTSTALL);
/* Compact each zone in the list */
for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
nodemask) {
int status;
mm: compaction: prevent kswapd compacting memory to reduce CPU usage This patch reverts 5a03b051 ("thp: use compaction in kswapd for GFP_ATOMIC order > 0") due to reports stating that kswapd CPU usage was higher and IRQs were being disabled more frequently. This was reported at http://www.spinics.net/linux/fedora/alsa-user/msg09885.html. Without this patch applied, CPU usage by kswapd hovers around the 20% mark according to the tester (Arthur Marsh: http://www.spinics.net/linux/fedora/alsa-user/msg09899.html). With this patch applied, it's around 2%. The problem is not related to THP which specifies __GFP_NO_KSWAPD but is triggered by high-order allocations hitting the low watermark for their order and waking kswapd on kernels with CONFIG_COMPACTION set. The most common trigger for this is network cards configured for jumbo frames but it's also possible it'll be triggered by fork-heavy workloads (order-1) and some wireless cards which depend on order-1 allocations. The symptoms for the user will be high CPU usage by kswapd in low-memory situations which could be confused with another writeback problem. While a patch like 5a03b051 may be reintroduced in the future, this patch plays it safe for now and reverts it. [mel@csn.ul.ie: Beefed up the changelog] Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Mel Gorman <mel@csn.ul.ie> Reported-by: Arthur Marsh <arthur.marsh@internode.on.net> Tested-by: Arthur Marsh <arthur.marsh@internode.on.net> Cc: <stable@kernel.org> [2.6.38.1] Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-03-22 16:30:38 -07:00
status = compact_zone_order(zone, order, gfp_mask, sync);
rc = max(status, rc);
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0))
break;
}
return rc;
}
/* Compact all zones within a node */
static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc)
{
int zoneid;
struct zone *zone;
for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
cc->nr_freepages = 0;
cc->nr_migratepages = 0;
cc->zone = zone;
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
if (cc->order == -1 || !compaction_deferred(zone, cc->order))
compact_zone(zone, cc);
if (cc->order > 0) {
int ok = zone_watermark_ok(zone, cc->order,
low_wmark_pages(zone), 0, 0);
if (ok && cc->order > zone->compact_order_failed)
zone->compact_order_failed = cc->order + 1;
/* Currently async compaction is never deferred. */
else if (!ok && cc->sync)
defer_compaction(zone, cc->order);
}
VM_BUG_ON(!list_empty(&cc->freepages));
VM_BUG_ON(!list_empty(&cc->migratepages));
}
return 0;
}
int compact_pgdat(pg_data_t *pgdat, int order)
{
struct compact_control cc = {
.order = order,
.sync = false,
};
return __compact_pgdat(pgdat, &cc);
}
static int compact_node(int nid)
{
struct compact_control cc = {
.order = -1,
.sync = true,
};
compact_pgdat: workaround lockdep warning in kswapd I get this lockdep warning from swapping load on linux-next, due to "vmscan: kswapd carefully call compaction". ================================= [ INFO: inconsistent lock state ] 3.3.0-rc2-next-20120201 #5 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/28 [HC0[0]:SC0[0]:HE1:SE1] takes: (pcpu_alloc_mutex){+.+.?.}, at: [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 {RECLAIM_FS-ON-W} state was registered at: [<ffffffff81099b75>] mark_held_locks+0xd7/0x103 [<ffffffff8109a13c>] lockdep_trace_alloc+0x85/0x9e [<ffffffff810f6bdc>] __kmalloc+0x6c/0x14b [<ffffffff810d57fd>] pcpu_mem_zalloc+0x59/0x62 [<ffffffff810d5d16>] pcpu_extend_area_map+0x26/0xb1 [<ffffffff810d679f>] pcpu_alloc+0x182/0x325 [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8142ebfd>] snmp_mib_init+0x1e/0x2e [<ffffffff8185cd8d>] ipv4_mib_init_net+0x7a/0x184 [<ffffffff813dc963>] ops_init.clone.0+0x6b/0x73 [<ffffffff813dc9cc>] register_pernet_operations+0x61/0xa0 [<ffffffff813dca8e>] register_pernet_subsys+0x29/0x42 [<ffffffff8185d044>] inet_init+0x1ad/0x252 [<ffffffff810002e3>] do_one_initcall+0x7a/0x12f [<ffffffff81832bc5>] kernel_init+0x9d/0x11e [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 irq event stamp: 656613 hardirqs last enabled at (656613): [<ffffffff814e0ddc>] __mutex_unlock_slowpath+0x104/0x128 hardirqs last disabled at (656612): [<ffffffff814e0d34>] __mutex_unlock_slowpath+0x5c/0x128 softirqs last enabled at (655568): [<ffffffff8105b4a5>] __do_softirq+0x120/0x136 softirqs last disabled at (654757): [<ffffffff814e52dc>] call_softirq+0x1c/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pcpu_alloc_mutex); <Interrupt> lock(pcpu_alloc_mutex); *** DEADLOCK *** no locks held by kswapd0/28. stack backtrace: Pid: 28, comm: kswapd0 Not tainted 3.3.0-rc2-next-20120201 #5 Call Trace: [<ffffffff810981f4>] print_usage_bug+0x1bf/0x1d0 [<ffffffff81096c3e>] ? print_irq_inversion_bug+0x1d9/0x1d9 [<ffffffff810982c0>] mark_lock_irq+0xbb/0x22e [<ffffffff810c5399>] ? free_hot_cold_page+0x13d/0x14f [<ffffffff81098684>] mark_lock+0x251/0x331 [<ffffffff81098893>] mark_irqflags+0x12f/0x141 [<ffffffff81098e32>] __lock_acquire+0x58d/0x753 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81099433>] lock_acquire+0x54/0x6a [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff8107a5b8>] ? add_preempt_count+0xa9/0xae [<ffffffff814e0a21>] mutex_lock_nested+0x5e/0x315 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81098f81>] ? __lock_acquire+0x6dc/0x753 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8106c35e>] schedule_on_each_cpu+0x23/0x110 [<ffffffff810c9fcb>] lru_add_drain_all+0x10/0x12 [<ffffffff810f126f>] __compact_pgdat+0x20/0x182 [<ffffffff810f15c2>] compact_pgdat+0x27/0x29 [<ffffffff810c306b>] ? zone_watermark_ok+0x1a/0x1c [<ffffffff810cdf6f>] balance_pgdat+0x732/0x751 [<ffffffff810ce0ed>] kswapd+0x15f/0x178 [<ffffffff810cdf8e>] ? balance_pgdat+0x751/0x751 [<ffffffff8106fd11>] kthread+0x84/0x8c [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 [<ffffffff810787ed>] ? finish_task_switch+0x85/0xea [<ffffffff814e3861>] ? retint_restore_args+0xe/0xe [<ffffffff8106fc8d>] ? __init_kthread_worker+0x56/0x56 [<ffffffff814e51e0>] ? gs_change+0xb/0xb The RECLAIM_FS notations indicate that it's doing the GFP_FS checking that Nick hacked into lockdep a while back: I think we're intended to read that "<Interrupt>" in the DEADLOCK scenario as "<Direct reclaim>". I'm hazy, I have not reached any conclusion as to whether it's right to complain or not; but I believe it's uneasy about kswapd now doing the mutex_lock(&pcpu_alloc_mutex) which lru_add_drain_all() entails. Nor have I reached any conclusion as to whether it's important for kswapd to do that draining or not. But so as not to get blocked on this, with lockdep disabled from giving further reports, here's a patch which removes the lru_add_drain_all() from kswapd's callpath (and calls it only once from compact_nodes(), instead of once per node). Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:33:53 -07:00
return __compact_pgdat(NODE_DATA(nid), &cc);
}
/* Compact all nodes in the system */
static int compact_nodes(void)
{
int nid;
compact_pgdat: workaround lockdep warning in kswapd I get this lockdep warning from swapping load on linux-next, due to "vmscan: kswapd carefully call compaction". ================================= [ INFO: inconsistent lock state ] 3.3.0-rc2-next-20120201 #5 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/28 [HC0[0]:SC0[0]:HE1:SE1] takes: (pcpu_alloc_mutex){+.+.?.}, at: [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 {RECLAIM_FS-ON-W} state was registered at: [<ffffffff81099b75>] mark_held_locks+0xd7/0x103 [<ffffffff8109a13c>] lockdep_trace_alloc+0x85/0x9e [<ffffffff810f6bdc>] __kmalloc+0x6c/0x14b [<ffffffff810d57fd>] pcpu_mem_zalloc+0x59/0x62 [<ffffffff810d5d16>] pcpu_extend_area_map+0x26/0xb1 [<ffffffff810d679f>] pcpu_alloc+0x182/0x325 [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8142ebfd>] snmp_mib_init+0x1e/0x2e [<ffffffff8185cd8d>] ipv4_mib_init_net+0x7a/0x184 [<ffffffff813dc963>] ops_init.clone.0+0x6b/0x73 [<ffffffff813dc9cc>] register_pernet_operations+0x61/0xa0 [<ffffffff813dca8e>] register_pernet_subsys+0x29/0x42 [<ffffffff8185d044>] inet_init+0x1ad/0x252 [<ffffffff810002e3>] do_one_initcall+0x7a/0x12f [<ffffffff81832bc5>] kernel_init+0x9d/0x11e [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 irq event stamp: 656613 hardirqs last enabled at (656613): [<ffffffff814e0ddc>] __mutex_unlock_slowpath+0x104/0x128 hardirqs last disabled at (656612): [<ffffffff814e0d34>] __mutex_unlock_slowpath+0x5c/0x128 softirqs last enabled at (655568): [<ffffffff8105b4a5>] __do_softirq+0x120/0x136 softirqs last disabled at (654757): [<ffffffff814e52dc>] call_softirq+0x1c/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pcpu_alloc_mutex); <Interrupt> lock(pcpu_alloc_mutex); *** DEADLOCK *** no locks held by kswapd0/28. stack backtrace: Pid: 28, comm: kswapd0 Not tainted 3.3.0-rc2-next-20120201 #5 Call Trace: [<ffffffff810981f4>] print_usage_bug+0x1bf/0x1d0 [<ffffffff81096c3e>] ? print_irq_inversion_bug+0x1d9/0x1d9 [<ffffffff810982c0>] mark_lock_irq+0xbb/0x22e [<ffffffff810c5399>] ? free_hot_cold_page+0x13d/0x14f [<ffffffff81098684>] mark_lock+0x251/0x331 [<ffffffff81098893>] mark_irqflags+0x12f/0x141 [<ffffffff81098e32>] __lock_acquire+0x58d/0x753 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81099433>] lock_acquire+0x54/0x6a [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff8107a5b8>] ? add_preempt_count+0xa9/0xae [<ffffffff814e0a21>] mutex_lock_nested+0x5e/0x315 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81098f81>] ? __lock_acquire+0x6dc/0x753 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8106c35e>] schedule_on_each_cpu+0x23/0x110 [<ffffffff810c9fcb>] lru_add_drain_all+0x10/0x12 [<ffffffff810f126f>] __compact_pgdat+0x20/0x182 [<ffffffff810f15c2>] compact_pgdat+0x27/0x29 [<ffffffff810c306b>] ? zone_watermark_ok+0x1a/0x1c [<ffffffff810cdf6f>] balance_pgdat+0x732/0x751 [<ffffffff810ce0ed>] kswapd+0x15f/0x178 [<ffffffff810cdf8e>] ? balance_pgdat+0x751/0x751 [<ffffffff8106fd11>] kthread+0x84/0x8c [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 [<ffffffff810787ed>] ? finish_task_switch+0x85/0xea [<ffffffff814e3861>] ? retint_restore_args+0xe/0xe [<ffffffff8106fc8d>] ? __init_kthread_worker+0x56/0x56 [<ffffffff814e51e0>] ? gs_change+0xb/0xb The RECLAIM_FS notations indicate that it's doing the GFP_FS checking that Nick hacked into lockdep a while back: I think we're intended to read that "<Interrupt>" in the DEADLOCK scenario as "<Direct reclaim>". I'm hazy, I have not reached any conclusion as to whether it's right to complain or not; but I believe it's uneasy about kswapd now doing the mutex_lock(&pcpu_alloc_mutex) which lru_add_drain_all() entails. Nor have I reached any conclusion as to whether it's important for kswapd to do that draining or not. But so as not to get blocked on this, with lockdep disabled from giving further reports, here's a patch which removes the lru_add_drain_all() from kswapd's callpath (and calls it only once from compact_nodes(), instead of once per node). Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:33:53 -07:00
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
for_each_online_node(nid)
compact_node(nid);
return COMPACT_COMPLETE;
}
/* The written value is actually unused, all memory is compacted */
int sysctl_compact_memory;
/* This is the entry point for compacting all nodes via /proc/sys/vm */
int sysctl_compaction_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
if (write)
return compact_nodes();
return 0;
}
int sysctl_extfrag_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec_minmax(table, write, buffer, length, ppos);
return 0;
}
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
compact_pgdat: workaround lockdep warning in kswapd I get this lockdep warning from swapping load on linux-next, due to "vmscan: kswapd carefully call compaction". ================================= [ INFO: inconsistent lock state ] 3.3.0-rc2-next-20120201 #5 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/28 [HC0[0]:SC0[0]:HE1:SE1] takes: (pcpu_alloc_mutex){+.+.?.}, at: [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 {RECLAIM_FS-ON-W} state was registered at: [<ffffffff81099b75>] mark_held_locks+0xd7/0x103 [<ffffffff8109a13c>] lockdep_trace_alloc+0x85/0x9e [<ffffffff810f6bdc>] __kmalloc+0x6c/0x14b [<ffffffff810d57fd>] pcpu_mem_zalloc+0x59/0x62 [<ffffffff810d5d16>] pcpu_extend_area_map+0x26/0xb1 [<ffffffff810d679f>] pcpu_alloc+0x182/0x325 [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8142ebfd>] snmp_mib_init+0x1e/0x2e [<ffffffff8185cd8d>] ipv4_mib_init_net+0x7a/0x184 [<ffffffff813dc963>] ops_init.clone.0+0x6b/0x73 [<ffffffff813dc9cc>] register_pernet_operations+0x61/0xa0 [<ffffffff813dca8e>] register_pernet_subsys+0x29/0x42 [<ffffffff8185d044>] inet_init+0x1ad/0x252 [<ffffffff810002e3>] do_one_initcall+0x7a/0x12f [<ffffffff81832bc5>] kernel_init+0x9d/0x11e [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 irq event stamp: 656613 hardirqs last enabled at (656613): [<ffffffff814e0ddc>] __mutex_unlock_slowpath+0x104/0x128 hardirqs last disabled at (656612): [<ffffffff814e0d34>] __mutex_unlock_slowpath+0x5c/0x128 softirqs last enabled at (655568): [<ffffffff8105b4a5>] __do_softirq+0x120/0x136 softirqs last disabled at (654757): [<ffffffff814e52dc>] call_softirq+0x1c/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(pcpu_alloc_mutex); <Interrupt> lock(pcpu_alloc_mutex); *** DEADLOCK *** no locks held by kswapd0/28. stack backtrace: Pid: 28, comm: kswapd0 Not tainted 3.3.0-rc2-next-20120201 #5 Call Trace: [<ffffffff810981f4>] print_usage_bug+0x1bf/0x1d0 [<ffffffff81096c3e>] ? print_irq_inversion_bug+0x1d9/0x1d9 [<ffffffff810982c0>] mark_lock_irq+0xbb/0x22e [<ffffffff810c5399>] ? free_hot_cold_page+0x13d/0x14f [<ffffffff81098684>] mark_lock+0x251/0x331 [<ffffffff81098893>] mark_irqflags+0x12f/0x141 [<ffffffff81098e32>] __lock_acquire+0x58d/0x753 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81099433>] lock_acquire+0x54/0x6a [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff8107a5b8>] ? add_preempt_count+0xa9/0xae [<ffffffff814e0a21>] mutex_lock_nested+0x5e/0x315 [<ffffffff810d6684>] ? pcpu_alloc+0x67/0x325 [<ffffffff81098f81>] ? __lock_acquire+0x6dc/0x753 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d6684>] pcpu_alloc+0x67/0x325 [<ffffffff810c9fb0>] ? __pagevec_release+0x2c/0x2c [<ffffffff810d694d>] __alloc_percpu+0xb/0xd [<ffffffff8106c35e>] schedule_on_each_cpu+0x23/0x110 [<ffffffff810c9fcb>] lru_add_drain_all+0x10/0x12 [<ffffffff810f126f>] __compact_pgdat+0x20/0x182 [<ffffffff810f15c2>] compact_pgdat+0x27/0x29 [<ffffffff810c306b>] ? zone_watermark_ok+0x1a/0x1c [<ffffffff810cdf6f>] balance_pgdat+0x732/0x751 [<ffffffff810ce0ed>] kswapd+0x15f/0x178 [<ffffffff810cdf8e>] ? balance_pgdat+0x751/0x751 [<ffffffff8106fd11>] kthread+0x84/0x8c [<ffffffff814e51e4>] kernel_thread_helper+0x4/0x10 [<ffffffff810787ed>] ? finish_task_switch+0x85/0xea [<ffffffff814e3861>] ? retint_restore_args+0xe/0xe [<ffffffff8106fc8d>] ? __init_kthread_worker+0x56/0x56 [<ffffffff814e51e0>] ? gs_change+0xb/0xb The RECLAIM_FS notations indicate that it's doing the GFP_FS checking that Nick hacked into lockdep a while back: I think we're intended to read that "<Interrupt>" in the DEADLOCK scenario as "<Direct reclaim>". I'm hazy, I have not reached any conclusion as to whether it's right to complain or not; but I believe it's uneasy about kswapd now doing the mutex_lock(&pcpu_alloc_mutex) which lru_add_drain_all() entails. Nor have I reached any conclusion as to whether it's important for kswapd to do that draining or not. But so as not to get blocked on this, with lockdep disabled from giving further reports, here's a patch which removes the lru_add_drain_all() from kswapd's callpath (and calls it only once from compact_nodes(), instead of once per node). Signed-off-by: Hugh Dickins <hughd@google.com> Cc: Rik van Riel <riel@redhat.com> Acked-by: Mel Gorman <mel@csn.ul.ie> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-21 16:33:53 -07:00
int nid = dev->id;
if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
/* Flush pending updates to the LRU lists */
lru_add_drain_all();
compact_node(nid);
}
return count;
}
static DEVICE_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
int compaction_register_node(struct node *node)
{
return device_create_file(&node->dev, &dev_attr_compact);
}
void compaction_unregister_node(struct node *node)
{
return device_remove_file(&node->dev, &dev_attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */