2005-10-29 18:16:54 -07:00
|
|
|
/*
|
|
|
|
* linux/mm/memory_hotplug.c
|
|
|
|
*
|
|
|
|
* Copyright (C)
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/stddef.h>
|
|
|
|
#include <linux/mm.h>
|
|
|
|
#include <linux/swap.h>
|
|
|
|
#include <linux/interrupt.h>
|
|
|
|
#include <linux/pagemap.h>
|
|
|
|
#include <linux/bootmem.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/pagevec.h>
|
2006-09-29 02:01:25 -07:00
|
|
|
#include <linux/writeback.h>
|
2005-10-29 18:16:54 -07:00
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/sysctl.h>
|
|
|
|
#include <linux/cpu.h>
|
|
|
|
#include <linux/memory.h>
|
|
|
|
#include <linux/memory_hotplug.h>
|
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/vmalloc.h>
|
2006-06-27 02:53:35 -07:00
|
|
|
#include <linux/ioport.h>
|
2007-10-16 01:26:12 -07:00
|
|
|
#include <linux/delay.h>
|
|
|
|
#include <linux/migrate.h>
|
|
|
|
#include <linux/page-isolation.h>
|
2008-10-18 20:25:58 -07:00
|
|
|
#include <linux/pfn.h>
|
2009-11-17 15:06:22 -07:00
|
|
|
#include <linux/suspend.h>
|
2009-12-14 18:58:11 -07:00
|
|
|
#include <linux/mm_inline.h>
|
2010-03-05 14:41:58 -07:00
|
|
|
#include <linux/firmware-map.h>
|
2005-10-29 18:16:54 -07:00
|
|
|
|
|
|
|
#include <asm/tlbflush.h>
|
|
|
|
|
2008-04-28 10:40:08 -07:00
|
|
|
#include "internal.h"
|
|
|
|
|
2006-09-30 23:27:09 -07:00
|
|
|
/* add this memory to iomem resource */
|
|
|
|
static struct resource *register_memory_resource(u64 start, u64 size)
|
|
|
|
{
|
|
|
|
struct resource *res;
|
|
|
|
res = kzalloc(sizeof(struct resource), GFP_KERNEL);
|
|
|
|
BUG_ON(!res);
|
|
|
|
|
|
|
|
res->name = "System RAM";
|
|
|
|
res->start = start;
|
|
|
|
res->end = start + size - 1;
|
2007-11-14 17:59:20 -07:00
|
|
|
res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
|
2006-09-30 23:27:09 -07:00
|
|
|
if (request_resource(&iomem_resource, res) < 0) {
|
|
|
|
printk("System RAM resource %llx - %llx cannot be added\n",
|
|
|
|
(unsigned long long)res->start, (unsigned long long)res->end);
|
|
|
|
kfree(res);
|
|
|
|
res = NULL;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Remove @res from the resource tree with release_resource() and free it.
 * A NULL @res is silently ignored.
 */
static void release_memory_resource(struct resource *res)
{
	if (res) {
		release_resource(res);
		kfree(res);
	}
}
|
|
|
|
|
2006-09-30 23:27:08 -07:00
|
|
|
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
#ifndef CONFIG_SPARSEMEM_VMEMMAP
|
2008-07-23 21:28:17 -07:00
|
|
|
static void get_page_bootmem(unsigned long info, struct page *page, int type)
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
{
|
2008-07-23 21:28:17 -07:00
|
|
|
atomic_set(&page->_mapcount, type);
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
SetPagePrivate(page);
|
|
|
|
set_page_private(page, info);
|
|
|
|
atomic_inc(&page->_count);
|
|
|
|
}
|
|
|
|
|
2009-12-14 18:59:44 -07:00
|
|
|
/* reference to __meminit __free_pages_bootmem is valid
|
|
|
|
* so use __ref to tell modpost not to generate a warning */
|
|
|
|
void __ref put_page_bootmem(struct page *page)
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
{
|
2008-07-23 21:28:17 -07:00
|
|
|
int type;
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
|
2008-07-23 21:28:17 -07:00
|
|
|
type = atomic_read(&page->_mapcount);
|
|
|
|
BUG_ON(type >= -1);
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
|
|
|
|
if (atomic_dec_return(&page->_count) == 1) {
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
set_page_private(page, 0);
|
|
|
|
reset_page_mapcount(page);
|
|
|
|
__free_pages_bootmem(page, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2008-07-23 21:28:12 -07:00
|
|
|
static void register_page_bootmem_info_section(unsigned long start_pfn)
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
{
|
|
|
|
unsigned long *usemap, mapsize, section_nr, i;
|
|
|
|
struct mem_section *ms;
|
|
|
|
struct page *page, *memmap;
|
|
|
|
|
|
|
|
if (!pfn_valid(start_pfn))
|
|
|
|
return;
|
|
|
|
|
|
|
|
section_nr = pfn_to_section_nr(start_pfn);
|
|
|
|
ms = __nr_to_section(section_nr);
|
|
|
|
|
|
|
|
/* Get section's memmap address */
|
|
|
|
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Get page for the memmap's phys address
|
|
|
|
* XXX: need more consideration for sparse_vmemmap...
|
|
|
|
*/
|
|
|
|
page = virt_to_page(memmap);
|
|
|
|
mapsize = sizeof(struct page) * PAGES_PER_SECTION;
|
|
|
|
mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
/* remember memmap's page */
|
|
|
|
for (i = 0; i < mapsize; i++, page++)
|
|
|
|
get_page_bootmem(section_nr, page, SECTION_INFO);
|
|
|
|
|
|
|
|
usemap = __nr_to_section(section_nr)->pageblock_flags;
|
|
|
|
page = virt_to_page(usemap);
|
|
|
|
|
|
|
|
mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
for (i = 0; i < mapsize; i++, page++)
|
2008-07-23 21:28:17 -07:00
|
|
|
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
|
memory hotplug: register section/node id to free
This patch set is to free pages which is allocated by bootmem for
memory-hotremove. Some structures of memory management are allocated by
bootmem. ex) memmap, etc.
To remove memory physically, some of them must be freed according to
circumstance. This patch set makes basis to free those pages, and free
memmaps.
Basic my idea is using remain members of struct page to remember information
of users of bootmem (section number or node id). When the section is
removing, kernel can confirm it. By this information, some issues can be
solved.
1) When the memmap of removing section is allocated on other
section by bootmem, it should/can be free.
2) When the memmap of removing section is allocated on the
same section, it shouldn't be freed. Because the section has to be
logical memory offlined already and all pages must be isolated against
page allocater. If it is freed, page allocator may use it which will
be removed physically soon.
3) When removing section has other section's memmap,
kernel will be able to show easily which section should be removed
before it for user. (Not implemented yet)
4) When the above case 2), the page isolation will be able to check and skip
memmap's page when logical memory offline (offline_pages()).
Current page isolation code fails in this case because this page is
just reserved page and it can't distinguish this pages can be
removed or not. But, it will be able to do by this patch.
(Not implemented yet.)
5) The node information like pgdat has similar issues. But, this
will be able to be solved too by this.
(Not implemented yet, but, remembering node id in the pages.)
Fortunately, current bootmem allocator just keeps PageReserved flags,
and doesn't use any other members of page struct. The users of
bootmem doesn't use them too.
This patch:
This is to register information which is node or section's id. Kernel can
distinguish which node/section uses the pages allcated by bootmem. This is
basis for hot-remove sections or nodes.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-28 02:13:31 -07:00
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
void register_page_bootmem_info_node(struct pglist_data *pgdat)
|
|
|
|
{
|
|
|
|
unsigned long i, pfn, end_pfn, nr_pages;
|
|
|
|
int node = pgdat->node_id;
|
|
|
|
struct page *page;
|
|
|
|
struct zone *zone;
|
|
|
|
|
|
|
|
nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
|
|
|
|
page = virt_to_page(pgdat);
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++, page++)
|
|
|
|
get_page_bootmem(node, page, NODE_INFO);
|
|
|
|
|
|
|
|
zone = &pgdat->node_zones[0];
|
|
|
|
for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
|
|
|
|
if (zone->wait_table) {
|
|
|
|
nr_pages = zone->wait_table_hash_nr_entries
|
|
|
|
* sizeof(wait_queue_head_t);
|
|
|
|
nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
|
|
|
|
page = virt_to_page(zone->wait_table);
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++, page++)
|
|
|
|
get_page_bootmem(node, page, NODE_INFO);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pfn = pgdat->node_start_pfn;
|
|
|
|
end_pfn = pfn + pgdat->node_spanned_pages;
|
|
|
|
|
|
|
|
/* register_section info */
|
|
|
|
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
|
|
|
|
register_page_bootmem_info_section(pfn);
|
|
|
|
|
|
|
|
}
|
|
|
|
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
|
|
|
|
|
2008-05-14 16:05:52 -07:00
|
|
|
static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
unsigned long old_zone_end_pfn;
|
|
|
|
|
|
|
|
zone_span_writelock(zone);
|
|
|
|
|
|
|
|
old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
|
|
|
|
if (start_pfn < zone->zone_start_pfn)
|
|
|
|
zone->zone_start_pfn = start_pfn;
|
|
|
|
|
|
|
|
zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
|
|
|
|
zone->zone_start_pfn;
|
|
|
|
|
|
|
|
zone_span_writeunlock(zone);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
|
|
|
|
unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
unsigned long old_pgdat_end_pfn =
|
|
|
|
pgdat->node_start_pfn + pgdat->node_spanned_pages;
|
|
|
|
|
|
|
|
if (start_pfn < pgdat->node_start_pfn)
|
|
|
|
pgdat->node_start_pfn = start_pfn;
|
|
|
|
|
|
|
|
pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
|
|
|
|
pgdat->node_start_pfn;
|
|
|
|
}
|
|
|
|
|
2008-11-22 10:33:24 -07:00
|
|
|
/*
 * Make @zone and its node span the section that starts at @phys_start_pfn
 * and initialise that section's memmap.
 *
 * A zone that has no wait table yet is first set up with
 * init_currently_empty_zone(); a non-zero result from that call is
 * returned unchanged.  Returns 0 otherwise.
 */
static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *pgdat = zone->zone_pgdat;
	int section_pages = PAGES_PER_SECTION;
	int nid = pgdat->node_id;
	int zone_type = zone - pgdat->node_zones;	/* index in node_zones[] */
	unsigned long flags;

	if (!zone->wait_table) {
		int ret = init_currently_empty_zone(zone, phys_start_pfn,
						    section_pages,
						    MEMMAP_HOTPLUG);

		if (ret)
			return ret;
	}

	/* both spans are grown under the pgdat resize lock */
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, phys_start_pfn, phys_start_pfn + section_pages);
	grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
			phys_start_pfn + section_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	memmap_init_zone(section_pages, nid, zone_type,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}
|
|
|
|
|
2009-01-06 15:39:14 -07:00
|
|
|
/*
 * Add the single section starting at @phys_start_pfn to @zone and register
 * it as new memory for node @nid.
 *
 * Returns -EEXIST if @phys_start_pfn is already valid, a negative value
 * from sparse_add_one_section() or __add_zone() if either fails, and
 * otherwise whatever register_new_memory() returns.
 */
static int __meminit __add_section(int nid, struct zone *zone,
				   unsigned long phys_start_pfn)
{
	int rc;

	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	rc = sparse_add_one_section(zone, phys_start_pfn, PAGES_PER_SECTION);
	if (rc < 0)
		return rc;

	rc = __add_zone(zone, phys_start_pfn);
	if (rc < 0)
		return rc;

	return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
}
|
|
|
|
|
2008-04-28 02:13:34 -07:00
|
|
|
#ifdef CONFIG_SPARSEMEM_VMEMMAP
|
|
|
|
static int __remove_section(struct zone *zone, struct mem_section *ms)
|
|
|
|
{
|
|
|
|
/*
|
|
|
|
* XXX: Freeing memmap with vmemmap is not implement yet.
|
|
|
|
* This should be removed later.
|
|
|
|
*/
|
|
|
|
return -EBUSY;
|
|
|
|
}
|
|
|
|
#else
|
2008-04-28 02:12:01 -07:00
|
|
|
static int __remove_section(struct zone *zone, struct mem_section *ms)
|
|
|
|
{
|
|
|
|
unsigned long flags;
|
|
|
|
struct pglist_data *pgdat = zone->zone_pgdat;
|
|
|
|
int ret = -EINVAL;
|
|
|
|
|
|
|
|
if (!valid_section(ms))
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
ret = unregister_memory_section(ms);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
pgdat_resize_lock(pgdat, &flags);
|
|
|
|
sparse_remove_one_section(zone, ms);
|
|
|
|
pgdat_resize_unlock(pgdat, &flags);
|
|
|
|
return 0;
|
|
|
|
}
|
2008-04-28 02:13:34 -07:00
|
|
|
#endif
|
2008-04-28 02:12:01 -07:00
|
|
|
|
2005-10-29 18:16:54 -07:00
|
|
|
/*
|
|
|
|
* Reasonably generic function for adding memory. It is
|
|
|
|
* expected that archs that support memory hotplug will
|
|
|
|
* call this function after deciding the zone to which to
|
|
|
|
* add the new pages.
|
|
|
|
*/
|
2009-01-06 15:39:14 -07:00
|
|
|
int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
|
|
|
|
unsigned long nr_pages)
|
2005-10-29 18:16:54 -07:00
|
|
|
{
|
|
|
|
unsigned long i;
|
|
|
|
int err = 0;
|
2006-08-05 12:14:58 -07:00
|
|
|
int start_sec, end_sec;
|
|
|
|
/* during initialize mem_map, align hot-added range to section */
|
|
|
|
start_sec = pfn_to_section_nr(phys_start_pfn);
|
|
|
|
end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
|
2005-10-29 18:16:54 -07:00
|
|
|
|
2006-08-05 12:14:58 -07:00
|
|
|
for (i = start_sec; i <= end_sec; i++) {
|
2009-01-06 15:39:14 -07:00
|
|
|
err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
|
2005-10-29 18:16:54 -07:00
|
|
|
|
2006-08-05 12:14:58 -07:00
|
|
|
/*
|
2007-10-19 16:27:18 -07:00
|
|
|
* EEXIST is finally dealt with by ioresource collision
|
2006-08-05 12:14:58 -07:00
|
|
|
* check. see add_memory() => register_memory_resource()
|
|
|
|
* Warning will be printed if there is collision.
|
2006-05-01 12:16:11 -07:00
|
|
|
*/
|
|
|
|
if (err && (err != -EEXIST))
|
2005-10-29 18:16:54 -07:00
|
|
|
break;
|
2006-08-05 12:14:58 -07:00
|
|
|
err = 0;
|
2005-10-29 18:16:54 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
2006-05-01 12:16:11 -07:00
|
|
|
EXPORT_SYMBOL_GPL(__add_pages);
|
2005-10-29 18:16:54 -07:00
|
|
|
|
2008-04-28 02:12:01 -07:00
|
|
|
/**
|
|
|
|
* __remove_pages() - remove sections of pages from a zone
|
|
|
|
* @zone: zone from which pages need to be removed
|
|
|
|
* @phys_start_pfn: starting pageframe (must be aligned to start of a section)
|
|
|
|
* @nr_pages: number of pages to remove (must be multiple of section size)
|
|
|
|
*
|
|
|
|
* Generic helper function to remove section mappings and sysfs entries
|
|
|
|
* for the section of the memory we are removing. Caller needs to make
|
|
|
|
* sure that pages are marked reserved and zones are adjust properly by
|
|
|
|
* calling offline_pages().
|
|
|
|
*/
|
|
|
|
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
|
|
|
|
unsigned long nr_pages)
|
|
|
|
{
|
|
|
|
unsigned long i, ret = 0;
|
|
|
|
int sections_to_remove;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We can only remove entire sections
|
|
|
|
*/
|
|
|
|
BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
|
|
|
|
BUG_ON(nr_pages % PAGES_PER_SECTION);
|
|
|
|
|
|
|
|
sections_to_remove = nr_pages / PAGES_PER_SECTION;
|
|
|
|
for (i = 0; i < sections_to_remove; i++) {
|
|
|
|
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
|
2008-10-18 20:27:14 -07:00
|
|
|
release_mem_region(pfn << PAGE_SHIFT,
|
|
|
|
PAGES_PER_SECTION << PAGE_SHIFT);
|
2008-04-28 02:12:01 -07:00
|
|
|
ret = __remove_section(zone, __pfn_to_section(pfn));
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(__remove_pages);
|
|
|
|
|
2008-04-28 02:12:03 -07:00
|
|
|
void online_page(struct page *page)
|
|
|
|
{
|
2009-09-21 17:03:03 -07:00
|
|
|
unsigned long pfn = page_to_pfn(page);
|
|
|
|
|
2008-04-28 02:12:03 -07:00
|
|
|
totalram_pages++;
|
2009-09-21 17:03:03 -07:00
|
|
|
if (pfn >= num_physpages)
|
|
|
|
num_physpages = pfn + 1;
|
2008-04-28 02:12:03 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_HIGHMEM
|
|
|
|
if (PageHighMem(page))
|
|
|
|
totalhigh_pages++;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_FLATMEM
|
|
|
|
max_mapnr = max(page_to_pfn(page), max_mapnr);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
ClearPageReserved(page);
|
|
|
|
init_page_count(page);
|
|
|
|
__free_page(page);
|
|
|
|
}
|
|
|
|
|
2007-10-16 01:26:10 -07:00
|
|
|
/*
 * walk_system_ram_range() callback used by online_pages().
 *
 * @arg points at an unsigned long running total of onlined pages. If the
 * first page of the range is reserved, every page in the range is passed
 * to online_page() and the total grows by @nr_pages; otherwise the range
 * is left alone. Always returns 0.
 */
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long *total = arg;
	unsigned long offset;

	/* only the first page's reserved flag decides for the whole range */
	if (!PageReserved(pfn_to_page(start_pfn)))
		return 0;

	for (offset = 0; offset < nr_pages; offset++)
		online_page(pfn_to_page(start_pfn + offset));

	*total += nr_pages;
	return 0;
}
|
|
|
|
|
|
|
|
|
|
|
|
int online_pages(unsigned long pfn, unsigned long nr_pages)
|
|
|
|
{
|
2005-10-29 18:16:54 -07:00
|
|
|
unsigned long onlined_pages = 0;
|
|
|
|
struct zone *zone;
|
2006-06-23 02:03:11 -07:00
|
|
|
int need_zonelists_rebuild = 0;
|
2007-10-21 16:41:36 -07:00
|
|
|
int nid;
|
|
|
|
int ret;
|
|
|
|
struct memory_notify arg;
|
|
|
|
|
|
|
|
arg.start_pfn = pfn;
|
|
|
|
arg.nr_pages = nr_pages;
|
|
|
|
arg.status_change_nid = -1;
|
|
|
|
|
|
|
|
nid = page_to_nid(pfn_to_page(pfn));
|
|
|
|
if (node_present_pages(nid) == 0)
|
|
|
|
arg.status_change_nid = nid;
|
2005-10-29 18:16:54 -07:00
|
|
|
|
2007-10-21 16:41:36 -07:00
|
|
|
ret = memory_notify(MEM_GOING_ONLINE, &arg);
|
|
|
|
ret = notifier_to_errno(ret);
|
|
|
|
if (ret) {
|
|
|
|
memory_notify(MEM_CANCEL_ONLINE, &arg);
|
|
|
|
return ret;
|
|
|
|
}
|
2005-10-29 18:16:54 -07:00
|
|
|
/*
|
|
|
|
* This doesn't need a lock to do pfn_to_page().
|
|
|
|
* The section can't be removed here because of the
|
2008-02-05 00:35:47 -07:00
|
|
|
* memory_block->state_mutex.
|
2005-10-29 18:16:54 -07:00
|
|
|
*/
|
|
|
|
zone = page_zone(pfn_to_page(pfn));
|
2006-06-23 02:03:11 -07:00
|
|
|
/*
|
|
|
|
* If this zone is not populated, then it is not in zonelist.
|
|
|
|
* This means the page allocator ignores this zone.
|
|
|
|
* So, zonelist must be updated after online.
|
|
|
|
*/
|
2010-05-24 14:32:52 -07:00
|
|
|
mutex_lock(&zonelists_mutex);
|
2006-06-23 02:03:11 -07:00
|
|
|
if (!populated_zone(zone))
|
|
|
|
need_zonelists_rebuild = 1;
|
|
|
|
|
2009-09-22 16:45:46 -07:00
|
|
|
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
|
2007-10-16 01:26:10 -07:00
|
|
|
online_pages_range);
|
2008-05-14 16:05:50 -07:00
|
|
|
if (ret) {
|
2010-05-24 14:32:52 -07:00
|
|
|
mutex_unlock(&zonelists_mutex);
|
2008-05-14 16:05:50 -07:00
|
|
|
printk(KERN_DEBUG "online_pages %lx at %lx failed\n",
|
|
|
|
nr_pages, pfn);
|
|
|
|
memory_notify(MEM_CANCEL_ONLINE, &arg);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-10-29 18:16:54 -07:00
|
|
|
zone->present_pages += onlined_pages;
|
2006-03-09 18:33:51 -07:00
|
|
|
zone->zone_pgdat->node_present_pages += onlined_pages;
|
2010-05-24 14:32:51 -07:00
|
|
|
if (need_zonelists_rebuild)
|
|
|
|
build_all_zonelists(zone);
|
|
|
|
else
|
|
|
|
zone_pcp_update(zone);
|
2005-10-29 18:16:54 -07:00
|
|
|
|
2010-05-24 14:32:52 -07:00
|
|
|
mutex_unlock(&zonelists_mutex);
|
2009-06-16 15:32:48 -07:00
|
|
|
setup_per_zone_wmarks();
|
2009-06-16 15:32:50 -07:00
|
|
|
calculate_zone_inactive_ratio(zone);
|
2007-10-16 01:25:29 -07:00
|
|
|
if (onlined_pages) {
|
|
|
|
kswapd_run(zone_to_nid(zone));
|
|
|
|
node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
|
|
|
|
}
|
2005-10-29 18:16:56 -07:00
|
|
|
|
2010-05-24 14:32:51 -07:00
|
|
|
vm_total_pages = nr_free_pagecache_pages();
|
2008-07-23 21:28:18 -07:00
|
|
|
|
2006-09-29 02:01:25 -07:00
|
|
|
writeback_set_ratelimit();
|
2007-10-21 16:41:36 -07:00
|
|
|
|
|
|
|
if (onlined_pages)
|
|
|
|
memory_notify(MEM_ONLINE, &arg);
|
|
|
|
|
2005-10-29 18:16:54 -07:00
|
|
|
return 0;
|
|
|
|
}
|
2006-09-30 23:27:08 -07:00
|
|
|
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
|
2006-06-27 02:53:30 -07:00
|
|
|
|
2009-11-17 15:06:18 -07:00
|
|
|
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
|
|
|
|
static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
{
|
|
|
|
struct pglist_data *pgdat;
|
|
|
|
unsigned long zones_size[MAX_NR_ZONES] = {0};
|
|
|
|
unsigned long zholes_size[MAX_NR_ZONES] = {0};
|
|
|
|
unsigned long start_pfn = start >> PAGE_SHIFT;
|
|
|
|
|
|
|
|
pgdat = arch_alloc_nodedata(nid);
|
|
|
|
if (!pgdat)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
arch_refresh_nodedata(nid, pgdat);
|
|
|
|
|
|
|
|
/* we can use NODE_DATA(nid) from here */
|
|
|
|
|
|
|
|
/* init node's zones as empty zones, we don't have any present pages.*/
|
2008-07-23 21:27:20 -07:00
|
|
|
free_area_init_node(nid, zones_size, start_pfn, zholes_size);
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
|
|
|
|
return pgdat;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Undo hotadd_new_pgdat(): clear the node's data pointer, then free the
 * pgdat.
 */
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
}
|
|
|
|
|
2006-06-27 02:53:35 -07:00
|
|
|
|
2010-05-24 14:32:41 -07:00
|
|
|
/*
|
|
|
|
* called by cpu_up() to online a node without onlined memory.
|
|
|
|
*/
|
|
|
|
int mem_online_node(int nid)
|
|
|
|
{
|
|
|
|
pg_data_t *pgdat;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
lock_system_sleep();
|
|
|
|
pgdat = hotadd_new_pgdat(nid, 0);
|
|
|
|
if (pgdat) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
node_set_online(nid);
|
|
|
|
ret = register_one_node(nid);
|
|
|
|
BUG_ON(ret);
|
|
|
|
|
|
|
|
out:
|
|
|
|
unlock_system_sleep();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2008-11-22 10:33:24 -07:00
|
|
|
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
|
|
|
|
int __ref add_memory(int nid, u64 start, u64 size)
|
2006-06-27 02:53:30 -07:00
|
|
|
{
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
pg_data_t *pgdat = NULL;
|
|
|
|
int new_pgdat = 0;
|
2006-08-05 12:15:06 -07:00
|
|
|
struct resource *res;
|
2006-06-27 02:53:30 -07:00
|
|
|
int ret;
|
|
|
|
|
2009-11-17 15:06:22 -07:00
|
|
|
lock_system_sleep();
|
|
|
|
|
2006-08-05 12:15:06 -07:00
|
|
|
res = register_memory_resource(start, size);
|
2009-11-17 15:06:22 -07:00
|
|
|
ret = -EEXIST;
|
2006-08-05 12:15:06 -07:00
|
|
|
if (!res)
|
2009-11-17 15:06:22 -07:00
|
|
|
goto out;
|
2006-08-05 12:15:06 -07:00
|
|
|
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
if (!node_online(nid)) {
|
|
|
|
pgdat = hotadd_new_pgdat(nid, start);
|
2009-11-17 15:06:22 -07:00
|
|
|
ret = -ENOMEM;
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
if (!pgdat)
|
2009-11-17 15:06:22 -07:00
|
|
|
goto out;
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
new_pgdat = 1;
|
|
|
|
}
|
|
|
|
|
2006-06-27 02:53:30 -07:00
|
|
|
/* call arch's memory hotadd */
|
|
|
|
ret = arch_add_memory(nid, start, size);
|
|
|
|
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
if (ret < 0)
|
|
|
|
goto error;
|
|
|
|
|
2006-06-27 02:53:38 -07:00
|
|
|
/* we online node here. we can't roll back from here. */
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
node_set_online(nid);
|
|
|
|
|
2006-06-27 02:53:38 -07:00
|
|
|
if (new_pgdat) {
|
|
|
|
ret = register_one_node(nid);
|
|
|
|
/*
|
|
|
|
* If sysfs file of new node can't create, cpu on the node
|
|
|
|
* can't be hot-added. There is no rollback way now.
|
|
|
|
* So, check by BUG_ON() to catch it reluctantly..
|
|
|
|
*/
|
|
|
|
BUG_ON(ret);
|
|
|
|
}
|
|
|
|
|
2010-03-05 14:41:58 -07:00
|
|
|
/* create new memmap entry */
|
|
|
|
firmware_map_add_hotplug(start, start + size, "System RAM");
|
|
|
|
|
2009-11-17 15:06:22 -07:00
|
|
|
goto out;
|
|
|
|
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
error:
|
|
|
|
/* rollback pgdat allocation and others */
|
|
|
|
if (new_pgdat)
|
|
|
|
rollback_node_hotadd(nid, pgdat);
|
2006-08-05 12:15:06 -07:00
|
|
|
if (res)
|
|
|
|
release_memory_resource(res);
|
[PATCH] pgdat allocation for new node add (call pgdat allocation)
Add node-hot-add support to add_memory().
node hotadd uses this sequence.
1. allocate pgdat.
2. refresh NODE_DATA()
3. call free_area_init_node() to initialize
4. create sysfs entry
5. add memory (old add_memory())
6. set node online
7. run kswapd for new node.
(8). update zonelist after pages are onlined. (This is already merged in -mm
due to update phase is difference.)
Note:
To make common function as much as possible,
there is 2 changes from v2.
- The old add_memory(), which is defiend by each archs,
is renamed to arch_add_memory(). New add_memory becomes
caller of arch dependent function as a common code.
- This patch changes add_memory()'s interface
From: add_memory(start, end)
TO : add_memory(nid, start, end).
It was cause of similar code that finding node id from
physical address is inside of old add_memory() on each arch.
In addition, acpi memory hotplug driver can find node id easier.
In v2, it must walk DSDT'S _CRS by matching physical address to
get the handle of its memory device, then get _PXM and node id.
Because input is just physical address.
However, in v3, the acpi driver can use handle to get _PXM and node id
for the new memory device. It can pass just node id to add_memory().
Fix interface of arch_add_memory() is in next patche.
Signed-off-by: Yasunori Goto <y-goto@jp.fujitsu.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: "Brown, Len" <len.brown@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 02:53:34 -07:00
|
|
|
|
2009-11-17 15:06:22 -07:00
|
|
|
out:
|
|
|
|
unlock_system_sleep();
|
2006-06-27 02:53:30 -07:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL_GPL(add_memory);
|
2007-10-16 01:26:12 -07:00
|
|
|
|
|
|
|
#ifdef CONFIG_MEMORY_HOTREMOVE
|
2008-07-23 21:28:19 -07:00
|
|
|
/*
|
|
|
|
* A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
|
|
|
|
* set and the size of the free page is given by page_order(). Using this,
|
|
|
|
* the function determines if the pageblock contains only free pages.
|
|
|
|
* Due to buddy contraints, a free page at least the size of a pageblock will
|
|
|
|
* be located at the start of the pageblock
|
|
|
|
*/
|
|
|
|
static inline int pageblock_free(struct page *page)
|
|
|
|
{
|
|
|
|
return PageBuddy(page) && page_order(page) >= pageblock_order;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Return the start of the next active pageblock after a given page */
|
|
|
|
static struct page *next_active_pageblock(struct page *page)
|
|
|
|
{
|
|
|
|
/* Ensure the starting page is pageblock-aligned */
|
|
|
|
BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
|
|
|
|
|
|
|
|
/* If the entire pageblock is free, move to the end of free page */
|
2010-09-09 16:38:01 -07:00
|
|
|
if (pageblock_free(page)) {
|
|
|
|
int order;
|
|
|
|
/* be careful. we don't have locks, page_order can be changed.*/
|
|
|
|
order = page_order(page);
|
|
|
|
if ((order < MAX_ORDER) && (order >= pageblock_order))
|
|
|
|
return page + (1 << order);
|
|
|
|
}
|
2008-07-23 21:28:19 -07:00
|
|
|
|
2010-09-09 16:38:01 -07:00
|
|
|
return page + pageblock_nr_pages;
|
2008-07-23 21:28:19 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Checks if this range of memory is likely to be hot-removable. */
|
|
|
|
int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
|
|
|
|
{
|
|
|
|
struct page *page = pfn_to_page(start_pfn);
|
|
|
|
struct page *end_page = page + nr_pages;
|
|
|
|
|
|
|
|
/* Check the starting page of each pageblock within the range */
|
|
|
|
for (; page < end_page; page = next_active_pageblock(page)) {
|
2010-10-26 14:21:30 -07:00
|
|
|
if (!is_pageblock_removable_nolock(page))
|
2008-07-23 21:28:19 -07:00
|
|
|
return 0;
|
2010-10-26 14:21:30 -07:00
|
|
|
cond_resched();
|
2008-07-23 21:28:19 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
/* All pageblocks in the memory block are likely to be hot-removable */
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
2007-10-16 01:26:12 -07:00
|
|
|
/*
|
|
|
|
* Confirm all pages in a range [start, end) is belongs to the same zone.
|
|
|
|
*/
|
|
|
|
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
unsigned long pfn;
|
|
|
|
struct zone *zone = NULL;
|
|
|
|
struct page *page;
|
|
|
|
int i;
|
|
|
|
for (pfn = start_pfn;
|
|
|
|
pfn < end_pfn;
|
|
|
|
pfn += MAX_ORDER_NR_PAGES) {
|
|
|
|
i = 0;
|
|
|
|
/* This is just a CONFIG_HOLES_IN_ZONE check.*/
|
|
|
|
while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
|
|
|
|
i++;
|
|
|
|
if (i == MAX_ORDER_NR_PAGES)
|
|
|
|
continue;
|
|
|
|
page = pfn_to_page(pfn + i);
|
|
|
|
if (zone && page_zone(page) != zone)
|
|
|
|
return 0;
|
|
|
|
zone = page_zone(page);
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Scanning pfn is much easier than scanning lru list.
 * Scan pfns in [start, end) and return the first one whose page is on an
 * LRU list, or 0 if there is none.
 */
static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;

	for (pfn = start; pfn < end; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		if (PageLRU(pfn_to_page(pfn)))
			return pfn;
	}
	return 0;
}
|
|
|
|
|
|
|
|
static struct page *
|
2009-01-06 15:39:23 -07:00
|
|
|
hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
|
2007-10-16 01:26:12 -07:00
|
|
|
{
|
2009-01-06 15:39:23 -07:00
|
|
|
/* This should be improooooved!! */
|
|
|
|
return alloc_page(GFP_HIGHUSER_MOVABLE);
|
2007-10-16 01:26:12 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
#define NR_OFFLINE_AT_ONCE_PAGES (256)
|
|
|
|
static int
|
|
|
|
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
unsigned long pfn;
|
|
|
|
struct page *page;
|
|
|
|
int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
|
|
|
|
int not_managed = 0;
|
|
|
|
int ret = 0;
|
|
|
|
LIST_HEAD(source);
|
|
|
|
|
|
|
|
for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
|
|
|
|
if (!pfn_valid(pfn))
|
|
|
|
continue;
|
|
|
|
page = pfn_to_page(pfn);
|
|
|
|
if (!page_count(page))
|
|
|
|
continue;
|
|
|
|
/*
|
|
|
|
* We can skip free pages. And we can only deal with pages on
|
|
|
|
* LRU.
|
|
|
|
*/
|
vmscan: move isolate_lru_page() to vmscan.c
On large memory systems, the VM can spend way too much time scanning
through pages that it cannot (or should not) evict from memory. Not only
does it use up CPU time, but it also provokes lock contention and can
leave large systems under memory pressure in a catatonic state.
This patch series improves VM scalability by:
1) putting filesystem backed, swap backed and unevictable pages
onto their own LRUs, so the system only scans the pages that it
can/should evict from memory
2) switching to two handed clock replacement for the anonymous LRUs,
so the number of pages that need to be scanned when the system
starts swapping is bound to a reasonable number
3) keeping unevictable pages off the LRU completely, so the
VM does not waste CPU time scanning them. ramfs, ramdisk,
SHM_LOCKED shared memory segments and mlock()ed VMA pages
are kept on the unevictable list.
This patch:
isolate_lru_page logically belongs to be in vmscan.c than migrate.c.
It is tough, because we don't need that function without memory migration
so there is a valid argument to have it in migrate.c. However a
subsequent patch needs to make use of it in the core mm, so we can happily
move it to vmscan.c.
Also, make the function a little more generic by not requiring that it
adds an isolated page to a given list. Callers can do that.
Note that we now have '__isolate_lru_page()', that does
something quite different, visible outside of vmscan.c
for use with memory controller. Methinks we need to
rationalize these names/purposes. --lts
[akpm@linux-foundation.org: fix mm/memory_hotplug.c build]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-18 20:26:09 -07:00
|
|
|
ret = isolate_lru_page(page);
|
2007-10-16 01:26:12 -07:00
|
|
|
if (!ret) { /* Success */
|
vmscan: move isolate_lru_page() to vmscan.c
On large memory systems, the VM can spend way too much time scanning
through pages that it cannot (or should not) evict from memory. Not only
does it use up CPU time, but it also provokes lock contention and can
leave large systems under memory presure in a catatonic state.
This patch series improves VM scalability by:
1) putting filesystem backed, swap backed and unevictable pages
onto their own LRUs, so the system only scans the pages that it
can/should evict from memory
2) switching to two handed clock replacement for the anonymous LRUs,
so the number of pages that need to be scanned when the system
starts swapping is bound to a reasonable number
3) keeping unevictable pages off the LRU completely, so the
VM does not waste CPU time scanning them. ramfs, ramdisk,
SHM_LOCKED shared memory segments and mlock()ed VMA pages
are keept on the unevictable list.
This patch:
isolate_lru_page logically belongs to be in vmscan.c than migrate.c.
It is tough, because we don't need that function without memory migration
so there is a valid argument to have it in migrate.c. However a
subsequent patch needs to make use of it in the core mm, so we can happily
move it to vmscan.c.
Also, make the function a little more generic by not requiring that it
adds an isolated page to a given list. Callers can do that.
Note that we now have '__isolate_lru_page()', that does
something quite different, visible outside of vmscan.c
for use with memory controller. Methinks we need to
rationalize these names/purposes. --lts
[akpm@linux-foundation.org: fix mm/memory_hotplug.c build]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-18 20:26:09 -07:00
|
|
|
list_add_tail(&page->lru, &source);
|
2007-10-16 01:26:12 -07:00
|
|
|
move_pages--;
|
2009-12-14 18:58:11 -07:00
|
|
|
inc_zone_page_state(page, NR_ISOLATED_ANON +
|
|
|
|
page_is_file_cache(page));
|
|
|
|
|
2007-10-16 01:26:12 -07:00
|
|
|
} else {
|
|
|
|
#ifdef CONFIG_DEBUG_VM
|
2010-03-10 16:20:43 -07:00
|
|
|
printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
|
|
|
|
pfn);
|
|
|
|
dump_page(page);
|
2007-10-16 01:26:12 -07:00
|
|
|
#endif
|
2010-10-26 14:22:10 -07:00
|
|
|
/* Because we don't have big zone->lock, we should
|
|
|
|
check this again here. */
|
|
|
|
if (page_count(page)) {
|
|
|
|
not_managed++;
|
|
|
|
break;
|
|
|
|
}
|
2007-10-16 01:26:12 -07:00
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = -EBUSY;
|
|
|
|
if (not_managed) {
|
|
|
|
if (!list_empty(&source))
|
|
|
|
putback_lru_pages(&source);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
if (list_empty(&source))
|
|
|
|
goto out;
|
|
|
|
/* this function returns # of failed pages */
|
ksm: memory hotremove migration only
The previous patch enables page migration of ksm pages, but that soon gets
into trouble: not surprising, since we're using the ksm page lock to lock
operations on its stable_node, but page migration switches the page whose
lock is to be used for that. Another layer of locking would fix it, but
do we need that yet?
Do we actually need page migration of ksm pages? Yes, memory hotremove
needs to offline sections of memory: and since we stopped allocating ksm
pages with GFP_HIGHUSER, they will tend to be GFP_HIGHUSER_MOVABLE
candidates for migration.
But KSM is currently unconscious of NUMA issues, happily merging pages
from different NUMA nodes: at present the rule must be, not to use
MADV_MERGEABLE where you care about NUMA. So no, NUMA page migration of
ksm pages does not make sense yet.
So, to complete support for ksm swapping we need to make hotremove safe.
ksm_memory_callback() takes ksm_thread_mutex when MEM_GOING_OFFLINE and
releases it when MEM_OFFLINE or MEM_CANCEL_OFFLINE. But if mapped pages
are freed before migration reaches them, stable_nodes may be left still
pointing to struct pages which have been removed from the system: the
stable_node needs to identify a page by pfn rather than page pointer, then
it can safely prune them when MEM_OFFLINE.
And make NUMA migration skip PageKsm pages where it skips PageReserved.
But it's only when we reach unmap_and_move() that the page lock is taken
and we can be sure that raised pagecount has prevented a PageAnon from
being upgraded: so add offlining arg to migrate_pages(), to migrate ksm
page when offlining (has sufficient locking) but reject it otherwise.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Izik Eidus <ieidus@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Chris Wright <chrisw@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-14 18:59:33 -07:00
|
|
|
ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1);
|
2010-10-26 14:21:29 -07:00
|
|
|
if (ret)
|
|
|
|
putback_lru_pages(&source);
|
2007-10-16 01:26:12 -07:00
|
|
|
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * walk_system_ram_range() callback used by offline_isolated_pages().
 *
 * Passes the pfn range [start, start + nr_pages) to
 * __offline_isolated_pages(), which removes the pages from free_area[]
 * and marks them all as Reserved.
 *
 * @data is not used.  Always returns 0.
 */
static int offline_isolated_pages_cb(unsigned long start,
				     unsigned long nr_pages, void *data)
{
	unsigned long end = start + nr_pages;

	__offline_isolated_pages(start, end);

	return 0;
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
2009-09-22 16:45:46 -07:00
|
|
|
walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
|
2007-10-16 01:26:12 -07:00
|
|
|
offline_isolated_pages_cb);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Check that all pages in the range, recorded as memory resource, are
 * isolated.
 *
 * walk_system_ram_range() callback.  @data points to a long that holds a
 * running page count: when test_pages_isolated() returns 0 for
 * [start_pfn, start_pfn + nr_pages), nr_pages is added to that count.
 *
 * Returns the value of test_pages_isolated() unchanged.
 */
static int check_pages_isolated_cb(unsigned long start_pfn,
				   unsigned long nr_pages, void *data)
{
	long *total = data;
	int ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);

	if (ret == 0)
		*total += (long)nr_pages;

	return ret;
}
|
|
|
|
|
|
|
|
static long
|
|
|
|
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
|
|
|
|
{
|
|
|
|
long offlined = 0;
|
|
|
|
int ret;
|
|
|
|
|
2009-09-22 16:45:46 -07:00
|
|
|
ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
|
2007-10-16 01:26:12 -07:00
|
|
|
check_pages_isolated_cb);
|
|
|
|
if (ret < 0)
|
|
|
|
offlined = (long)ret;
|
|
|
|
return offlined;
|
|
|
|
}
|
|
|
|
|
2009-12-14 18:59:35 -07:00
|
|
|
static int offline_pages(unsigned long start_pfn,
|
2007-10-16 01:26:12 -07:00
|
|
|
unsigned long end_pfn, unsigned long timeout)
|
|
|
|
{
|
|
|
|
unsigned long pfn, nr_pages, expire;
|
|
|
|
long offlined_pages;
|
2007-10-21 16:41:36 -07:00
|
|
|
int ret, drain, retry_max, node;
|
2007-10-16 01:26:12 -07:00
|
|
|
struct zone *zone;
|
2007-10-21 16:41:36 -07:00
|
|
|
struct memory_notify arg;
|
2007-10-16 01:26:12 -07:00
|
|
|
|
|
|
|
BUG_ON(start_pfn >= end_pfn);
|
|
|
|
/* at least, alignment against pageblock is necessary */
|
|
|
|
if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
|
|
|
|
return -EINVAL;
|
|
|
|
if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
|
|
|
|
return -EINVAL;
|
|
|
|
/* This makes hotplug much easier...and readable.
|
|
|
|
we assume this for now. .*/
|
|
|
|
if (!test_pages_in_a_zone(start_pfn, end_pfn))
|
|
|
|
return -EINVAL;
|
2007-10-21 16:41:36 -07:00
|
|
|
|
2009-11-17 15:06:22 -07:00
|
|
|
lock_system_sleep();
|
|
|
|
|
2007-10-21 16:41:36 -07:00
|
|
|
zone = page_zone(pfn_to_page(start_pfn));
|
|
|
|
node = zone_to_nid(zone);
|
|
|
|
nr_pages = end_pfn - start_pfn;
|
|
|
|
|
2007-10-16 01:26:12 -07:00
|
|
|
/* set above range as isolated */
|
|
|
|
ret = start_isolate_page_range(start_pfn, end_pfn);
|
|
|
|
if (ret)
|
2009-11-17 15:06:22 -07:00
|
|
|
goto out;
|
2007-10-21 16:41:36 -07:00
|
|
|
|
|
|
|
arg.start_pfn = start_pfn;
|
|
|
|
arg.nr_pages = nr_pages;
|
|
|
|
arg.status_change_nid = -1;
|
|
|
|
if (nr_pages >= node_present_pages(node))
|
|
|
|
arg.status_change_nid = node;
|
|
|
|
|
|
|
|
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
|
|
|
|
ret = notifier_to_errno(ret);
|
|
|
|
if (ret)
|
|
|
|
goto failed_removal;
|
|
|
|
|
2007-10-16 01:26:12 -07:00
|
|
|
pfn = start_pfn;
|
|
|
|
expire = jiffies + timeout;
|
|
|
|
drain = 0;
|
|
|
|
retry_max = 5;
|
|
|
|
repeat:
|
|
|
|
/* start memory hot removal */
|
|
|
|
ret = -EAGAIN;
|
|
|
|
if (time_after(jiffies, expire))
|
|
|
|
goto failed_removal;
|
|
|
|
ret = -EINTR;
|
|
|
|
if (signal_pending(current))
|
|
|
|
goto failed_removal;
|
|
|
|
ret = 0;
|
|
|
|
if (drain) {
|
|
|
|
lru_add_drain_all();
|
|
|
|
cond_resched();
|
2008-02-04 23:29:11 -07:00
|
|
|
drain_all_pages();
|
2007-10-16 01:26:12 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
pfn = scan_lru_pages(start_pfn, end_pfn);
|
|
|
|
if (pfn) { /* We have page on LRU */
|
|
|
|
ret = do_migrate_range(pfn, end_pfn);
|
|
|
|
if (!ret) {
|
|
|
|
drain = 1;
|
|
|
|
goto repeat;
|
|
|
|
} else {
|
|
|
|
if (ret < 0)
|
|
|
|
if (--retry_max == 0)
|
|
|
|
goto failed_removal;
|
|
|
|
yield();
|
|
|
|
drain = 1;
|
|
|
|
goto repeat;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/* drain all zone's lru pagevec, this is asyncronous... */
|
|
|
|
lru_add_drain_all();
|
|
|
|
yield();
|
|
|
|
/* drain pcp pages , this is synchrouns. */
|
2008-02-04 23:29:11 -07:00
|
|
|
drain_all_pages();
|
2007-10-16 01:26:12 -07:00
|
|
|
/* check again */
|
|
|
|
offlined_pages = check_pages_isolated(start_pfn, end_pfn);
|
|
|
|
if (offlined_pages < 0) {
|
|
|
|
ret = -EBUSY;
|
|
|
|
goto failed_removal;
|
|
|
|
}
|
|
|
|
printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
|
|
|
|
/* Ok, all of our target is islaoted.
|
|
|
|
We cannot do rollback at this point. */
|
|
|
|
offline_isolated_pages(start_pfn, end_pfn);
|
2007-11-14 17:59:12 -07:00
|
|
|
/* reset pagetype flags and makes migrate type to be MOVABLE */
|
|
|
|
undo_isolate_page_range(start_pfn, end_pfn);
|
2007-10-16 01:26:12 -07:00
|
|
|
/* removal success */
|
|
|
|
zone->present_pages -= offlined_pages;
|
|
|
|
zone->zone_pgdat->node_present_pages -= offlined_pages;
|
|
|
|
totalram_pages -= offlined_pages;
|
2007-10-21 16:41:36 -07:00
|
|
|
|
2009-06-16 15:32:50 -07:00
|
|
|
setup_per_zone_wmarks();
|
|
|
|
calculate_zone_inactive_ratio(zone);
|
2009-12-14 18:58:33 -07:00
|
|
|
if (!node_present_pages(node)) {
|
|
|
|
node_clear_state(node, N_HIGH_MEMORY);
|
|
|
|
kswapd_stop(node);
|
|
|
|
}
|
2009-06-16 15:32:50 -07:00
|
|
|
|
2007-10-16 01:26:12 -07:00
|
|
|
vm_total_pages = nr_free_pagecache_pages();
|
|
|
|
writeback_set_ratelimit();
|
2007-10-21 16:41:36 -07:00
|
|
|
|
|
|
|
memory_notify(MEM_OFFLINE, &arg);
|
2009-11-17 15:06:22 -07:00
|
|
|
unlock_system_sleep();
|
2007-10-16 01:26:12 -07:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
failed_removal:
|
|
|
|
printk(KERN_INFO "memory offlining %lx to %lx failed\n",
|
|
|
|
start_pfn, end_pfn);
|
2007-10-21 16:41:36 -07:00
|
|
|
memory_notify(MEM_CANCEL_OFFLINE, &arg);
|
2007-10-16 01:26:12 -07:00
|
|
|
/* pushback to free area */
|
|
|
|
undo_isolate_page_range(start_pfn, end_pfn);
|
2007-10-21 16:41:36 -07:00
|
|
|
|
2009-11-17 15:06:22 -07:00
|
|
|
out:
|
|
|
|
unlock_system_sleep();
|
2007-10-16 01:26:12 -07:00
|
|
|
return ret;
|
|
|
|
}
|
2008-10-18 20:25:58 -07:00
|
|
|
|
|
|
|
/*
 * Offline the memory described by a byte address and a byte length.
 *
 * The first pfn is PFN_DOWN(start) and the end pfn is that value plus
 * PFN_DOWN(size); the range is passed to offline_pages() with a timeout
 * of 120 * HZ.  Returns whatever offline_pages() returns.
 */
int remove_memory(u64 start, u64 size)
{
	unsigned long first_pfn = PFN_DOWN(start);
	unsigned long limit_pfn = first_pfn + PFN_DOWN(size);

	return offline_pages(first_pfn, limit_pfn, 120 * HZ);
}
|
2007-10-16 01:26:14 -07:00
|
|
|
#else
|
|
|
|
/*
 * Stub compiled on the #else branch of the CONFIG_MEMORY_HOTREMOVE
 * conditional: neither argument is used and the call always fails
 * with -EINVAL.
 */
int remove_memory(u64 start, u64 size)
{
	(void)start;
	(void)size;

	return -EINVAL;
}
|
2007-10-16 01:26:12 -07:00
|
|
|
#endif /* CONFIG_MEMORY_HOTREMOVE */
|
2008-10-18 20:25:58 -07:00
|
|
|
/* Export remove_memory() for use by modules (GPL-only export). */
EXPORT_SYMBOL_GPL(remove_memory);
|