4c21e2f244
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with a many-threaded application which concurrently initializes different parts of a large anonymous area. This patch corrects that, by using a separate spinlock per page table page, to guard the page table entries in that page, instead of using the mm's single page_table_lock. (But even then, page_table_lock is still used to guard page table allocation, and anon_vma allocation.) In this implementation, the spinlock is tucked inside the struct page of the page table page: with a BUILD_BUG_ON in case it overflows - which it would in the case of 32-bit PA-RISC with spinlock debugging enabled. Splitting the lock is not quite for free: another cacheline access. Ideally, I suppose we would use split ptlock only for multi-threaded processes on multi-cpu machines; but deciding that dynamically would have its own costs. So for now enable it by config, at some number of cpus - since the Kconfig language doesn't support inequalities, let preprocessor compare that with NR_CPUS. But I don't think it's worth being user-configurable: for good testing of both split and unsplit configs, split now at 4 cpus, and perhaps change that to 8 later. There is a benefit even for singly threaded processes: kswapd can be attacking one part of the mm while another part is busy faulting. Signed-off-by: Hugh Dickins <hugh@veritas.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
160 lines
4.2 KiB
C
160 lines
4.2 KiB
C
/* pgalloc.c: page directory & page table allocation
|
|
*
|
|
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
|
|
* Written by David Howells (dhowells@redhat.com)
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/highmem.h>
|
|
#include <asm/pgalloc.h>
|
|
#include <asm/page.h>
|
|
#include <asm/cacheflush.h>
|
|
|
|
/*
 * Page directory of PTRS_PER_PGD entries, aligned to PAGE_SIZE.
 * pgd_ctor() copies its entries from index USER_PGDS_IN_LAST_PML4 upwards
 * into every pgd it constructs.
 */
pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE)));
|
|
/*
 * Slab cache for page directories: created by pgtable_cache_init(),
 * allocated from by pgd_alloc() and released to by pgd_free().
 */
kmem_cache_t *pgd_cache;
|
|
|
|
pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
|
|
{
|
|
pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT);
|
|
if (pte)
|
|
clear_page(pte);
|
|
return pte;
|
|
}
|
|
|
|
struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
|
|
{
|
|
struct page *page;
|
|
|
|
#ifdef CONFIG_HIGHPTE
|
|
page = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0);
|
|
#else
|
|
page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0);
|
|
#endif
|
|
if (page)
|
|
clear_highpage(page);
|
|
flush_dcache_page(page);
|
|
return page;
|
|
}
|
|
|
|
/*
 * Fill in the segment table entries of one PMD.
 *
 * @pmdptr: PMD whose ->ste words are written.
 * @pmd:    value for the first entry, or 0 to clear the whole PMD.
 *
 * With @pmd == 0, PME_SIZE bytes at pmdptr->ste are zeroed.  Otherwise
 * successive words receive @pmd, @pmd + __frv_PT_SIZE, and so on, one word
 * per 4 bytes of PME_SIZE (presumably 4 == sizeof(unsigned long) on this
 * architecture -- TODO confirm).  A non-zero @pmd with any of the bits
 * 0x3f00, xAMPRx_SS or 0xe set trips BUG_ON().
 *
 * Finally frv_dcache_writeback() is called over the range covering *pmdptr.
 */
void __set_pmd(pmd_t *pmdptr, unsigned long pmd)
{
	unsigned long *ste = pmdptr->ste;
	int remaining;

	if (pmd) {
		BUG_ON(pmd & (0x3f00 | xAMPRx_SS | 0xe));

		remaining = PME_SIZE;
		while (remaining > 0) {
			*ste = pmd;
			ste++;
			pmd += __frv_PT_SIZE;
			remaining -= 4;
		}
	} else {
		memset(ste, 0, PME_SIZE);
	}

	frv_dcache_writeback((unsigned long) pmdptr, (unsigned long) (pmdptr + 1));
}
|
|
|
|
/*
|
|
* List of all pgd's needed for non-PAE so it can invalidate entries
|
|
* in both cached and uncached pgd's; not needed for PAE since the
|
|
* kernel pmd is shared. If PAE were not to share the pmd a similar
|
|
* tactic would be needed. This is essentially codepath-based locking
|
|
* against pageattr.c; it is the unique case in which a valid change
|
|
* of kernel pagetables can't be lazily synchronized by vmalloc faults.
|
|
* vmalloc faults work because attached pagetables are never freed.
|
|
* If the locking proves to be non-performant, a ticketing scheme with
|
|
* checks at dup_mmap(), exec(), and other mmlist addition points
|
|
* could be used. The locking scheme was chosen on the basis of
|
|
* manfred's recommendations and having no core impact whatsoever.
|
|
* -- wli
|
|
*/
|
|
/*
 * Protects pgd_list: taken with spin_lock_irqsave() around pgd_list_add()
 * in pgd_ctor() and around pgd_list_del() in pgd_dtor().
 */
DEFINE_SPINLOCK(pgd_lock);
|
|
/*
 * Head of the list of pages backing pgds.  Each page's ->index holds the
 * next page and its private word holds the address of the pointer that
 * refers to it; see pgd_list_add() and pgd_list_del().
 */
struct page *pgd_list;
|
|
|
|
/*
 * Push the page backing @pgd onto the head of pgd_list.
 *
 * The list is threaded through struct page: ->index holds the next page,
 * and the page's private word holds the address of the pointer that refers
 * to this page, so pgd_list_del() can unlink it without walking the list.
 *
 * The private word is written only through set_page_private(), so this
 * code does not depend on where struct page keeps that field.
 *
 * pgd_ctor() calls this with pgd_lock held when PTRS_PER_PMD == 1.
 */
static inline void pgd_list_add(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);

	/* New page points at the old head ... */
	page->index = (unsigned long) pgd_list;

	/* ... and the old head's back-link now refers to our ->index. */
	if (pgd_list)
		set_page_private(pgd_list, (unsigned long) &page->index);

	pgd_list = page;
	set_page_private(page, (unsigned long) &pgd_list);
}
|
|
|
|
/*
 * Unlink the page backing @pgd from pgd_list.
 *
 * Reads the next page from ->index and the back-link (address of the
 * pointer that refers to this page) from the page's private word, makes
 * that pointer skip to the next page, and hands the back-link on to the
 * next page if there is one.
 *
 * The private word is accessed only through page_private() and
 * set_page_private(), so this code does not depend on where struct page
 * keeps that field.
 *
 * pgd_dtor() calls this with pgd_lock held.
 */
static inline void pgd_list_del(pgd_t *pgd)
{
	struct page *page = virt_to_page(pgd);
	struct page *next = (struct page *) page->index;
	struct page **pprev = (struct page **) page_private(page);

	*pprev = next;
	if (next)
		set_page_private(next, (unsigned long) pprev);
}
|
|
|
|
/*
 * Slab constructor for objects of pgd_cache.
 *
 * Copies the swapper_pg_dir entries from index USER_PGDS_IN_LAST_PML4 up
 * to PTRS_PER_PGD into the new pgd.  When PTRS_PER_PMD > 1 nothing else is
 * done.  Otherwise the pgd is added to pgd_list and its first
 * USER_PGDS_IN_LAST_PML4 entries are zeroed; in the PTRS_PER_PMD == 1 case
 * pgd_lock is held across the copy and the list insertion.
 *
 * @cache and @unused are not used.
 */
void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
{
	pgd_t *new_pgd = pgd;
	unsigned long flags;

	if (PTRS_PER_PMD == 1)
		spin_lock_irqsave(&pgd_lock, flags);

	memcpy(&new_pgd[USER_PGDS_IN_LAST_PML4],
	       &swapper_pg_dir[USER_PGDS_IN_LAST_PML4],
	       (PTRS_PER_PGD - USER_PGDS_IN_LAST_PML4) * sizeof(pgd_t));

	if (PTRS_PER_PMD > 1)
		return;

	pgd_list_add(new_pgd);
	spin_unlock_irqrestore(&pgd_lock, flags);

	/* Clear the leading entries that were not copied from swapper_pg_dir. */
	memset(new_pgd, 0, USER_PGDS_IN_LAST_PML4 * sizeof(pgd_t));
}
|
|
|
|
/* never called when PTRS_PER_PMD > 1 */
|
|
void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
|
|
{
|
|
unsigned long flags; /* can be called from interrupt context */
|
|
|
|
spin_lock_irqsave(&pgd_lock, flags);
|
|
pgd_list_del(pgd);
|
|
spin_unlock_irqrestore(&pgd_lock, flags);
|
|
}
|
|
|
|
pgd_t *pgd_alloc(struct mm_struct *mm)
|
|
{
|
|
pgd_t *pgd;
|
|
|
|
pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
|
|
if (!pgd)
|
|
return pgd;
|
|
|
|
return pgd;
|
|
}
|
|
|
|
/*
 * Return a page directory obtained from pgd_alloc() to pgd_cache.
 *
 * No entries are cleared here: in the non-PAE case clear_page_tables()
 * has already cleared the user pgd entries.
 */
void pgd_free(pgd_t *pgd)
{
	kmem_cache_free(pgd_cache, pgd);
}
|
|
|
|
/*
 * Create the slab cache that page directories are allocated from.
 *
 * Each object is one full pgd (PTRS_PER_PGD entries) and is aligned to its
 * own size; pgd_ctor() and pgd_dtor() are registered as constructor and
 * destructor.  Panics if the cache cannot be created.
 */
void __init pgtable_cache_init(void)
{
	const size_t pgd_bytes = PTRS_PER_PGD * sizeof(pgd_t);

	pgd_cache = kmem_cache_create("pgd",
				      pgd_bytes,	/* object size */
				      pgd_bytes,	/* alignment */
				      0,		/* flags */
				      pgd_ctor,
				      pgd_dtor);
	if (pgd_cache)
		return;

	panic("pgtable_cache_init(): Cannot create pgd cache");
}
|