commit b80fa3cbb7
Future changes will need to add a new member to struct vm_unmapped_area_info.
This would cause trouble for any call site that doesn't initialize the
struct. Currently every caller sets each member manually, so if new members
are added they will be left uninitialized and the core code parsing the
struct will see garbage in the new member.

It could be possible to initialize the new member manually to 0 at each call
site. This and a couple of other options were discussed. Having some struct
vm_unmapped_area_info instances not zero initialized will put those sites at
risk of feeding garbage into vm_unmapped_area() if the convention is to zero
initialize the struct and a new field addition misses a call site that
initializes each field manually. So it is useful to do things similarly
across the kernel.

The consensus (see links) was that, taking into account both code cleanliness
and minimizing the chance of introducing bugs, the best general approach is
C99 static initialization. As in:

	struct vm_unmapped_area_info info = {};

With this method of initialization, the whole struct will be zero
initialized, and any statements setting fields to zero will be unneeded. The
change should not leave cleanup behind at the call sites.

While iterating through the possible solutions, a few archs kindly acked
other variations that still zero initialized the struct. These sites have
been modified in previous changes using the pattern acked by the respective
arch.

So, to reduce the chance of bugs via uninitialized fields, perform a tree
wide change using the consensus for the best general way to do this change.
Use C99 static initialization to zero the struct and remove any statements
that simply set members to zero.

Link: https://lkml.kernel.org/r/20240326021656.202649-11-rick.p.edgecombe@intel.com
Link: https://lore.kernel.org/lkml/202402280912.33AEE7A9CF@keescook/#t
Link: https://lore.kernel.org/lkml/j7bfvig3gew3qruouxrh7z7ehjjafrgkbcmg6tcghhfh3rhmzi@wzlcoecgy5rs/
Link: https://lore.kernel.org/lkml/ec3e377a-c0a0-4dd3-9cb9-96517e54d17e@csgroup.eu/
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aneesh Kumar K.V <aneesh.kumar@kernel.org>
Cc: Borislav Petkov (AMD) <bp@alien8.de>
Cc: Christophe Leroy <christophe.leroy@csgroup.eu>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Deepak Gupta <debug@rivosinc.com>
Cc: Guo Ren <guoren@kernel.org>
Cc: Helge Deller <deller@gmx.de>
Cc: H. Peter Anvin (Intel) <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "James E.J. Bottomley" <James.Bottomley@HansenPartnership.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mark Brown <broonie@kernel.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Naveen N. Rao <naveen.n.rao@linux.ibm.com>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
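To make the conversion concrete, here is a rough sketch of the before/after
shape of a typical caller. It is illustrative only, not one of the actual
converted call sites; the helper names get_area_old()/get_area_new() and the
limits shown (TASK_UNMAPPED_BASE, TASK_SIZE, no alignment) are placeholders
picked for the example:

	/* Sketch assumes kernel context: struct vm_unmapped_area_info and
	 * vm_unmapped_area() come from <linux/mm.h>.
	 */

	/* Before: every member must be assigned by hand; a newly added
	 * member would silently stay uninitialized at a missed site.
	 */
	static unsigned long get_area_old(unsigned long len)
	{
		struct vm_unmapped_area_info info;

		info.flags = 0;
		info.length = len;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		info.align_mask = 0;
		info.align_offset = 0;
		return vm_unmapped_area(&info);
	}

	/* After: the empty initializer zeroes the whole struct, so only
	 * the members that need non-zero values are assigned.
	 */
	static unsigned long get_area_new(unsigned long len)
	{
		struct vm_unmapped_area_info info = {};

		info.length = len;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = TASK_SIZE;
		return vm_unmapped_area(&info);
	}

The sparc64 hugetlb code below is one of the converted files; its two
vm_unmapped_area_info users now rely on the empty initializer.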
// SPDX-License-Identifier: GPL-2.0
/*
 * SPARC64 Huge TLB page support.
 *
 * Copyright (C) 2002, 2003, 2006 David S. Miller (davem@davemloft.net)
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/sysctl.h>

#include <asm/mman.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>

/* Slightly simplified from the non-hugepage variant because by
 * definition we don't have to worry about any page coloring stuff
 */

static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp,
							unsigned long addr,
							unsigned long len,
							unsigned long pgoff,
							unsigned long flags)
{
	struct hstate *h = hstate_file(filp);
	unsigned long task_size = TASK_SIZE;
	struct vm_unmapped_area_info info = {};

	if (test_thread_flag(TIF_32BIT))
		task_size = STACK_TOP32;

	info.length = len;
	info.low_limit = TASK_UNMAPPED_BASE;
	info.high_limit = min(task_size, VA_EXCLUDE_START);
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	addr = vm_unmapped_area(&info);

	if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) {
		VM_BUG_ON(addr != -ENOMEM);
		info.low_limit = VA_EXCLUDE_END;
		info.high_limit = task_size;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

static unsigned long
hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
				  const unsigned long len,
				  const unsigned long pgoff,
				  const unsigned long flags)
{
	struct hstate *h = hstate_file(filp);
	struct mm_struct *mm = current->mm;
	unsigned long addr = addr0;
	struct vm_unmapped_area_info info = {};

	/* This should only ever run for 32-bit processes. */
	BUG_ON(!test_thread_flag(TIF_32BIT));

	info.flags = VM_UNMAPPED_AREA_TOPDOWN;
	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = mm->mmap_base;
	info.align_mask = PAGE_MASK & ~huge_page_mask(h);
	addr = vm_unmapped_area(&info);

	/*
	 * A failed mmap() very likely causes application failure,
	 * so fall back to the bottom-up function here. This scenario
	 * can happen with large stack limits and large mmap()
	 * allocations.
	 */
	if (addr & ~PAGE_MASK) {
		VM_BUG_ON(addr != -ENOMEM);
		info.flags = 0;
		info.low_limit = TASK_UNMAPPED_BASE;
		info.high_limit = STACK_TOP32;
		addr = vm_unmapped_area(&info);
	}

	return addr;
}

unsigned long
hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags)
{
	struct hstate *h = hstate_file(file);
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long task_size = TASK_SIZE;

	if (test_thread_flag(TIF_32BIT))
		task_size = STACK_TOP32;

	if (len & ~huge_page_mask(h))
		return -EINVAL;
	if (len > task_size)
		return -ENOMEM;

	if (flags & MAP_FIXED) {
		if (prepare_hugepage_range(file, addr, len))
			return -EINVAL;
		return addr;
	}

	if (addr) {
		addr = ALIGN(addr, huge_page_size(h));
		vma = find_vma(mm, addr);
		if (task_size - len >= addr &&
		    (!vma || addr + len <= vm_start_gap(vma)))
			return addr;
	}
	if (!test_bit(MMF_TOPDOWN, &mm->flags))
		return hugetlb_get_unmapped_area_bottomup(file, addr, len,
				pgoff, flags);
	else
		return hugetlb_get_unmapped_area_topdown(file, addr, len,
				pgoff, flags);
}

static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
{
	return entry;
}

static pte_t sun4v_hugepage_shift_to_tte(pte_t entry, unsigned int shift)
{
	unsigned long hugepage_size = _PAGE_SZ4MB_4V;

	pte_val(entry) = pte_val(entry) & ~_PAGE_SZALL_4V;

	switch (shift) {
	case HPAGE_16GB_SHIFT:
		hugepage_size = _PAGE_SZ16GB_4V;
		pte_val(entry) |= _PAGE_PUD_HUGE;
		break;
	case HPAGE_2GB_SHIFT:
		hugepage_size = _PAGE_SZ2GB_4V;
		pte_val(entry) |= _PAGE_PMD_HUGE;
		break;
	case HPAGE_256MB_SHIFT:
		hugepage_size = _PAGE_SZ256MB_4V;
		pte_val(entry) |= _PAGE_PMD_HUGE;
		break;
	case HPAGE_SHIFT:
		pte_val(entry) |= _PAGE_PMD_HUGE;
		break;
	case HPAGE_64K_SHIFT:
		hugepage_size = _PAGE_SZ64K_4V;
		break;
	default:
		WARN_ONCE(1, "unsupported hugepage shift=%u\n", shift);
	}

	pte_val(entry) = pte_val(entry) | hugepage_size;
	return entry;
}

static pte_t hugepage_shift_to_tte(pte_t entry, unsigned int shift)
{
	if (tlb_type == hypervisor)
		return sun4v_hugepage_shift_to_tte(entry, shift);
	else
		return sun4u_hugepage_shift_to_tte(entry, shift);
}

pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags)
{
	pte_t pte;

	entry = pte_mkhuge(entry);
	pte = hugepage_shift_to_tte(entry, shift);

#ifdef CONFIG_SPARC64
	/* If this vma has ADI enabled on it, turn on TTE.mcd
	 */
	if (flags & VM_SPARC_ADI)
		return pte_mkmcd(pte);
	else
		return pte_mknotmcd(pte);
#else
	return pte;
#endif
}

static unsigned int sun4v_huge_tte_to_shift(pte_t entry)
{
	unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4V;
	unsigned int shift;

	switch (tte_szbits) {
	case _PAGE_SZ16GB_4V:
		shift = HPAGE_16GB_SHIFT;
		break;
	case _PAGE_SZ2GB_4V:
		shift = HPAGE_2GB_SHIFT;
		break;
	case _PAGE_SZ256MB_4V:
		shift = HPAGE_256MB_SHIFT;
		break;
	case _PAGE_SZ4MB_4V:
		shift = REAL_HPAGE_SHIFT;
		break;
	case _PAGE_SZ64K_4V:
		shift = HPAGE_64K_SHIFT;
		break;
	default:
		shift = PAGE_SHIFT;
		break;
	}
	return shift;
}

static unsigned int sun4u_huge_tte_to_shift(pte_t entry)
{
	unsigned long tte_szbits = pte_val(entry) & _PAGE_SZALL_4U;
	unsigned int shift;

	switch (tte_szbits) {
	case _PAGE_SZ256MB_4U:
		shift = HPAGE_256MB_SHIFT;
		break;
	case _PAGE_SZ4MB_4U:
		shift = REAL_HPAGE_SHIFT;
		break;
	case _PAGE_SZ64K_4U:
		shift = HPAGE_64K_SHIFT;
		break;
	default:
		shift = PAGE_SHIFT;
		break;
	}
	return shift;
}

static unsigned long tte_to_shift(pte_t entry)
{
	if (tlb_type == hypervisor)
		return sun4v_huge_tte_to_shift(entry);

	return sun4u_huge_tte_to_shift(entry);
}

static unsigned int huge_tte_to_shift(pte_t entry)
{
	unsigned long shift = tte_to_shift(entry);

	if (shift == PAGE_SHIFT)
		WARN_ONCE(1, "tto_to_shift: invalid hugepage tte=0x%lx\n",
			  pte_val(entry));

	return shift;
}

static unsigned long huge_tte_to_size(pte_t pte)
{
	unsigned long size = 1UL << huge_tte_to_shift(pte);

	if (size == REAL_HPAGE_SIZE)
		size = HPAGE_SIZE;
	return size;
}

unsigned long pud_leaf_size(pud_t pud) { return 1UL << tte_to_shift(*(pte_t *)&pud); }
unsigned long pmd_leaf_size(pmd_t pmd) { return 1UL << tte_to_shift(*(pte_t *)&pmd); }
unsigned long pte_leaf_size(pte_t pte) { return 1UL << tte_to_shift(pte); }

pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
			unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	p4d = p4d_offset(pgd, addr);
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	if (sz >= PUD_SIZE)
		return (pte_t *)pud;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;
	if (sz >= PMD_SIZE)
		return (pte_t *)pmd;
	return pte_alloc_huge(mm, pmd, addr);
}

pte_t *huge_pte_offset(struct mm_struct *mm,
		       unsigned long addr, unsigned long sz)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;

	pgd = pgd_offset(mm, addr);
	if (pgd_none(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (is_hugetlb_pud(*pud))
		return (pte_t *)pud;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (is_hugetlb_pmd(*pmd))
		return (pte_t *)pmd;
	return pte_offset_huge(pmd, addr);
}

void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		       pte_t *ptep, pte_t entry)
{
	unsigned int nptes, orig_shift, shift;
	unsigned long i, size;
	pte_t orig;

	size = huge_tte_to_size(entry);

	shift = PAGE_SHIFT;
	if (size >= PUD_SIZE)
		shift = PUD_SHIFT;
	else if (size >= PMD_SIZE)
		shift = PMD_SHIFT;
	else
		shift = PAGE_SHIFT;

	nptes = size >> shift;

	if (!pte_present(*ptep) && pte_present(entry))
		mm->context.hugetlb_pte_count += nptes;

	addr &= ~(size - 1);
	orig = *ptep;
	orig_shift = pte_none(orig) ? PAGE_SHIFT : huge_tte_to_shift(orig);

	for (i = 0; i < nptes; i++)
		ptep[i] = __pte(pte_val(entry) + (i << shift));

	maybe_tlb_batch_add(mm, addr, ptep, orig, 0, orig_shift);
	/* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
	if (size == HPAGE_SIZE)
		maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, orig, 0,
				    orig_shift);
}

void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
		     pte_t *ptep, pte_t entry, unsigned long sz)
{
	__set_huge_pte_at(mm, addr, ptep, entry);
}

pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep)
{
	unsigned int i, nptes, orig_shift, shift;
	unsigned long size;
	pte_t entry;

	entry = *ptep;
	size = huge_tte_to_size(entry);

	shift = PAGE_SHIFT;
	if (size >= PUD_SIZE)
		shift = PUD_SHIFT;
	else if (size >= PMD_SIZE)
		shift = PMD_SHIFT;
	else
		shift = PAGE_SHIFT;

	nptes = size >> shift;
	orig_shift = pte_none(entry) ? PAGE_SHIFT : huge_tte_to_shift(entry);

	if (pte_present(entry))
		mm->context.hugetlb_pte_count -= nptes;

	addr &= ~(size - 1);
	for (i = 0; i < nptes; i++)
		ptep[i] = __pte(0UL);

	maybe_tlb_batch_add(mm, addr, ptep, entry, 0, orig_shift);
	/* An HPAGE_SIZE'ed page is composed of two REAL_HPAGE_SIZE'ed pages */
	if (size == HPAGE_SIZE)
		maybe_tlb_batch_add(mm, addr + REAL_HPAGE_SIZE, ptep, entry, 0,
				    orig_shift);

	return entry;
}

static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
				   unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);

	pmd_clear(pmd);
	pte_free_tlb(tlb, token, addr);
	mm_dec_nr_ptes(tlb->mm);
}

static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pmd_t *pmd;
	unsigned long next;
	unsigned long start;

	start = addr;
	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);
		if (pmd_none(*pmd))
			continue;
		if (is_hugetlb_pmd(*pmd))
			pmd_clear(pmd);
		else
			hugetlb_free_pte_range(tlb, pmd, addr);
	} while (pmd++, addr = next, addr != end);

	start &= PUD_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PUD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pmd = pmd_offset(pud, start);
	pud_clear(pud);
	pmd_free_tlb(tlb, pmd, start);
	mm_dec_nr_pmds(tlb->mm);
}

static void hugetlb_free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
				   unsigned long addr, unsigned long end,
				   unsigned long floor, unsigned long ceiling)
{
	pud_t *pud;
	unsigned long next;
	unsigned long start;

	start = addr;
	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);
		if (pud_none_or_clear_bad(pud))
			continue;
		if (is_hugetlb_pud(*pud))
			pud_clear(pud);
		else
			hugetlb_free_pmd_range(tlb, pud, addr, next, floor,
					       ceiling);
	} while (pud++, addr = next, addr != end);

	start &= PGDIR_MASK;
	if (start < floor)
		return;
	if (ceiling) {
		ceiling &= PGDIR_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		return;

	pud = pud_offset(p4d, start);
	p4d_clear(p4d);
	pud_free_tlb(tlb, pud, start);
	mm_dec_nr_puds(tlb->mm);
}

void hugetlb_free_pgd_range(struct mmu_gather *tlb,
			    unsigned long addr, unsigned long end,
			    unsigned long floor, unsigned long ceiling)
{
	pgd_t *pgd;
	p4d_t *p4d;
	unsigned long next;

	addr &= PMD_MASK;
	if (addr < floor) {
		addr += PMD_SIZE;
		if (!addr)
			return;
	}
	if (ceiling) {
		ceiling &= PMD_MASK;
		if (!ceiling)
			return;
	}
	if (end - 1 > ceiling - 1)
		end -= PMD_SIZE;
	if (addr > end - 1)
		return;

	pgd = pgd_offset(tlb->mm, addr);
	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);
		if (p4d_none_or_clear_bad(p4d))
			continue;
		hugetlb_free_pud_range(tlb, p4d, addr, next, floor, ceiling);
	} while (p4d++, addr = next, addr != end);
}