1
linux/arch/x86/kernel/shstk.c

580 lines
13 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* shstk.c - Intel shadow stack support
*
* Copyright (c) 2021, Intel Corporation.
* Yu-cheng Yu <yu-cheng.yu@intel.com>
*/
#include <linux/sched.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/sched/signal.h>
#include <linux/compat.h>
#include <linux/sizes.h>
#include <linux/user.h>
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
#include <linux/syscalls.h>
#include <asm/msr.h>
#include <asm/fpu/xstate.h>
#include <asm/fpu/types.h>
#include <asm/shstk.h>
#include <asm/special_insns.h>
#include <asm/fpu/api.h>
#include <asm/prctl.h>
x86/shstk: Introduce routines modifying shstk Shadow stacks are normally written to via CALL/RET or specific CET instructions like RSTORSSP/SAVEPREVSSP. However, sometimes the kernel will need to write to the shadow stack directly using the ring-0 only WRUSS instruction. A shadow stack restore token marks a restore point of the shadow stack, and the address in a token must point directly above the token, which is within the same shadow stack. This is distinctively different from other pointers on the shadow stack, since those pointers point to executable code area. Introduce token setup and verify routines. Also introduce WRUSS, which is a kernel-mode instruction but writes directly to user shadow stack. In future patches that enable shadow stack to work with signals, the kernel will need something to denote the point in the stack where sigreturn may be called. This will prevent attackers calling sigreturn at arbitrary places in the stack, in order to help prevent SROP attacks. To do this, something that can only be written by the kernel needs to be placed on the shadow stack. This can be accomplished by setting bit 63 in the frame written to the shadow stack. Userspace return addresses can't have this bit set as it is in the kernel range. It also can't be a valid restore token. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-31-rick.p.edgecombe%40intel.com
2023-06-12 17:10:56 -07:00
#define SS_FRAME_SIZE 8
static bool features_enabled(unsigned long features)
{
return current->thread.features & features;
}
static void features_set(unsigned long features)
{
current->thread.features |= features;
}
static void features_clr(unsigned long features)
{
current->thread.features &= ~features;
}
x86/shstk: Introduce routines modifying shstk Shadow stacks are normally written to via CALL/RET or specific CET instructions like RSTORSSP/SAVEPREVSSP. However, sometimes the kernel will need to write to the shadow stack directly using the ring-0 only WRUSS instruction. A shadow stack restore token marks a restore point of the shadow stack, and the address in a token must point directly above the token, which is within the same shadow stack. This is distinctively different from other pointers on the shadow stack, since those pointers point to executable code area. Introduce token setup and verify routines. Also introduce WRUSS, which is a kernel-mode instruction but writes directly to user shadow stack. In future patches that enable shadow stack to work with signals, the kernel will need something to denote the point in the stack where sigreturn may be called. This will prevent attackers calling sigreturn at arbitrary places in the stack, in order to help prevent SROP attacks. To do this, something that can only be written by the kernel needs to be placed on the shadow stack. This can be accomplished by setting bit 63 in the frame written to the shadow stack. Userspace return addresses can't have this bit set as it is in the kernel range. It also can't be a valid restore token. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-31-rick.p.edgecombe%40intel.com
2023-06-12 17:10:56 -07:00
/*
* Create a restore token on the shadow stack. A token is always 8-byte
* and aligned to 8.
*/
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
{
unsigned long addr;
/* Token must be aligned */
if (!IS_ALIGNED(ssp, 8))
return -EINVAL;
addr = ssp - SS_FRAME_SIZE;
/*
* SSP is aligned, so reserved bits and mode bit are a zero, just mark
* the token 64-bit.
*/
ssp |= BIT(0);
if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
return -EFAULT;
if (token_addr)
*token_addr = addr;
return 0;
}
/*
* VM_SHADOW_STACK will have a guard page. This helps userspace protect
* itself from attacks. The reasoning is as follows:
*
* The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
* INCSSP instruction can increment the shadow stack pointer. It is the
* shadow stack analog of an instruction like:
*
* addq $0x80, %rsp
*
* However, there is one important difference between an ADD on %rsp
* and INCSSP. In addition to modifying SSP, INCSSP also reads from the
* memory of the first and last elements that were "popped". It can be
* thought of as acting like this:
*
* READ_ONCE(ssp); // read+discard top element on stack
* ssp += nr_to_pop * 8; // move the shadow stack
* READ_ONCE(ssp-8); // read+discard last popped stack element
*
* The maximum distance INCSSP can move the SSP is 2040 bytes, before
* it would read the memory. Therefore a single page gap will be enough
* to prevent any operation from shifting the SSP to an adjacent stack,
* since it would have to land in the gap at least once, causing a
* fault.
*/
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
unsigned long token_offset, bool set_res_tok)
{
int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
struct mm_struct *mm = current->mm;
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
unsigned long mapped_addr, unused;
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
if (addr)
flags |= MAP_FIXED_NOREPLACE;
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
mmap_write_lock(mm);
mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
mmap_write_unlock(mm);
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
goto out;
if (create_rstor_token(mapped_addr + token_offset, NULL)) {
vm_munmap(mapped_addr, size);
return -EINVAL;
}
out:
return mapped_addr;
}
static unsigned long adjust_shstk_size(unsigned long size)
{
if (size)
return PAGE_ALIGN(size);
return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
}
static void unmap_shadow_stack(u64 base, u64 size)
{
int r;
r = vm_munmap(base, size);
/*
* mmap_write_lock_killable() failed with -EINTR. This means
* the process is about to die and have it's MM cleaned up.
* This task shouldn't ever make it back to userspace. In this
* case it is ok to leak a shadow stack, so just exit out.
*/
if (r == -EINTR)
return;
/*
* For all other types of vm_munmap() failure, either the
* system is out of memory or there is bug.
*/
WARN_ON_ONCE(r);
}
static int shstk_setup(void)
{
struct thread_shstk *shstk = &current->thread.shstk;
unsigned long addr, size;
/* Already enabled */
if (features_enabled(ARCH_SHSTK_SHSTK))
return 0;
/* Also not supported for 32 bit */
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
return -EOPNOTSUPP;
size = adjust_shstk_size(0);
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
addr = alloc_shstk(0, size, 0, false);
if (IS_ERR_VALUE(addr))
return PTR_ERR((void *)addr);
fpregs_lock_and_load();
wrmsrl(MSR_IA32_PL3_SSP, addr + size);
wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
fpregs_unlock();
shstk->base = addr;
shstk->size = size;
features_set(ARCH_SHSTK_SHSTK);
return 0;
}
void reset_thread_features(void)
{
memset(&current->thread.shstk, 0, sizeof(struct thread_shstk));
current->thread.features = 0;
current->thread.features_locked = 0;
}
x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-06-12 17:10:55 -07:00
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
unsigned long stack_size)
{
struct thread_shstk *shstk = &tsk->thread.shstk;
unsigned long addr, size;
/*
* If shadow stack is not enabled on the new thread, skip any
* switch to a new shadow stack.
*/
if (!features_enabled(ARCH_SHSTK_SHSTK))
return 0;
/*
x86/shstk: Handle vfork clone failure correctly Shadow stacks are allocated automatically and freed on exit, depending on the clone flags. The two cases where new shadow stacks are not allocated are !CLONE_VM (fork()) and CLONE_VFORK (vfork()). For !CLONE_VM, although a new stack is not allocated, it can be freed normally because it will happen in the child's copy of the VM. However, for CLONE_VFORK the parent and the child are actually using the same shadow stack. So the kernel doesn't need to allocate *or* free a shadow stack for a CLONE_VFORK child. CLONE_VFORK children already need special tracking to avoid returning to userspace until the child exits or execs. Shadow stack uses this same tracking to avoid freeing CLONE_VFORK shadow stacks. However, the tracking is not setup until the clone has succeeded (internally). Which means, if a CLONE_VFORK fails, the existing logic will not know it is a CLONE_VFORK and proceed to unmap the parents shadow stack. This error handling cleanup logic runs via exit_thread() in the bad_fork_cleanup_thread label in copy_process(). The issue was seen in the glibc test "posix/tst-spawn3-pidfd" while running with shadow stack using currently out-of-tree glibc patches. Fix it by not unmapping the vfork shadow stack in the error case as well. Since clone is implemented in core code, it is not ideal to pass the clone flags along the error path in order to have shadow stack code have symmetric logic in the freeing half of the thread shadow stack handling. Instead use the existing state for thread shadow stacks to track whether the thread is managing its own shadow stack. For CLONE_VFORK, simply set shstk->base and shstk->size to 0, and have it mean the thread is not managing a shadow stack and so should skip cleanup work. Implement this by breaking up the CLONE_VFORK and !CLONE_VM cases in shstk_alloc_thread_stack() to separate conditionals since, the logic is now different between them. In the case of CLONE_VFORK && !CLONE_VM, the existing behavior is to not clean up the shadow stack in the child (which should go away quickly with either be exit or exec), so maintain that behavior by handling the CLONE_VFORK case first in the allocation path. This new logioc cleanly handles the case of normal, successful CLONE_VFORK's skipping cleaning up their shadow stack's on exit as well. So remove the existing, vfork shadow stack freeing logic. This is in deactivate_mm() where vfork_done is used to tell if it is a vfork child that can skip cleaning up the thread shadow stack. Fixes: b2926a36b97a ("x86/shstk: Handle thread shadow stack") Reported-by: H.J. Lu <hjl.tools@gmail.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Tested-by: H.J. Lu <hjl.tools@gmail.com> Link: https://lore.kernel.org/all/20230908203655.543765-2-rick.p.edgecombe%40intel.com
2023-09-08 13:36:53 -07:00
* For CLONE_VFORK the child will share the parents shadow stack.
* Make sure to clear the internal tracking of the thread shadow
* stack so the freeing logic run for child knows to leave it alone.
*/
if (clone_flags & CLONE_VFORK) {
shstk->base = 0;
shstk->size = 0;
return 0;
}
/*
* For !CLONE_VM the child will use a copy of the parents shadow
x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-06-12 17:10:55 -07:00
* stack.
*/
x86/shstk: Handle vfork clone failure correctly Shadow stacks are allocated automatically and freed on exit, depending on the clone flags. The two cases where new shadow stacks are not allocated are !CLONE_VM (fork()) and CLONE_VFORK (vfork()). For !CLONE_VM, although a new stack is not allocated, it can be freed normally because it will happen in the child's copy of the VM. However, for CLONE_VFORK the parent and the child are actually using the same shadow stack. So the kernel doesn't need to allocate *or* free a shadow stack for a CLONE_VFORK child. CLONE_VFORK children already need special tracking to avoid returning to userspace until the child exits or execs. Shadow stack uses this same tracking to avoid freeing CLONE_VFORK shadow stacks. However, the tracking is not setup until the clone has succeeded (internally). Which means, if a CLONE_VFORK fails, the existing logic will not know it is a CLONE_VFORK and proceed to unmap the parents shadow stack. This error handling cleanup logic runs via exit_thread() in the bad_fork_cleanup_thread label in copy_process(). The issue was seen in the glibc test "posix/tst-spawn3-pidfd" while running with shadow stack using currently out-of-tree glibc patches. Fix it by not unmapping the vfork shadow stack in the error case as well. Since clone is implemented in core code, it is not ideal to pass the clone flags along the error path in order to have shadow stack code have symmetric logic in the freeing half of the thread shadow stack handling. Instead use the existing state for thread shadow stacks to track whether the thread is managing its own shadow stack. For CLONE_VFORK, simply set shstk->base and shstk->size to 0, and have it mean the thread is not managing a shadow stack and so should skip cleanup work. Implement this by breaking up the CLONE_VFORK and !CLONE_VM cases in shstk_alloc_thread_stack() to separate conditionals since, the logic is now different between them. In the case of CLONE_VFORK && !CLONE_VM, the existing behavior is to not clean up the shadow stack in the child (which should go away quickly with either be exit or exec), so maintain that behavior by handling the CLONE_VFORK case first in the allocation path. This new logioc cleanly handles the case of normal, successful CLONE_VFORK's skipping cleaning up their shadow stack's on exit as well. So remove the existing, vfork shadow stack freeing logic. This is in deactivate_mm() where vfork_done is used to tell if it is a vfork child that can skip cleaning up the thread shadow stack. Fixes: b2926a36b97a ("x86/shstk: Handle thread shadow stack") Reported-by: H.J. Lu <hjl.tools@gmail.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Tested-by: H.J. Lu <hjl.tools@gmail.com> Link: https://lore.kernel.org/all/20230908203655.543765-2-rick.p.edgecombe%40intel.com
2023-09-08 13:36:53 -07:00
if (!(clone_flags & CLONE_VM))
x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-06-12 17:10:55 -07:00
return 0;
size = adjust_shstk_size(stack_size);
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
addr = alloc_shstk(0, size, 0, false);
x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-06-12 17:10:55 -07:00
if (IS_ERR_VALUE(addr))
return addr;
shstk->base = addr;
shstk->size = size;
return addr + size;
}
x86/shstk: Introduce routines modifying shstk Shadow stacks are normally written to via CALL/RET or specific CET instructions like RSTORSSP/SAVEPREVSSP. However, sometimes the kernel will need to write to the shadow stack directly using the ring-0 only WRUSS instruction. A shadow stack restore token marks a restore point of the shadow stack, and the address in a token must point directly above the token, which is within the same shadow stack. This is distinctively different from other pointers on the shadow stack, since those pointers point to executable code area. Introduce token setup and verify routines. Also introduce WRUSS, which is a kernel-mode instruction but writes directly to user shadow stack. In future patches that enable shadow stack to work with signals, the kernel will need something to denote the point in the stack where sigreturn may be called. This will prevent attackers calling sigreturn at arbitrary places in the stack, in order to help prevent SROP attacks. To do this, something that can only be written by the kernel needs to be placed on the shadow stack. This can be accomplished by setting bit 63 in the frame written to the shadow stack. Userspace return addresses can't have this bit set as it is in the kernel range. It also can't be a valid restore token. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-31-rick.p.edgecombe%40intel.com
2023-06-12 17:10:56 -07:00
static unsigned long get_user_shstk_addr(void)
{
unsigned long long ssp;
fpregs_lock_and_load();
rdmsrl(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return ssp;
}
#define SHSTK_DATA_BIT BIT(63)
static int put_shstk_data(u64 __user *addr, u64 data)
{
if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
return -EINVAL;
/*
* Mark the high bit so that the sigframe can't be processed as a
* return address.
*/
if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT))
return -EFAULT;
return 0;
}
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
unsigned long ldata;
if (unlikely(get_user(ldata, addr)))
return -EFAULT;
if (!(ldata & SHSTK_DATA_BIT))
return -EINVAL;
*data = ldata & ~SHSTK_DATA_BIT;
return 0;
}
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
static int shstk_push_sigframe(unsigned long *ssp)
{
unsigned long target_ssp = *ssp;
/* Token must be aligned */
if (!IS_ALIGNED(target_ssp, 8))
return -EINVAL;
*ssp -= SS_FRAME_SIZE;
if (put_shstk_data((void __user *)*ssp, target_ssp))
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
return -EFAULT;
return 0;
}
static int shstk_pop_sigframe(unsigned long *ssp)
{
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
struct vm_area_struct *vma;
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
unsigned long token_addr;
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
bool need_to_check_vma;
int err = 1;
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
/*
* It is possible for the SSP to be off the end of a shadow stack by 4
* or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
* before it, it might be this case, so check that the address being
* read is actually shadow stack.
*/
if (!IS_ALIGNED(*ssp, 8))
return -EINVAL;
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
if (need_to_check_vma)
mmap_read_lock_killable(current->mm);
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
if (unlikely(err))
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
goto out_err;
if (need_to_check_vma) {
vma = find_vma(current->mm, *ssp);
if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
err = -EFAULT;
goto out_err;
}
mmap_read_unlock(current->mm);
}
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
/* Restore SSP aligned? */
if (unlikely(!IS_ALIGNED(token_addr, 8)))
return -EINVAL;
/* SSP in userspace? */
if (unlikely(token_addr >= TASK_SIZE_MAX))
return -EINVAL;
*ssp = token_addr;
return 0;
x86/shstk: Check that signal frame is shadow stack mem The shadow stack signal frame is read by the kernel on sigreturn. It relies on shadow stack memory protections to prevent forgeries of this signal frame (which included the pre-signal SSP). This behavior helps userspace protect itself. However, using the INCSSP instruction userspace can adjust the SSP to 8 bytes beyond the end of a shadow stack. INCSSP performs shadow stack reads to make sure it doesn’t increment off of the shadow stack, but on the end position it actually reads 8 bytes below the new SSP. For the shadow stack HW operations, this situation (INCSSP off the end of a shadow stack by 8 bytes) would be fine. If the a RET is executed, the push to the shadow stack would fail to write to the shadow stack. If a CALL is executed, the SSP will be incremented back onto the stack and the return address will be written successfully to the very end. That is expected behavior around shadow stack underflow. However, the kernel doesn’t have a way to read shadow stack memory using shadow stack accesses. WRUSS can write to shadow stack memory with a shadow stack access which ensures the access is to shadow stack memory. But unfortunately for this case, there is no equivalent instruction for shadow stack reads. So when reading the shadow stack signal frames, the kernel currently assumes the SSP is pointing to the shadow stack and uses a normal read. The SSP pointing to shadow stack memory will be true in most cases, but as described above, in can be untrue by 8 bytes. So lookup the VMA of the shadow stack sigframe being read to verify it is shadow stack. Since the SSP can only be beyond the shadow stack by 8 bytes, and shadow stack memory is page aligned, this check only needs to be done when this type of relative position to a page boundary is encountered. So skip the extra work otherwise. Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Link: https://lore.kernel.org/all/20230613001108.3040476-34-rick.p.edgecombe%40intel.com
2023-06-12 17:10:59 -07:00
out_err:
if (need_to_check_vma)
mmap_read_unlock(current->mm);
return err;
x86/shstk: Handle signals for shadow stack When a signal is handled, the context is pushed to the stack before handling it. For shadow stacks, since the shadow stack only tracks return addresses, there isn't any state that needs to be pushed. However, there are still a few things that need to be done. These things are visible to userspace and which will be kernel ABI for shadow stacks. One is to make sure the restorer address is written to shadow stack, since the signal handler (if not changing ucontext) returns to the restorer, and the restorer calls sigreturn. So add the restorer on the shadow stack before handling the signal, so there is not a conflict when the signal handler returns to the restorer. The other thing to do is to place some type of checkable token on the thread's shadow stack before handling the signal and check it during sigreturn. This is an extra layer of protection to hamper attackers calling sigreturn manually as in SROP-like attacks. For this token the shadow stack data format defined earlier can be used. Have the data pushed be the previous SSP. In the future the sigreturn might want to return back to a different stack. Storing the SSP (instead of a restore offset or something) allows for future functionality that may want to restore to a different stack. So, when handling a signal push - the SSP pointing in the shadow stack data format - the restorer address below the restore token. In sigreturn, verify SSP is stored in the data format and pop the shadow stack. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-32-rick.p.edgecombe%40intel.com
2023-06-12 17:10:57 -07:00
}
int setup_signal_shadow_stack(struct ksignal *ksig)
{
void __user *restorer = ksig->ka.sa.sa_restorer;
unsigned long ssp;
int err;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
!features_enabled(ARCH_SHSTK_SHSTK))
return 0;
if (!restorer)
return -EINVAL;
ssp = get_user_shstk_addr();
if (unlikely(!ssp))
return -EINVAL;
err = shstk_push_sigframe(&ssp);
if (unlikely(err))
return err;
/* Push restorer address */
ssp -= SS_FRAME_SIZE;
err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
if (unlikely(err))
return -EFAULT;
fpregs_lock_and_load();
wrmsrl(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return 0;
}
int restore_signal_shadow_stack(void)
{
unsigned long ssp;
int err;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
!features_enabled(ARCH_SHSTK_SHSTK))
return 0;
ssp = get_user_shstk_addr();
if (unlikely(!ssp))
return -EINVAL;
err = shstk_pop_sigframe(&ssp);
if (unlikely(err))
return err;
fpregs_lock_and_load();
wrmsrl(MSR_IA32_PL3_SSP, ssp);
fpregs_unlock();
return 0;
}
void shstk_free(struct task_struct *tsk)
{
struct thread_shstk *shstk = &tsk->thread.shstk;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
!features_enabled(ARCH_SHSTK_SHSTK))
return;
x86/shstk: Handle thread shadow stack When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-06-12 17:10:55 -07:00
/*
* When fork() with CLONE_VM fails, the child (tsk) already has a
* shadow stack allocated, and exit_thread() calls this function to
* free it. In this case the parent (current) and the child share
* the same mm struct.
*/
if (!tsk->mm || tsk->mm != current->mm)
return;
x86/shstk: Handle vfork clone failure correctly Shadow stacks are allocated automatically and freed on exit, depending on the clone flags. The two cases where new shadow stacks are not allocated are !CLONE_VM (fork()) and CLONE_VFORK (vfork()). For !CLONE_VM, although a new stack is not allocated, it can be freed normally because it will happen in the child's copy of the VM. However, for CLONE_VFORK the parent and the child are actually using the same shadow stack. So the kernel doesn't need to allocate *or* free a shadow stack for a CLONE_VFORK child. CLONE_VFORK children already need special tracking to avoid returning to userspace until the child exits or execs. Shadow stack uses this same tracking to avoid freeing CLONE_VFORK shadow stacks. However, the tracking is not setup until the clone has succeeded (internally). Which means, if a CLONE_VFORK fails, the existing logic will not know it is a CLONE_VFORK and proceed to unmap the parents shadow stack. This error handling cleanup logic runs via exit_thread() in the bad_fork_cleanup_thread label in copy_process(). The issue was seen in the glibc test "posix/tst-spawn3-pidfd" while running with shadow stack using currently out-of-tree glibc patches. Fix it by not unmapping the vfork shadow stack in the error case as well. Since clone is implemented in core code, it is not ideal to pass the clone flags along the error path in order to have shadow stack code have symmetric logic in the freeing half of the thread shadow stack handling. Instead use the existing state for thread shadow stacks to track whether the thread is managing its own shadow stack. For CLONE_VFORK, simply set shstk->base and shstk->size to 0, and have it mean the thread is not managing a shadow stack and so should skip cleanup work. Implement this by breaking up the CLONE_VFORK and !CLONE_VM cases in shstk_alloc_thread_stack() to separate conditionals since, the logic is now different between them. In the case of CLONE_VFORK && !CLONE_VM, the existing behavior is to not clean up the shadow stack in the child (which should go away quickly with either be exit or exec), so maintain that behavior by handling the CLONE_VFORK case first in the allocation path. This new logioc cleanly handles the case of normal, successful CLONE_VFORK's skipping cleaning up their shadow stack's on exit as well. So remove the existing, vfork shadow stack freeing logic. This is in deactivate_mm() where vfork_done is used to tell if it is a vfork child that can skip cleaning up the thread shadow stack. Fixes: b2926a36b97a ("x86/shstk: Handle thread shadow stack") Reported-by: H.J. Lu <hjl.tools@gmail.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Tested-by: H.J. Lu <hjl.tools@gmail.com> Link: https://lore.kernel.org/all/20230908203655.543765-2-rick.p.edgecombe%40intel.com
2023-09-08 13:36:53 -07:00
/*
* If shstk->base is NULL, then this task is not managing its
* own shadow stack (CLONE_VFORK). So skip freeing it.
*/
if (!shstk->base)
return;
/*
* shstk->base is NULL for CLONE_VFORK child tasks, and so is
* normal. But size = 0 on a shstk->base is not normal and
* indicated an attempt to free the thread shadow stack twice.
* Warn about it.
*/
if (WARN_ON(!shstk->size))
return;
unmap_shadow_stack(shstk->base, shstk->size);
shstk->size = 0;
}
static int wrss_control(bool enable)
{
u64 msrval;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
return -EOPNOTSUPP;
/*
* Only enable WRSS if shadow stack is enabled. If shadow stack is not
* enabled, WRSS will already be disabled, so don't bother clearing it
* when disabling.
*/
if (!features_enabled(ARCH_SHSTK_SHSTK))
return -EPERM;
/* Already enabled/disabled? */
if (features_enabled(ARCH_SHSTK_WRSS) == enable)
return 0;
fpregs_lock_and_load();
rdmsrl(MSR_IA32_U_CET, msrval);
if (enable) {
features_set(ARCH_SHSTK_WRSS);
msrval |= CET_WRSS_EN;
} else {
features_clr(ARCH_SHSTK_WRSS);
if (!(msrval & CET_WRSS_EN))
goto unlock;
msrval &= ~CET_WRSS_EN;
}
wrmsrl(MSR_IA32_U_CET, msrval);
unlock:
fpregs_unlock();
return 0;
}
static int shstk_disable(void)
{
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
return -EOPNOTSUPP;
/* Already disabled? */
if (!features_enabled(ARCH_SHSTK_SHSTK))
return 0;
fpregs_lock_and_load();
/* Disable WRSS too when disabling shadow stack */
wrmsrl(MSR_IA32_U_CET, 0);
wrmsrl(MSR_IA32_PL3_SSP, 0);
fpregs_unlock();
shstk_free(current);
features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
return 0;
}
x86/shstk: Introduce map_shadow_stack syscall When operating with shadow stacks enabled, the kernel will automatically allocate shadow stacks for new threads, however in some cases userspace will need additional shadow stacks. The main example of this is the ucontext family of functions, which require userspace allocating and pivoting to userspace managed stacks. Unlike most other user memory permissions, shadow stacks need to be provisioned with special data in order to be useful. They need to be setup with a restore token so that userspace can pivot to them via the RSTORSSP instruction. But, the security design of shadow stacks is that they should not be written to except in limited circumstances. This presents a problem for userspace, as to how userspace can provision this special data, without allowing for the shadow stack to be generally writable. Previously, a new PROT_SHADOW_STACK was attempted, which could be mprotect()ed from RW permissions after the data was provisioned. This was found to not be secure enough, as other threads could write to the shadow stack during the writable window. The kernel can use a special instruction, WRUSS, to write directly to userspace shadow stacks. So the solution can be that memory can be mapped as shadow stack permissions from the beginning (never generally writable in userspace), and the kernel itself can write the restore token. First, a new madvise() flag was explored, which could operate on the PROT_SHADOW_STACK memory. This had a couple of downsides: 1. Extra checks were needed in mprotect() to prevent writable memory from ever becoming PROT_SHADOW_STACK. 2. Extra checks/vma state were needed in the new madvise() to prevent restore tokens being written into the middle of pre-used shadow stacks. It is ideal to prevent restore tokens being added at arbitrary locations, so the check was to make sure the shadow stack had never been written to. 3. It stood out from the rest of the madvise flags, as more of direct action than a hint at future desired behavior. So rather than repurpose two existing syscalls (mmap, madvise) that don't quite fit, just implement a new map_shadow_stack syscall to allow userspace to map and setup new shadow stacks in one step. While ucontext is the primary motivator, userspace may have other unforeseen reasons to setup its own shadow stacks using the WRSS instruction. Towards this provide a flag so that stacks can be optionally setup securely for the common case of ucontext without enabling WRSS. Or potentially have the kernel set up the shadow stack in some new way. The following example demonstrates how to create a new shadow stack with map_shadow_stack: void *shstk = map_shadow_stack(addr, stack_size, SHADOW_STACK_SET_TOKEN); Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-35-rick.p.edgecombe%40intel.com
2023-06-12 17:11:00 -07:00
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
bool set_tok = flags & SHADOW_STACK_SET_TOKEN;
unsigned long aligned_size;
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
return -EOPNOTSUPP;
if (flags & ~SHADOW_STACK_SET_TOKEN)
return -EINVAL;
/* If there isn't space for a token */
if (set_tok && size < 8)
return -ENOSPC;
if (addr && addr < SZ_4G)
return -ERANGE;
/*
* An overflow would result in attempting to write the restore token
* to the wrong location. Not catastrophic, but just return the right
* error code and block it.
*/
aligned_size = PAGE_ALIGN(size);
if (aligned_size < size)
return -EOVERFLOW;
return alloc_shstk(addr, aligned_size, size, set_tok);
}
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
{
unsigned long features = arg2;
if (option == ARCH_SHSTK_STATUS) {
return put_user(task->thread.features, (unsigned long __user *)arg2);
}
if (option == ARCH_SHSTK_LOCK) {
task->thread.features_locked |= features;
return 0;
}
/* Only allow via ptrace */
if (task != current) {
if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
task->thread.features_locked &= ~features;
return 0;
}
return -EINVAL;
}
/* Do not allow to change locked features */
if (features & task->thread.features_locked)
return -EPERM;
/* Only support enabling/disabling one feature at a time. */
if (hweight_long(features) > 1)
return -EINVAL;
if (option == ARCH_SHSTK_DISABLE) {
if (features & ARCH_SHSTK_WRSS)
return wrss_control(false);
if (features & ARCH_SHSTK_SHSTK)
return shstk_disable();
return -EINVAL;
}
/* Handle ARCH_SHSTK_ENABLE */
if (features & ARCH_SHSTK_SHSTK)
return shstk_setup();
if (features & ARCH_SHSTK_WRSS)
return wrss_control(true);
return -EINVAL;
}