ff474a78ce
Adding uretprobe syscall instead of trap to speed up return probe. At the moment the uretprobe setup/path is: - install entry uprobe - when the uprobe is hit, it overwrites probed function's return address on stack with address of the trampoline that contains breakpoint instruction - the breakpoint trap code handles the uretprobe consumers execution and jumps back to original return address This patch replaces the above trampoline's breakpoint instruction with new uretprobe syscall call. This syscall does exactly the same job as the trap with some more extra work: - syscall trampoline must save original value for rax/r11/rcx registers on stack - rax is set to syscall number and r11/rcx are changed and used by syscall instruction - the syscall code reads the original values of those registers and restores those values in task's pt_regs area - only caller from trampoline exposed in '[uprobes]' is allowed, the process will receive SIGILL signal otherwise Even with some extra work, using the uretprobes syscall shows speed improvement (compared to using standard breakpoint): On Intel (11th Gen Intel(R) Core(TM) i7-1165G7 @ 2.80GHz) current: uretprobe-nop : 1.498 ± 0.000M/s uretprobe-push : 1.448 ± 0.001M/s uretprobe-ret : 0.816 ± 0.001M/s with the fix: uretprobe-nop : 1.969 ± 0.002M/s < 31% speed up uretprobe-push : 1.910 ± 0.000M/s < 31% speed up uretprobe-ret : 0.934 ± 0.000M/s < 14% speed up On AMD (AMD Ryzen 7 5700U) current: uretprobe-nop : 0.778 ± 0.001M/s uretprobe-push : 0.744 ± 0.001M/s uretprobe-ret : 0.540 ± 0.001M/s with the fix: uretprobe-nop : 0.860 ± 0.001M/s < 10% speed up uretprobe-push : 0.818 ± 0.001M/s < 10% speed up uretprobe-ret : 0.578 ± 0.000M/s < 7% speed up The performance test spawns a thread that runs a loop which triggers uprobe with attached bpf program that increments the counter that gets printed in results above. The uprobe (and uretprobe) kind is determined by which instruction is being patched with breakpoint instruction.
That's also important for uretprobes, because uprobe is installed for each uretprobe. The performance test is part of bpf selftests: tools/testing/selftests/bpf/run_bench_uprobes.sh Note at the moment uretprobe syscall is supported only for native 64-bit process, compat process still uses standard breakpoint. Note that when shadow stack is enabled the uretprobe syscall returns via iret, which is slower than return via sysret, but won't cause the shadow stack violation. Link: https://lore.kernel.org/all/20240611112158.40795-4-jolsa@kernel.org/ Suggested-by: Andrii Nakryiko <andrii@kernel.org> Reviewed-by: Oleg Nesterov <oleg@redhat.com> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@kernel.org> Acked-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Jiri Olsa <jolsa@kernel.org> Signed-off-by: Masami Hiramatsu (Google) <mhiramat@kernel.org>
596 lines
13 KiB
C
596 lines
13 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* shstk.c - Intel shadow stack support
|
|
*
|
|
* Copyright (c) 2021, Intel Corporation.
|
|
* Yu-cheng Yu <yu-cheng.yu@intel.com>
|
|
*/
|
|
|
|
#include <linux/sched.h>
|
|
#include <linux/bitops.h>
|
|
#include <linux/types.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/mman.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/uaccess.h>
|
|
#include <linux/sched/signal.h>
|
|
#include <linux/compat.h>
|
|
#include <linux/sizes.h>
|
|
#include <linux/user.h>
|
|
#include <linux/syscalls.h>
|
|
#include <asm/msr.h>
|
|
#include <asm/fpu/xstate.h>
|
|
#include <asm/fpu/types.h>
|
|
#include <asm/shstk.h>
|
|
#include <asm/special_insns.h>
|
|
#include <asm/fpu/api.h>
|
|
#include <asm/prctl.h>
|
|
|
|
/* Size in bytes of one shadow stack slot; the SSP is stepped by this much per pushed entry. */
#define SS_FRAME_SIZE 8
|
|
|
|
static bool features_enabled(unsigned long features)
|
|
{
|
|
return current->thread.features & features;
|
|
}
|
|
|
|
static void features_set(unsigned long features)
|
|
{
|
|
current->thread.features |= features;
|
|
}
|
|
|
|
static void features_clr(unsigned long features)
|
|
{
|
|
current->thread.features &= ~features;
|
|
}
|
|
|
|
/*
|
|
* Create a restore token on the shadow stack. A token is always 8-byte
|
|
* and aligned to 8.
|
|
*/
|
|
static int create_rstor_token(unsigned long ssp, unsigned long *token_addr)
|
|
{
|
|
unsigned long addr;
|
|
|
|
/* Token must be aligned */
|
|
if (!IS_ALIGNED(ssp, 8))
|
|
return -EINVAL;
|
|
|
|
addr = ssp - SS_FRAME_SIZE;
|
|
|
|
/*
|
|
* SSP is aligned, so reserved bits and mode bit are a zero, just mark
|
|
* the token 64-bit.
|
|
*/
|
|
ssp |= BIT(0);
|
|
|
|
if (write_user_shstk_64((u64 __user *)addr, (u64)ssp))
|
|
return -EFAULT;
|
|
|
|
if (token_addr)
|
|
*token_addr = addr;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* VM_SHADOW_STACK will have a guard page. This helps userspace protect
|
|
* itself from attacks. The reasoning is as follows:
|
|
*
|
|
* The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The
|
|
* INCSSP instruction can increment the shadow stack pointer. It is the
|
|
* shadow stack analog of an instruction like:
|
|
*
|
|
* addq $0x80, %rsp
|
|
*
|
|
* However, there is one important difference between an ADD on %rsp
|
|
* and INCSSP. In addition to modifying SSP, INCSSP also reads from the
|
|
* memory of the first and last elements that were "popped". It can be
|
|
* thought of as acting like this:
|
|
*
|
|
* READ_ONCE(ssp); // read+discard top element on stack
|
|
* ssp += nr_to_pop * 8; // move the shadow stack
|
|
* READ_ONCE(ssp-8); // read+discard last popped stack element
|
|
*
|
|
* The maximum distance INCSSP can move the SSP is 2040 bytes, before
|
|
* it would read the memory. Therefore a single page gap will be enough
|
|
* to prevent any operation from shifting the SSP to an adjacent stack,
|
|
* since it would have to land in the gap at least once, causing a
|
|
* fault.
|
|
*/
|
|
static unsigned long alloc_shstk(unsigned long addr, unsigned long size,
|
|
unsigned long token_offset, bool set_res_tok)
|
|
{
|
|
int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G;
|
|
struct mm_struct *mm = current->mm;
|
|
unsigned long mapped_addr, unused;
|
|
|
|
if (addr)
|
|
flags |= MAP_FIXED_NOREPLACE;
|
|
|
|
mmap_write_lock(mm);
|
|
mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags,
|
|
VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL);
|
|
mmap_write_unlock(mm);
|
|
|
|
if (!set_res_tok || IS_ERR_VALUE(mapped_addr))
|
|
goto out;
|
|
|
|
if (create_rstor_token(mapped_addr + token_offset, NULL)) {
|
|
vm_munmap(mapped_addr, size);
|
|
return -EINVAL;
|
|
}
|
|
|
|
out:
|
|
return mapped_addr;
|
|
}
|
|
|
|
static unsigned long adjust_shstk_size(unsigned long size)
|
|
{
|
|
if (size)
|
|
return PAGE_ALIGN(size);
|
|
|
|
return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
|
|
}
|
|
|
|
/*
 * Unmap the shadow stack at [@base, @base + @size) and warn once on any
 * unexpected vm_munmap() failure.
 */
static void unmap_shadow_stack(u64 base, u64 size)
{
	int ret = vm_munmap(base, size);

	/*
	 * -EINTR means mmap_write_lock_killable() failed: the process is
	 * about to die and have its MM cleaned up, and this task should never
	 * make it back to userspace. Leaking a shadow stack is fine in that
	 * case, so stay silent.
	 *
	 * Any other vm_munmap() failure means either the system is out of
	 * memory or there is a bug.
	 */
	if (ret != -EINTR)
		WARN_ON_ONCE(ret);
}
|
|
|
|
static int shstk_setup(void)
|
|
{
|
|
struct thread_shstk *shstk = ¤t->thread.shstk;
|
|
unsigned long addr, size;
|
|
|
|
/* Already enabled */
|
|
if (features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
/* Also not supported for 32 bit */
|
|
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall())
|
|
return -EOPNOTSUPP;
|
|
|
|
size = adjust_shstk_size(0);
|
|
addr = alloc_shstk(0, size, 0, false);
|
|
if (IS_ERR_VALUE(addr))
|
|
return PTR_ERR((void *)addr);
|
|
|
|
fpregs_lock_and_load();
|
|
wrmsrl(MSR_IA32_PL3_SSP, addr + size);
|
|
wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
|
|
fpregs_unlock();
|
|
|
|
shstk->base = addr;
|
|
shstk->size = size;
|
|
features_set(ARCH_SHSTK_SHSTK);
|
|
|
|
return 0;
|
|
}
|
|
|
|
void reset_thread_features(void)
|
|
{
|
|
memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk));
|
|
current->thread.features = 0;
|
|
current->thread.features_locked = 0;
|
|
}
|
|
|
|
unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags,
|
|
unsigned long stack_size)
|
|
{
|
|
struct thread_shstk *shstk = &tsk->thread.shstk;
|
|
unsigned long addr, size;
|
|
|
|
/*
|
|
* If shadow stack is not enabled on the new thread, skip any
|
|
* switch to a new shadow stack.
|
|
*/
|
|
if (!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
/*
|
|
* For CLONE_VFORK the child will share the parents shadow stack.
|
|
* Make sure to clear the internal tracking of the thread shadow
|
|
* stack so the freeing logic run for child knows to leave it alone.
|
|
*/
|
|
if (clone_flags & CLONE_VFORK) {
|
|
shstk->base = 0;
|
|
shstk->size = 0;
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* For !CLONE_VM the child will use a copy of the parents shadow
|
|
* stack.
|
|
*/
|
|
if (!(clone_flags & CLONE_VM))
|
|
return 0;
|
|
|
|
size = adjust_shstk_size(stack_size);
|
|
addr = alloc_shstk(0, size, 0, false);
|
|
if (IS_ERR_VALUE(addr))
|
|
return addr;
|
|
|
|
shstk->base = addr;
|
|
shstk->size = size;
|
|
|
|
return addr + size;
|
|
}
|
|
|
|
static unsigned long get_user_shstk_addr(void)
|
|
{
|
|
unsigned long long ssp;
|
|
|
|
fpregs_lock_and_load();
|
|
|
|
rdmsrl(MSR_IA32_PL3_SSP, ssp);
|
|
|
|
fpregs_unlock();
|
|
|
|
return ssp;
|
|
}
|
|
|
|
/* Bit 63 tags a word stored by put_shstk_data() so it can't be processed as a return address. */
#define SHSTK_DATA_BIT BIT(63)
|
|
|
|
/*
 * Store @data on the user shadow stack at @addr, tagged with SHSTK_DATA_BIT.
 *
 * Return: 0 on success, -EINVAL (with a one-time warning) if @data already
 * has SHSTK_DATA_BIT set, -EFAULT if the shadow stack write fails.
 */
static int put_shstk_data(u64 __user *addr, u64 data)
{
	u64 tagged;

	if (WARN_ON_ONCE(data & SHSTK_DATA_BIT))
		return -EINVAL;

	/*
	 * Setting the high bit ensures the sigframe can't be processed as a
	 * return address.
	 */
	tagged = data | SHSTK_DATA_BIT;

	return write_user_shstk_64(addr, tagged) ? -EFAULT : 0;
}
|
|
|
|
/*
 * Fetch a tagged data word from user address @addr and strip the tag.
 *
 * Return: 0 with *@data set to the word minus SHSTK_DATA_BIT, -EFAULT if the
 * user read fails, -EINVAL if the word does not carry SHSTK_DATA_BIT.
 */
static int get_shstk_data(unsigned long *data, unsigned long __user *addr)
{
	unsigned long raw;

	if (unlikely(get_user(raw, addr)))
		return -EFAULT;

	/* Only words carrying the data tag are accepted. */
	if ((raw & SHSTK_DATA_BIT) == 0)
		return -EINVAL;

	*data = raw & ~SHSTK_DATA_BIT;

	return 0;
}
|
|
|
|
static int shstk_push_sigframe(unsigned long *ssp)
|
|
{
|
|
unsigned long target_ssp = *ssp;
|
|
|
|
/* Token must be aligned */
|
|
if (!IS_ALIGNED(target_ssp, 8))
|
|
return -EINVAL;
|
|
|
|
*ssp -= SS_FRAME_SIZE;
|
|
if (put_shstk_data((void __user *)*ssp, target_ssp))
|
|
return -EFAULT;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int shstk_pop_sigframe(unsigned long *ssp)
|
|
{
|
|
struct vm_area_struct *vma;
|
|
unsigned long token_addr;
|
|
bool need_to_check_vma;
|
|
int err = 1;
|
|
|
|
/*
|
|
* It is possible for the SSP to be off the end of a shadow stack by 4
|
|
* or 8 bytes. If the shadow stack is at the start of a page or 4 bytes
|
|
* before it, it might be this case, so check that the address being
|
|
* read is actually shadow stack.
|
|
*/
|
|
if (!IS_ALIGNED(*ssp, 8))
|
|
return -EINVAL;
|
|
|
|
need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp;
|
|
|
|
if (need_to_check_vma)
|
|
mmap_read_lock_killable(current->mm);
|
|
|
|
err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp);
|
|
if (unlikely(err))
|
|
goto out_err;
|
|
|
|
if (need_to_check_vma) {
|
|
vma = find_vma(current->mm, *ssp);
|
|
if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) {
|
|
err = -EFAULT;
|
|
goto out_err;
|
|
}
|
|
|
|
mmap_read_unlock(current->mm);
|
|
}
|
|
|
|
/* Restore SSP aligned? */
|
|
if (unlikely(!IS_ALIGNED(token_addr, 8)))
|
|
return -EINVAL;
|
|
|
|
/* SSP in userspace? */
|
|
if (unlikely(token_addr >= TASK_SIZE_MAX))
|
|
return -EINVAL;
|
|
|
|
*ssp = token_addr;
|
|
|
|
return 0;
|
|
out_err:
|
|
if (need_to_check_vma)
|
|
mmap_read_unlock(current->mm);
|
|
return err;
|
|
}
|
|
|
|
int setup_signal_shadow_stack(struct ksignal *ksig)
|
|
{
|
|
void __user *restorer = ksig->ka.sa.sa_restorer;
|
|
unsigned long ssp;
|
|
int err;
|
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
|
|
!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
if (!restorer)
|
|
return -EINVAL;
|
|
|
|
ssp = get_user_shstk_addr();
|
|
if (unlikely(!ssp))
|
|
return -EINVAL;
|
|
|
|
err = shstk_push_sigframe(&ssp);
|
|
if (unlikely(err))
|
|
return err;
|
|
|
|
/* Push restorer address */
|
|
ssp -= SS_FRAME_SIZE;
|
|
err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer);
|
|
if (unlikely(err))
|
|
return -EFAULT;
|
|
|
|
fpregs_lock_and_load();
|
|
wrmsrl(MSR_IA32_PL3_SSP, ssp);
|
|
fpregs_unlock();
|
|
|
|
return 0;
|
|
}
|
|
|
|
int restore_signal_shadow_stack(void)
|
|
{
|
|
unsigned long ssp;
|
|
int err;
|
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
|
|
!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
ssp = get_user_shstk_addr();
|
|
if (unlikely(!ssp))
|
|
return -EINVAL;
|
|
|
|
err = shstk_pop_sigframe(&ssp);
|
|
if (unlikely(err))
|
|
return err;
|
|
|
|
fpregs_lock_and_load();
|
|
wrmsrl(MSR_IA32_PL3_SSP, ssp);
|
|
fpregs_unlock();
|
|
|
|
return 0;
|
|
}
|
|
|
|
void shstk_free(struct task_struct *tsk)
|
|
{
|
|
struct thread_shstk *shstk = &tsk->thread.shstk;
|
|
|
|
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) ||
|
|
!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return;
|
|
|
|
/*
|
|
* When fork() with CLONE_VM fails, the child (tsk) already has a
|
|
* shadow stack allocated, and exit_thread() calls this function to
|
|
* free it. In this case the parent (current) and the child share
|
|
* the same mm struct.
|
|
*/
|
|
if (!tsk->mm || tsk->mm != current->mm)
|
|
return;
|
|
|
|
/*
|
|
* If shstk->base is NULL, then this task is not managing its
|
|
* own shadow stack (CLONE_VFORK). So skip freeing it.
|
|
*/
|
|
if (!shstk->base)
|
|
return;
|
|
|
|
/*
|
|
* shstk->base is NULL for CLONE_VFORK child tasks, and so is
|
|
* normal. But size = 0 on a shstk->base is not normal and
|
|
* indicated an attempt to free the thread shadow stack twice.
|
|
* Warn about it.
|
|
*/
|
|
if (WARN_ON(!shstk->size))
|
|
return;
|
|
|
|
unmap_shadow_stack(shstk->base, shstk->size);
|
|
|
|
shstk->size = 0;
|
|
}
|
|
|
|
/*
 * Enable or disable WRSS for the calling task.
 *
 * Updates ARCH_SHSTK_WRSS in the thread feature mask and the CET_WRSS_EN bit
 * of MSR_IA32_U_CET.
 *
 * Return: 0 on success or if already in the requested state, -EOPNOTSUPP if
 * the CPU feature is not enabled, -EPERM if shadow stack is not enabled.
 */
static int wrss_control(bool enable)
{
	u64 cet;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	/*
	 * WRSS may only be enabled while shadow stack is enabled. When shadow
	 * stack is off, WRSS is already off too, so there is no point in
	 * clearing it on a disable request.
	 */
	if (!features_enabled(ARCH_SHSTK_SHSTK))
		return -EPERM;

	/* Already enabled/disabled? */
	if (features_enabled(ARCH_SHSTK_WRSS) == enable)
		return 0;

	fpregs_lock_and_load();
	rdmsrl(MSR_IA32_U_CET, cet);

	if (enable) {
		features_set(ARCH_SHSTK_WRSS);
		wrmsrl(MSR_IA32_U_CET, cet | CET_WRSS_EN);
	} else {
		features_clr(ARCH_SHSTK_WRSS);
		/* Skip the MSR write when the bit is already clear. */
		if (cet & CET_WRSS_EN)
			wrmsrl(MSR_IA32_U_CET, cet & ~CET_WRSS_EN);
	}

	fpregs_unlock();

	return 0;
}
|
|
|
|
static int shstk_disable(void)
|
|
{
|
|
if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
|
|
return -EOPNOTSUPP;
|
|
|
|
/* Already disabled? */
|
|
if (!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
fpregs_lock_and_load();
|
|
/* Disable WRSS too when disabling shadow stack */
|
|
wrmsrl(MSR_IA32_U_CET, 0);
|
|
wrmsrl(MSR_IA32_PL3_SSP, 0);
|
|
fpregs_unlock();
|
|
|
|
shstk_free(current);
|
|
features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
 * map_shadow_stack() syscall: map a new shadow stack for userspace.
 *
 * @addr:  requested address, or 0 for no specific address
 * @size:  requested size; rounded up to a page multiple for the mapping
 * @flags: only SHADOW_STACK_SET_TOKEN is accepted
 *
 * Return: the result of alloc_shstk(), or -EOPNOTSUPP if the CPU feature is
 * not enabled, -EINVAL for unknown flags, -ENOSPC if a token is requested but
 * @size is below 8, -ERANGE if a non-zero @addr is below SZ_4G, -EOVERFLOW if
 * page-aligning @size wraps around.
 */
SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags)
{
	bool want_token = flags & SHADOW_STACK_SET_TOKEN;
	unsigned long map_len;

	if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK))
		return -EOPNOTSUPP;

	if (flags & ~SHADOW_STACK_SET_TOKEN)
		return -EINVAL;

	/* If there isn't space for a token */
	if (want_token && size < 8)
		return -ENOSPC;

	if (addr && addr < SZ_4G)
		return -ERANGE;

	/*
	 * If the alignment overflowed, the restore token would end up being
	 * written to the wrong location. Not catastrophic, but block it and
	 * return the right error code.
	 */
	map_len = PAGE_ALIGN(size);
	if (map_len < size)
		return -EOVERFLOW;

	/* The unaligned @size is the token offset within the mapping. */
	return alloc_shstk(addr, map_len, size, want_token);
}
|
|
|
|
long shstk_prctl(struct task_struct *task, int option, unsigned long arg2)
|
|
{
|
|
unsigned long features = arg2;
|
|
|
|
if (option == ARCH_SHSTK_STATUS) {
|
|
return put_user(task->thread.features, (unsigned long __user *)arg2);
|
|
}
|
|
|
|
if (option == ARCH_SHSTK_LOCK) {
|
|
task->thread.features_locked |= features;
|
|
return 0;
|
|
}
|
|
|
|
/* Only allow via ptrace */
|
|
if (task != current) {
|
|
if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) {
|
|
task->thread.features_locked &= ~features;
|
|
return 0;
|
|
}
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* Do not allow to change locked features */
|
|
if (features & task->thread.features_locked)
|
|
return -EPERM;
|
|
|
|
/* Only support enabling/disabling one feature at a time. */
|
|
if (hweight_long(features) > 1)
|
|
return -EINVAL;
|
|
|
|
if (option == ARCH_SHSTK_DISABLE) {
|
|
if (features & ARCH_SHSTK_WRSS)
|
|
return wrss_control(false);
|
|
if (features & ARCH_SHSTK_SHSTK)
|
|
return shstk_disable();
|
|
return -EINVAL;
|
|
}
|
|
|
|
/* Handle ARCH_SHSTK_ENABLE */
|
|
if (features & ARCH_SHSTK_SHSTK)
|
|
return shstk_setup();
|
|
if (features & ARCH_SHSTK_WRSS)
|
|
return wrss_control(true);
|
|
return -EINVAL;
|
|
}
|
|
|
|
int shstk_update_last_frame(unsigned long val)
|
|
{
|
|
unsigned long ssp;
|
|
|
|
if (!features_enabled(ARCH_SHSTK_SHSTK))
|
|
return 0;
|
|
|
|
ssp = get_user_shstk_addr();
|
|
return write_user_shstk_64((u64 __user *)ssp, (u64)val);
|
|
}
|
|
|
|
bool shstk_is_enabled(void)
|
|
{
|
|
return features_enabled(ARCH_SHSTK_SHSTK);
|
|
}
|