userfaultfd: protect mmap_changing with rw_sem in userfaultfd_ctx
Increments and loads to mmap_changing are always in mmap_lock critical
section. This ensures that if userspace requests event notification for
non-cooperative operations (e.g. mremap), userfaultfd operations don't
occur concurrently.

This can be achieved by using a separate read-write semaphore in
userfaultfd_ctx such that increments are done in write-mode and loads in
read-mode, thereby eliminating the dependency on mmap_lock for this
purpose.

This is a preparatory step before we replace mmap_lock usage with
per-vma locks in fill/move ioctls.

Link: https://lkml.kernel.org/r/20240215182756.3448972-3-lokeshgidra@google.com
Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
Reviewed-by: Mike Rapoport (IBM) <rppt@kernel.org>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Brian Geffon <bgeffon@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jann Horn <jannh@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nicolas Geoffray <ngeoffray@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tim Murray <timmurray@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
commit 5e4c24a57b (parent f91e6b41dd)
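In short: non-cooperative events take the new semaphore in write mode around the mmap_changing increment, while userfaultfd operations hold it in read mode for their whole duration. A minimal kernel-style sketch of that pairing, distilled by the editor from the hunks below (the two helper names are hypothetical; the primitives and the ctx fields are the ones the patch uses):

	/*
	 * Event side (fork, mremap, remove, unmap): write mode excludes all
	 * readers, so the counter cannot change under a running operation.
	 */
	static void uffd_event_raise(struct userfaultfd_ctx *ctx)
	{
		down_write(&ctx->map_changing_lock);
		atomic_inc(&ctx->mmap_changing);
		up_write(&ctx->map_changing_lock);
	}

	/*
	 * Operation side (fill/move/wp): read mode lets operations run
	 * concurrently with each other, but never with a pending event.
	 */
	static ssize_t uffd_op(struct userfaultfd_ctx *ctx)
	{
		ssize_t err = -EAGAIN;

		down_read(&ctx->map_changing_lock);
		if (likely(!atomic_read(&ctx->mmap_changing)))
			err = 0;	/* ... do the copy/zeropage/continue/move/wp work ... */
		up_read(&ctx->map_changing_lock);
		return err;
	}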
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -685,12 +685,15 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 		ctx->flags = octx->flags;
 		ctx->features = octx->features;
 		ctx->released = false;
+		init_rwsem(&ctx->map_changing_lock);
 		atomic_set(&ctx->mmap_changing, 0);
 		ctx->mm = vma->vm_mm;
 		mmgrab(ctx->mm);
 
 		userfaultfd_ctx_get(octx);
+		down_write(&octx->map_changing_lock);
 		atomic_inc(&octx->mmap_changing);
+		up_write(&octx->map_changing_lock);
 		fctx->orig = octx;
 		fctx->new = ctx;
 		list_add_tail(&fctx->list, fcs);
@@ -737,7 +740,9 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma,
 	if (ctx->features & UFFD_FEATURE_EVENT_REMAP) {
 		vm_ctx->ctx = ctx;
 		userfaultfd_ctx_get(ctx);
+		down_write(&ctx->map_changing_lock);
 		atomic_inc(&ctx->mmap_changing);
+		up_write(&ctx->map_changing_lock);
 	} else {
 		/* Drop uffd context if remap feature not enabled */
 		vma_start_write(vma);
@@ -783,7 +788,9 @@ bool userfaultfd_remove(struct vm_area_struct *vma,
 		return true;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	mmap_read_unlock(mm);
 
 	msg_init(&ewq.msg);
@@ -825,7 +832,9 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, unsigned long start,
 		return -ENOMEM;
 
 	userfaultfd_ctx_get(ctx);
+	down_write(&ctx->map_changing_lock);
 	atomic_inc(&ctx->mmap_changing);
+	up_write(&ctx->map_changing_lock);
 	unmap_ctx->ctx = ctx;
 	unmap_ctx->start = start;
 	unmap_ctx->end = end;
@@ -1709,9 +1718,8 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 		flags |= MFILL_ATOMIC_WP;
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
-					uffdio_copy.len, &ctx->mmap_changing,
-					flags);
+		ret = mfill_atomic_copy(ctx, uffdio_copy.dst, uffdio_copy.src,
+					uffdio_copy.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1761,9 +1769,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_zeropage(ctx->mm, uffdio_zeropage.range.start,
-					    uffdio_zeropage.range.len,
-					    &ctx->mmap_changing);
+		ret = mfill_atomic_zeropage(ctx, uffdio_zeropage.range.start,
+					    uffdio_zeropage.range.len);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1818,9 +1825,8 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
 		return -EINVAL;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
-					  uffdio_wp.range.len, mode_wp,
-					  &ctx->mmap_changing);
+		ret = mwriteprotect_range(ctx, uffdio_wp.range.start,
+					  uffdio_wp.range.len, mode_wp);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1870,9 +1876,8 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
 		flags |= MFILL_ATOMIC_WP;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_continue(ctx->mm, uffdio_continue.range.start,
-					    uffdio_continue.range.len,
-					    &ctx->mmap_changing, flags);
+		ret = mfill_atomic_continue(ctx, uffdio_continue.range.start,
+					    uffdio_continue.range.len, flags);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -1925,9 +1930,8 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long
 		goto out;
 
 	if (mmget_not_zero(ctx->mm)) {
-		ret = mfill_atomic_poison(ctx->mm, uffdio_poison.range.start,
-					  uffdio_poison.range.len,
-					  &ctx->mmap_changing, 0);
+		ret = mfill_atomic_poison(ctx, uffdio_poison.range.start,
+					  uffdio_poison.range.len, 0);
 		mmput(ctx->mm);
 	} else {
 		return -ESRCH;
@@ -2003,13 +2007,14 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx,
 	if (mmget_not_zero(mm)) {
 		mmap_read_lock(mm);
 
-		/* Re-check after taking mmap_lock */
+		/* Re-check after taking map_changing_lock */
+		down_read(&ctx->map_changing_lock);
 		if (likely(!atomic_read(&ctx->mmap_changing)))
 			ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
 					 uffdio_move.len, uffdio_move.mode);
 		else
 			ret = -EAGAIN;
-
+		up_read(&ctx->map_changing_lock);
 		mmap_read_unlock(mm);
 		mmput(mm);
 	} else {
@@ -2216,6 +2221,7 @@ static int new_userfaultfd(int flags)
 	ctx->flags = flags;
 	ctx->features = 0;
 	ctx->released = false;
+	init_rwsem(&ctx->map_changing_lock);
 	atomic_set(&ctx->mmap_changing, 0);
 	ctx->mm = current->mm;
 	/* prevent the mm struct to be freed */
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -69,6 +69,13 @@ struct userfaultfd_ctx {
 	unsigned int features;
 	/* released */
 	bool released;
+	/*
+	 * Prevents userfaultfd operations (fill/move/wp) from happening while
+	 * some non-cooperative event(s) is taking place. Increments are done
+	 * in write-mode. Whereas, userfaultfd operations, which includes
+	 * reading mmap_changing, is done under read-mode.
+	 */
+	struct rw_semaphore map_changing_lock;
 	/* memory mappings are changing because of non-cooperative event */
 	atomic_t mmap_changing;
 	/* mm with one ore more vmas attached to this userfaultfd_ctx */
@@ -113,22 +120,18 @@ extern int mfill_atomic_install_pte(pmd_t *dst_pmd,
 				    unsigned long dst_addr, struct page *page,
 				    bool newly_allocated, uffd_flags_t flags);
 
-extern ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+extern ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 				 unsigned long src_start, unsigned long len,
-				 atomic_t *mmap_changing, uffd_flags_t flags);
-extern ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm,
+				 uffd_flags_t flags);
+extern ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
 				     unsigned long dst_start,
-				     unsigned long len,
-				     atomic_t *mmap_changing);
-extern ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long dst_start,
-				     unsigned long len, atomic_t *mmap_changing,
-				     uffd_flags_t flags);
-extern ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
-				   unsigned long len, atomic_t *mmap_changing,
-				   uffd_flags_t flags);
-extern int mwriteprotect_range(struct mm_struct *dst_mm,
-			       unsigned long start, unsigned long len,
-			       bool enable_wp, atomic_t *mmap_changing);
+				     unsigned long len);
+extern ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long dst_start,
+				     unsigned long len, uffd_flags_t flags);
+extern ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+				   unsigned long len, uffd_flags_t flags);
+extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			       unsigned long len, bool enable_wp);
 extern long uffd_wp_range(struct vm_area_struct *vma,
 			  unsigned long start, unsigned long len, bool enable_wp);
 
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -353,11 +353,11 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
  * called with mmap_lock held, it will release mmap_lock before returning.
  */
 static __always_inline ssize_t mfill_atomic_hugetlb(
+					      struct userfaultfd_ctx *ctx,
 					      struct vm_area_struct *dst_vma,
 					      unsigned long dst_start,
 					      unsigned long src_start,
 					      unsigned long len,
-					      atomic_t *mmap_changing,
 					      uffd_flags_t flags)
 {
 	struct mm_struct *dst_mm = dst_vma->vm_mm;
@@ -379,6 +379,7 @@ static __always_inline ssize_t mfill_atomic_hugetlb(
 	 * feature is not supported.
 	 */
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_ZEROPAGE)) {
+		up_read(&ctx->map_changing_lock);
 		mmap_read_unlock(dst_mm);
 		return -EINVAL;
 	}
@@ -463,6 +464,7 @@ retry:
 		cond_resched();
 
 		if (unlikely(err == -ENOENT)) {
+			up_read(&ctx->map_changing_lock);
 			mmap_read_unlock(dst_mm);
 			BUG_ON(!folio);
 
@@ -473,12 +475,13 @@ retry:
 				goto out;
 			}
 			mmap_read_lock(dst_mm);
+			down_read(&ctx->map_changing_lock);
 			/*
 			 * If memory mappings are changing because of non-cooperative
 			 * operation (e.g. mremap) running in parallel, bail out and
 			 * request the user to retry later
 			 */
-			if (mmap_changing && atomic_read(mmap_changing)) {
+			if (atomic_read(&ctx->mmap_changing)) {
 				err = -EAGAIN;
 				break;
 			}
@@ -501,6 +504,7 @@ retry:
 	}
 
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 out:
 	if (folio)
@@ -512,11 +516,11 @@ out:
 }
 #else /* !CONFIG_HUGETLB_PAGE */
 /* fail at build time if gcc attempts to use this */
-extern ssize_t mfill_atomic_hugetlb(struct vm_area_struct *dst_vma,
+extern ssize_t mfill_atomic_hugetlb(struct userfaultfd_ctx *ctx,
+				    struct vm_area_struct *dst_vma,
 				    unsigned long dst_start,
 				    unsigned long src_start,
 				    unsigned long len,
-				    atomic_t *mmap_changing,
 				    uffd_flags_t flags);
 #endif /* CONFIG_HUGETLB_PAGE */
 
@@ -564,13 +568,13 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
 	return err;
 }
 
-static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
+static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
 					    unsigned long dst_start,
 					    unsigned long src_start,
 					    unsigned long len,
-					    atomic_t *mmap_changing,
 					    uffd_flags_t flags)
 {
+	struct mm_struct *dst_mm = ctx->mm;
 	struct vm_area_struct *dst_vma;
 	ssize_t err;
 	pmd_t *dst_pmd;
@@ -600,8 +604,9 @@ retry:
 	 * operation (e.g. mremap) running in parallel, bail out and
 	 * request the user to retry later
 	 */
+	down_read(&ctx->map_changing_lock);
 	err = -EAGAIN;
-	if (mmap_changing && atomic_read(mmap_changing))
+	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
 	/*
@@ -633,8 +638,8 @@ retry:
 	 * If this is a HUGETLB vma, pass off to appropriate routine
 	 */
 	if (is_vm_hugetlb_page(dst_vma))
-		return mfill_atomic_hugetlb(dst_vma, dst_start, src_start,
-					    len, mmap_changing, flags);
+		return mfill_atomic_hugetlb(ctx, dst_vma, dst_start,
+					    src_start, len, flags);
 
 	if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
 		goto out_unlock;
@@ -693,6 +698,7 @@ retry:
 		if (unlikely(err == -ENOENT)) {
 			void *kaddr;
 
+			up_read(&ctx->map_changing_lock);
 			mmap_read_unlock(dst_mm);
 			BUG_ON(!folio);
 
@@ -723,6 +729,7 @@ retry:
 	}
 
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 out:
 	if (folio)
@@ -733,34 +740,33 @@ out:
 	return copied ? copied : err;
 }
 
-ssize_t mfill_atomic_copy(struct mm_struct *dst_mm, unsigned long dst_start,
+ssize_t mfill_atomic_copy(struct userfaultfd_ctx *ctx, unsigned long dst_start,
 			  unsigned long src_start, unsigned long len,
-			  atomic_t *mmap_changing, uffd_flags_t flags)
+			  uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, dst_start, src_start, len, mmap_changing,
+	return mfill_atomic(ctx, dst_start, src_start, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_COPY));
 }
 
-ssize_t mfill_atomic_zeropage(struct mm_struct *dst_mm, unsigned long start,
-			      unsigned long len, atomic_t *mmap_changing)
+ssize_t mfill_atomic_zeropage(struct userfaultfd_ctx *ctx,
+			      unsigned long start,
+			      unsigned long len)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(0, MFILL_ATOMIC_ZEROPAGE));
 }
 
-ssize_t mfill_atomic_continue(struct mm_struct *dst_mm, unsigned long start,
-			      unsigned long len, atomic_t *mmap_changing,
-			      uffd_flags_t flags)
+ssize_t mfill_atomic_continue(struct userfaultfd_ctx *ctx, unsigned long start,
+			      unsigned long len, uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_CONTINUE));
 }
 
-ssize_t mfill_atomic_poison(struct mm_struct *dst_mm, unsigned long start,
-			    unsigned long len, atomic_t *mmap_changing,
-			    uffd_flags_t flags)
+ssize_t mfill_atomic_poison(struct userfaultfd_ctx *ctx, unsigned long start,
+			    unsigned long len, uffd_flags_t flags)
 {
-	return mfill_atomic(dst_mm, start, 0, len, mmap_changing,
+	return mfill_atomic(ctx, start, 0, len,
 			    uffd_flags_set_mode(flags, MFILL_ATOMIC_POISON));
 }
 
@@ -793,10 +799,10 @@ long uffd_wp_range(struct vm_area_struct *dst_vma,
 	return ret;
 }
 
-int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
-			unsigned long len, bool enable_wp,
-			atomic_t *mmap_changing)
+int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+			unsigned long len, bool enable_wp)
 {
+	struct mm_struct *dst_mm = ctx->mm;
 	unsigned long end = start + len;
 	unsigned long _start, _end;
 	struct vm_area_struct *dst_vma;
@@ -820,8 +826,9 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 	 * operation (e.g. mremap) running in parallel, bail out and
 	 * request the user to retry later
 	 */
+	down_read(&ctx->map_changing_lock);
 	err = -EAGAIN;
-	if (mmap_changing && atomic_read(mmap_changing))
+	if (atomic_read(&ctx->mmap_changing))
 		goto out_unlock;
 
 	err = -ENOENT;
@@ -850,6 +857,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
 		err = 0;
 	}
 out_unlock:
+	up_read(&ctx->map_changing_lock);
 	mmap_read_unlock(dst_mm);
 	return err;
 }
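One detail worth noting from the hunks above: map_changing_lock nests inside mmap_lock (mmap_read_lock() first, then down_read()), and every early-exit path drops them innermost-first (up_read() before mmap_read_unlock()). Per the changelog, this nesting is what later allows the fill/move ioctls to swap mmap_lock for per-VMA locks while keeping the semaphore as the sole guard against non-cooperative events.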