From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
Add a new UFFDIO_COPY mode, UFFDIO_COPY_MODE_DIRECT_MAP, to map physical
pages into the destination without copy_from_user(). We use
uswap_unmap_anon_page() to unmap an anonymous page and
uswap_map_anon_page() to map it back at src_addr when the operation must
be rolled back. We introduce mfill_atomic_pte_nocopy() to achieve zero
copy: it unmaps the physical page from src_addr and establishes the
mapping from dst_addr to that same page.
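For illustration only (not part of this patch): a minimal userspace
sketch of the new mode, assuming a userfaultfd that has already been
created and registered over the destination range, with page-aligned
addresses and length; uswap_direct_copy() is a hypothetical helper name:

    #include <linux/userfaultfd.h>
    #include <sys/ioctl.h>

    /* Remap the pages backing src into dst instead of copying them. */
    static int uswap_direct_copy(int uffd, unsigned long dst,
                                 unsigned long src, unsigned long len)
    {
            struct uffdio_copy copy = {
                    .dst  = dst,
                    .src  = src,
                    .len  = len,
                    .mode = UFFDIO_COPY_MODE_DIRECT_MAP,
            };

            /* On success copy.copy == len; on failure it holds -errno. */
            return ioctl(uffd, UFFDIO_COPY, &copy);
    }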
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
---
 fs/userfaultfd.c                 |  8 ++-
 include/linux/userfaultfd_k.h    |  5 ++
 include/linux/userswap.h         | 13 +++++
 include/uapi/linux/userfaultfd.h |  1 +
 mm/userfaultfd.c                 | 12 +++++
 mm/userswap.c                    | 85 ++++++++++++++++++++++++++++++++
 6 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 5d5d642a4686..207467a46e7a 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1797,10 +1797,16 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
 		goto out;
 
 	ret = -EINVAL;
-	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
+	if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE |
+				 UFFDIO_COPY_MODE_WP |
+				 (IS_ENABLED(CONFIG_USERSWAP) ?
+				  UFFDIO_COPY_MODE_DIRECT_MAP : 0)))
 		goto out;
 	if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP)
 		flags |= MFILL_ATOMIC_WP;
+	if (IS_ENABLED(CONFIG_USERSWAP) &&
+	    (uffdio_copy.mode & UFFDIO_COPY_MODE_DIRECT_MAP))
+		flags |= MFILL_ATOMIC_DIRECT_MAP;
 	if (mmget_not_zero(ctx->mm)) {
 		ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
 					uffdio_copy.len, &ctx->mmap_changing,
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index ac8c6854097c..9427d5fccf7b 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -47,6 +47,7 @@ enum mfill_atomic_mode {
 	MFILL_ATOMIC_ZEROPAGE,
 	MFILL_ATOMIC_CONTINUE,
 	MFILL_ATOMIC_POISON,
+	MFILL_ATOMIC_DIRECT_MAP,
 	NR_MFILL_ATOMIC_MODES,
 };
 
@@ -62,6 +63,10 @@ static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode expected)
 
 static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode)
 {
+	if (IS_ENABLED(CONFIG_USERSWAP) && (flags & MFILL_ATOMIC_DIRECT_MAP) &&
+	    uffd_flags_mode_is(mode, MFILL_ATOMIC_COPY))
+		mode = MFILL_ATOMIC_DIRECT_MAP;
+
 	flags &= ~MFILL_ATOMIC_MODE_MASK;
 	return flags | ((__force uffd_flags_t) mode);
 }
diff --git a/include/linux/userswap.h b/include/linux/userswap.h
index bd6475259a9d..10a7111e9129 100644
--- a/include/linux/userswap.h
+++ b/include/linux/userswap.h
@@ -29,6 +29,10 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register,
 vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf,
 			 struct vm_area_struct *vma);
 
+int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, pmd_t *dst_pmd,
+			    struct vm_area_struct *dst_vma,
+			    unsigned long dst_addr, unsigned long src_addr);
+
 static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret)
 {
 	if (!static_branch_unlikely(&userswap_enabled))
@@ -37,5 +41,14 @@ static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret)
 	*ret = true;
 }
 
+static inline bool uswap_check_copy(struct vm_area_struct *vma,
+				    uffd_flags_t flags)
+{
+	if (!!uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP) ^
+	    !!(vma->vm_flags & VM_USWAP))
+		return false;
+	return true;
+}
+
 #endif /* CONFIG_USERSWAP */
 #endif /* _LINUX_USERSWAP_H */
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index c672bd90600b..2e9cf89f441d 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -270,6 +270,7 @@ struct uffdio_copy {
 	 * according to the uffdio_register.ioctls.
 	 */
 #define UFFDIO_COPY_MODE_WP			((__u64)1<<1)
+#define UFFDIO_COPY_MODE_DIRECT_MAP		((__u64)1<<10)
 	__u64 mode;
 
 	/*
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 96d9eae5c7cc..32fa1a22c85a 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -15,6 +15,7 @@
 #include <linux/mmu_notifier.h>
 #include <linux/hugetlb.h>
 #include <linux/shmem_fs.h>
+#include <linux/userswap.h>
 #include <asm/tlbflush.h>
 #include <asm/tlb.h>
 #include "internal.h"
@@ -603,6 +604,10 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 		goto out_unlock;
 
 	err = -EINVAL;
+#ifdef CONFIG_USERSWAP
+	if (!uswap_check_copy(dst_vma, flags))
+		goto out_unlock;
+#endif
 	/*
 	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
 	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
@@ -675,6 +680,13 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm,
 		BUG_ON(pmd_none(*dst_pmd));
 		BUG_ON(pmd_trans_huge(*dst_pmd));
 
+#ifdef CONFIG_USERSWAP
+		if (static_branch_unlikely(&userswap_enabled) &&
+		    uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP))
+			err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma,
+						      dst_addr, src_addr);
+		else
+#endif
 		err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr,
 				       flags, &folio);
 		cond_resched();
diff --git a/mm/userswap.c b/mm/userswap.c
index 4f798c8226a1..18c99c2a0fc7 100644
--- a/mm/userswap.c
+++ b/mm/userswap.c
@@ -459,6 +459,91 @@ vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf,
 	return handle_userfault(vmf, VM_UFFD_MISSING);
 }
 
+int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd,
+			    struct vm_area_struct *dst_vma,
+			    unsigned long dst_addr, unsigned long src_addr)
+{
+	struct vm_area_struct *src_vma;
+	pte_t dst_pte, *pte, src_pte;
+	struct page *page;
+	spinlock_t *ptl;
+	pmd_t *src_pmd;
+	int ret;
+
+	src_vma = find_vma(mm, src_addr);
+	if (!src_vma || src_addr < src_vma->vm_start)
+		return -EINVAL;
+
+	if (!vma_uswap_compatible(src_vma))
+		return -EINVAL;
+
+	page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_DUMP);
+	if (IS_ERR_OR_NULL(page))
+		return -ENODEV;
+
+	ret = -ENXIO;
+	src_pmd = mm_find_pmd(mm, src_addr);
+	if (!src_pmd)
+		goto out_put_page;
+
+	if (!PageLRU(page))
+		lru_add_drain_all();
+
+	ret = -EBUSY;
+	if (page_mapcount(page) > 1 ||
+	    page_mapcount(page) + 1 != page_count(page))
+		goto out_put_page;
+
+	ret = uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd,
+				    &src_pte, false);
+	if (ret)
+		goto out_put_page;
+	if (dst_vma->vm_flags & VM_USWAP)
+		ClearPageDirty(page);
+	/*
+	 * The memory barrier inside __SetPageUptodate makes sure that
+	 * preceding stores to the page contents become visible before
+	 * the set_pte_at() write.
+	 */
+	__SetPageUptodate(page);
+
+	dst_pte = mk_pte(page, dst_vma->vm_page_prot);
+	if (dst_vma->vm_flags & VM_WRITE)
+		dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte));
+	if (dst_vma->vm_flags & VM_USWAP)
+		dst_pte = pte_mkclean(dst_pte);
+
+	pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl);
+	/*
+	 * Userspace may swap in a large area where part of the pages
+	 * have not been swapped out. Under concurrent swap-in, the PTE
+	 * may already be present; skip such pages (pte_present).
+	 * No other cases need handling except the first page fault
+	 * (pte_none) and a userswap-out entry (SWP_USERSWAP_ENTRY).
+	 */
+	if (pte_present(*pte) || (!pte_none(*pte) &&
+	    !is_userswap_entry(pte_to_swp_entry(*pte)))) {
+		pte_unmap_unlock(pte, ptl);
+		uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd,
+				    src_pte);
+		ret = -EEXIST;
+		goto out_put_page;
+	}
+
+	inc_mm_counter(mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, dst_vma, dst_addr);
+	set_pte_at(mm, dst_addr, pte, dst_pte);
+
+	/* No need to invalidate - it was non-present before */
+	update_mmu_cache(dst_vma, dst_addr, pte);
+	pte_unmap_unlock(pte, ptl);
+	ret = 0;
+
+out_put_page:
+	put_page(page);
+	return ret;
+}
+
 static int __init enable_userswap_setup(char *str)
 {
 	static_branch_enable(&userswap_enabled);
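Not part of the diff, but worth noting for callers:
mfill_atomic_pte_nocopy() returns -EEXIST when the destination PTE is
already present (the concurrent swap-in case described in the comment
above), so a large direct-map copy can complete partially. Below is a
sketch of one possible userspace retry loop, assuming the usual
uffdio_copy.copy progress reporting; page_size and the skip-on-EEXIST
policy are assumptions, not something this patch mandates:

    #include <errno.h>

    /*
     * Drive UFFDIO_COPY_MODE_DIRECT_MAP across a large range, resuming
     * after partial progress and stepping over pages that are already
     * mapped at the destination.
     */
    static int uswap_direct_copy_range(int uffd, unsigned long dst,
                                       unsigned long src, unsigned long len,
                                       unsigned long page_size)
    {
            while (len) {
                    struct uffdio_copy copy = {
                            .dst  = dst,
                            .src  = src,
                            .len  = len,
                            .mode = UFFDIO_COPY_MODE_DIRECT_MAP,
                    };

                    if (ioctl(uffd, UFFDIO_COPY, &copy) == 0)
                            return 0;       /* whole range remapped */

                    if (copy.copy > 0) {
                            /* Partial progress: resume past the mapped bytes. */
                            dst += copy.copy;
                            src += copy.copy;
                            len -= copy.copy;
                    } else if (errno == EEXIST) {
                            /* Destination page already present: skip it. */
                            dst += page_size;
                            src += page_size;
                            len -= page_size;
                    } else if (errno != EAGAIN) {
                            return -1;      /* hard error */
                    }
                    /* On bare EAGAIN, retry the same range. */
            }
            return 0;
    }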