From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
This patch modifies userfaultfd to support userswap. VM_USWAP is set in userfaultfd_register() and cleared in userfaultfd_unregister() and userfaultfd_release(). do_swap_page() calls do_uswap_page() to handle page faults on userswap swap entries. uswap_must_wait() is added so that userfaultfd_must_wait() can handle userswap-type userfaults.
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 32 +++++++++++++-- include/linux/userswap.h | 16 ++++++++ include/uapi/linux/userfaultfd.h | 2 + mm/memory.c | 5 +++ mm/userswap.c | 70 ++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 3 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..5d5d642a4686 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,7 @@ #include <linux/hugetlb.h> #include <linux/swapops.h> #include <linux/miscdevice.h> +#include <linux/userswap.h>
static int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -373,6 +374,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * ptes here. */ ptent = ptep_get(pte); +#ifdef CONFIG_USERSWAP + uswap_must_wait(reason, ptent, &ret); +#endif if (pte_none_mostly(ptent)) ret = true; if (!pte_write(ptent) && (reason & VM_UFFD_WP)) @@ -442,10 +446,14 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm);
/* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_BUG_ON(reason & ~(__VM_UFFD_FLAGS | VM_USWAP)); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ VM_BUG_ON(!reason || (reason & (reason - 1)));
+ if (IS_ENABLED(CONFIG_USERSWAP) && (reason == VM_UFFD_MISSING) && + (vma->vm_flags & VM_USWAP)) + reason |= VM_USWAP; + if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) @@ -520,6 +528,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, reason, ctx->features); +#ifdef CONFIG_USERSWAP + if ((reason & VM_USWAP) && pte_none(vmf->orig_pte)) + uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FIRST; +#endif uwq.ctx = ctx; uwq.waken = false;
@@ -921,7 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; continue; } - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1326,6 +1338,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; struct vma_iterator vmi; pgoff_t pgoff; +#ifdef CONFIG_USERSWAP + bool uswap_mode = false; +#endif
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1337,6 +1352,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!uffdio_register.mode) goto out; +#ifdef CONFIG_USERSWAP + if (!uswap_register(&uffdio_register, &uswap_mode)) + goto out; +#endif if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; @@ -1359,6 +1378,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, uffdio_register.range.len); if (ret) goto out; +#ifdef CONFIG_USERSWAP + if (unlikely(uswap_mode)) { + ret = -EINVAL; + if (!uswap_adjust_uffd_range(&uffdio_register, &vm_flags, mm)) + goto out; + } +#endif
start = uffdio_register.range.start; end = start + uffdio_register.range.len; @@ -1663,7 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (userfaultfd_wp(vma)) uffd_wp_range(vma, start, vma_end - start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, pgoff, diff --git a/include/linux/userswap.h b/include/linux/userswap.h index f8063185056c..bd6475259a9d 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -21,5 +21,21 @@ extern struct static_key_false userswap_enabled; unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len);
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode); + +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma); + +static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return; + if ((reason & VM_USWAP) && (!pte_present(pte))) + *ret = true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a3..c672bd90600b 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -152,6 +152,7 @@ struct uffd_msg { #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ #define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ +#define UFFD_PAGEFAULT_FLAG_FIRST (1<<10) /* USWAP first page fault */
struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -247,6 +248,7 @@ struct uffdio_register { #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) #define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) +#define UFFDIO_REGISTER_MODE_USWAP ((__u64)1<<10) __u64 mode;
/* diff --git a/mm/memory.c b/mm/memory.c index 862e14416027..6569c9e97c9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/userswap.h>
#include <trace/events/kmem.h>
@@ -3778,6 +3779,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out;
entry = pte_to_swp_entry(vmf->orig_pte); +#ifdef CONFIG_USERSWAP + if (is_userswap_entry(entry)) + return do_uswap_page(entry, vmf, vma); +#endif if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, diff --git a/mm/userswap.c b/mm/userswap.c index 56f7140d5335..4f798c8226a1 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -389,6 +389,76 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, return ret; }
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return true; + if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) + return true; + uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; + if (uffdio_register->mode != UFFDIO_REGISTER_MODE_MISSING) + return false; + *uswap_mode = true; + return true; +} + +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma which could reduce fragmentation. + */ +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma, *cur; + unsigned long end; + bool ret = false; + + VMA_ITERATOR(vmi, mm, uffdio_register->range.start); + + end = uffdio_register->range.start + uffdio_register->range.len - 1; + + mmap_read_lock(mm); + vma = find_vma(mm, uffdio_register->range.start); + if (!vma || vma->vm_start >= end) + goto out_unlock; + for_each_vma_range(vmi, cur, end) + if (!vma_uswap_compatible(cur)) + goto out_unlock; + + uffdio_register->range.start = vma->vm_start; + vma = find_vma(mm, end); + if (vma && end >= vma->vm_start) + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + + *vm_flags |= VM_USWAP; + + ret = true; +out_unlock: + mmap_read_unlock(mm); + return ret; +} + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma) +{ + const char *process_prefix = "uswap"; + + /* print error if we come across a nested fault */ + if (!strncmp(current->comm, process_prefix, strlen(process_prefix))) { + pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address, + current->comm); + return VM_FAULT_SIGBUS; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + pr_err("USWAP: addr %lx flags %lx is not a user swap page", + vmf->address, vma->vm_flags); + return VM_FAULT_SIGBUS; + } + + return handle_userfault(vmf, VM_UFFD_MISSING); +} + static int __init 
enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);