From: ZhangPeng <zhangpeng362@huawei.com>
This patch series adds support for the userswap feature, including registration, unregistration, swap-out and swap-in.
The userswap feature depends on CONFIG_USERSWAP and can be enabled with the enable_userswap kernel command-line option.
We tested the concurrent scenario of multi-threaded page faults and multi-threaded swap-in with the uswap demo; both the remapping in the swap-out phase and the copy-free path in the swap-in phase worked correctly. During the tests, the related debugging options, including CONFIG_DEBUG_VM, lockdep, SLUB debug, KASAN and kmemleak, were enabled.
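For orientation, the expected userspace flow is roughly the following sketch (names such as uffd, reg, copy, old and buf are illustrative placeholders; each step is shown in more detail in the corresponding patch):

	/* 1. register the region for userswap (patch 4) */
	uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	ioctl(uffd, UFFDIO_REGISTER, &reg);   /* MODE_MISSING | MODE_USWAP */

	/* 2. swap-out (patch 3): move the pages into the swapper's buffer,
	 *    leaving SWP_USERSWAP_ENTRY PTEs behind at the old address */
	syscall(__NR_mremap, old, len, len, MREMAP_USWAP_SET_PTE, buf);

	/* 3. swap-in (patches 4-6): on a userfault, map the restored page
	 *    back without copy_from_user() */
	ioctl(uffd, UFFDIO_COPY, &copy);      /* UFFDIO_COPY_MODE_DIRECT_MAP */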
ZhangPeng (6): mm/userswap: add VM_USWAP and SWP_USERSWAP_ENTRY mm/userswap: add enable_userswap boot option mm/userswap: introduce MREMAP_USWAP_SET_PTE mm/userswap: support userswap via userfaultfd mm/userswap: introduce UFFDIO_COPY_MODE_DIRECT_MAP mm/userswap: provide cpu info in userfault msg
fs/proc/task_mmu.c | 3 + fs/userfaultfd.c | 43 ++- include/linux/mm.h | 8 + include/linux/swap.h | 14 +- include/linux/swapops.h | 16 + include/linux/userfaultfd_k.h | 5 + include/linux/userswap.h | 60 ++++ include/trace/events/mmflags.h | 7 + include/uapi/linux/mman.h | 1 + include/uapi/linux/userfaultfd.h | 3 + mm/Kconfig | 10 + mm/Makefile | 1 + mm/memory.c | 8 + mm/mremap.c | 8 +- mm/userfaultfd.c | 12 + mm/userswap.c | 552 +++++++++++++++++++++++++++++++ 16 files changed, 745 insertions(+), 6 deletions(-) create mode 100644 include/linux/userswap.h create mode 100644 mm/userswap.c
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
VM_USWAP is set in vma->vm_flags to tell the common VM code that userswap is registered on the VMA. SWP_USERSWAP_ENTRY is the swap entry type used when userswap memory is swapped out. In addition, is_userswap_entry() is introduced to determine whether an entry is a userswap swap entry. Handle the userswap entry case in zap_pte_range() so we do not hit WARN_ON_ONCE(1).
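For illustration, the swap-out path added later in this series builds the entry from the page's PFN, and callers only need the new predicate to recognise it (sketch only; 'page' and 'ptent' are placeholders):

	swp_entry_t entry = swp_entry(SWP_USERSWAP_ENTRY, page_to_pfn(page));
	pte_t swp_pte = swp_entry_to_pte(entry);  /* installed at the old address */

	/* later, e.g. when zapping or faulting on such a PTE */
	if (is_userswap_entry(pte_to_swp_entry(ptent))) {
		/* userswap entry: zapped like a COW page here, or handed
		 * to userspace via userfaultfd in the fault path */
	}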
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/proc/task_mmu.c | 3 +++ include/linux/mm.h | 7 +++++++ include/linux/swap.h | 14 +++++++++++++- include/linux/swapops.h | 12 ++++++++++++ include/trace/events/mmflags.h | 7 +++++++ mm/Kconfig | 10 ++++++++++ mm/memory.c | 3 +++ 7 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..fe12b057d077 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -700,6 +700,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif +#ifdef CONFIG_USERSWAP + [ilog2(VM_USWAP)] = "us", +#endif /* CONFIG_USERSWAP */ }; size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cf86b56aba5..3592fabc8507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -313,6 +313,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#ifdef CONFIG_USERSWAP +# define VM_USWAP_BIT 62 +#define VM_USWAP BIT(VM_USWAP_BIT) +#else /* !CONFIG_USERSWAP */ +#define VM_USWAP VM_NONE +#endif /* CONFIG_USERSWAP */ + #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f6dd6575b905..fe20c462fecb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,18 @@ static inline int current_is_kswapd(void) * actions on faults. */
+/* + * Userswap entry type + */ +#ifdef CONFIG_USERSWAP +#define SWP_USERSWAP_NUM 1 +#define SWP_USERSWAP_ENTRY (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ + SWP_PTE_MARKER_NUM) +#else +#define SWP_USERSWAP_NUM 0 +#endif + /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be @@ -117,7 +129,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM) + SWP_PTE_MARKER_NUM - SWP_USERSWAP_NUM)
/* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0..6b4ed6bfb67c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -455,6 +455,18 @@ static inline int pte_none_mostly(pte_t pte) return pte_none(pte) || is_pte_marker(pte); }
+#ifdef CONFIG_USERSWAP +static inline int is_userswap_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_USERSWAP_ENTRY); +} +#else +static inline int is_userswap_entry(swp_entry_t entry) +{ + return 0; +} +#endif + static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 1478b9dd05fa..18d30581137a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -164,6 +164,12 @@ IF_HAVE_PG_ARCH_X(arch_3) # define IF_HAVE_UFFD_MINOR(flag, name) #endif
+#ifdef CONFIG_USERSWAP +#define IF_HAVE_VM_USWAP(flag, name) {flag, name }, +#else +#define IF_HAVE_VM_USWAP(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -196,6 +202,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ {VM_HUGEPAGE, "hugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \ +IF_HAVE_VM_USWAP(VM_USWAP, "userswap" ) \ {VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \ diff --git a/mm/Kconfig b/mm/Kconfig index ece4f2847e2b..ca35151fe561 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1227,6 +1227,16 @@ config PTE_MARKER_UFFD_WP purposes. It is required to enable userfaultfd write protection on file-backed memory types like shmem and hugetlbfs.
+config USERSWAP + bool "Enable User Swap" + depends on MMU && USERFAULTFD + depends on X86 || ARM64 + default n + + help + Support for User Swap. This is based on userfaultfd. We can implement + our own swapout and swapin functions in userspace. + # multi-gen LRU { config LRU_GEN bool "Multi-Gen LRU" diff --git a/mm/memory.c b/mm/memory.c index e1a0eb8b776a..862e14416027 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1516,6 +1516,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) continue; + } else if (is_userswap_entry(entry)) { + if (!should_zap_cows(details)) + continue; } else { /* We should have covered all the swap entry types */ WARN_ON_ONCE(1);
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
Add the enable_userswap boot option to enable the userswap feature, and add a static key, userswap_enabled, to indicate whether the feature is enabled.
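For example, with CONFIG_USERSWAP=y the feature would be activated by appending the option to the kernel command line (illustrative bootloader entry):

	linux /boot/vmlinuz root=/dev/sda1 ... enable_userswap

Kernel paths then gate on static_branch_unlikely(&userswap_enabled), so when the option is absent the cost is a single patched-out branch.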
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- include/linux/swapops.h | 4 ++++ mm/Makefile | 1 + mm/userswap.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 mm/userswap.c
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6b4ed6bfb67c..4a7e53612fdb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -456,8 +456,12 @@ static inline int pte_none_mostly(pte_t pte) }
#ifdef CONFIG_USERSWAP +extern struct static_key_false userswap_enabled; + static inline int is_userswap_entry(swp_entry_t entry) { + if (!static_branch_unlikely(&userswap_enabled)) + return 0; return unlikely(swp_type(entry) == SWP_USERSWAP_ENTRY); } #else diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..02b79ca7d0e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,6 +122,7 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_USERSWAP) += userswap.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/userswap.c b/mm/userswap.c new file mode 100644 index 000000000000..a2f180b4457f --- /dev/null +++ b/mm/userswap.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * userswap core file + */ + +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(userswap_enabled); + +static int __init enable_userswap_setup(char *str) +{ + static_branch_enable(&userswap_enabled); + return 1; +} +__setup("enable_userswap", enable_userswap_setup);
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
We introduce MREMAP_USWAP_SET_PTE to implement the remapping step of the swap-out phase: unmap the pages in 'addr ~ addr + old_len' and remap them to 'new_addr ~ new_addr + new_len'. During unmapping, the PTEs in the old range are set to SWP_USERSWAP_ENTRY swap entries.
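From userspace, the swap-out step might look like the sketch below. The wrapper name is ours, the flag value mirrors this patch's uapi change (it is not in distro headers yet), and a raw syscall is used so the fifth argument is passed through regardless of how the libc mremap() wrapper handles unknown flags:

	#include <sys/mman.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef MREMAP_USWAP_SET_PTE
	#define MREMAP_USWAP_SET_PTE	64	/* from this patch's uapi change */
	#endif

	/*
	 * Move 'len' bytes of a userswap-registered region at 'old' into the
	 * swapper's staging area at 'new'; the old PTEs become
	 * SWP_USERSWAP_ENTRY entries.
	 */
	static void *uswap_out(void *old, void *new, size_t len)
	{
		long ret = syscall(SYS_mremap, old, len, len,
				   MREMAP_USWAP_SET_PTE, new);

		/* bit 0 of the returned address reports whether the pages were dirty */
		return (void *)ret;
	}

Both addresses must be page-aligned, the two lengths must match after page alignment, and the old and new ranges must not overlap; otherwise the call fails with -EINVAL.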
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- include/linux/mm.h | 1 + include/linux/userswap.h | 25 +++ include/uapi/linux/mman.h | 1 + mm/mremap.c | 8 +- mm/userswap.c | 380 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 include/linux/userswap.h
diff --git a/include/linux/mm.h b/include/linux/mm.h index 3592fabc8507..cea31e552614 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2486,6 +2486,7 @@ int set_page_dirty_lock(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
+extern pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, diff --git a/include/linux/userswap.h b/include/linux/userswap.h new file mode 100644 index 000000000000..f8063185056c --- /dev/null +++ b/include/linux/userswap.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + */ + +#ifndef _LINUX_USERSWAP_H +#define _LINUX_USERSWAP_H + +#include <linux/mman.h> + +#ifdef CONFIG_USERSWAP + +extern struct static_key_false userswap_enabled; + +/* + * In uswap situation, we use the bit 0 of the returned address to indicate + * whether the pages are dirty. + */ +#define USWAP_PAGES_DIRTY 1 + +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len); + +#endif /* CONFIG_USERSWAP */ +#endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index a246e11988d5..3984f2133906 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -9,6 +9,7 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 #define MREMAP_DONTUNMAP 4 +#define MREMAP_USWAP_SET_PTE 64
#define OVERCOMMIT_GUESS 0 #define OVERCOMMIT_ALWAYS 1 diff --git a/mm/mremap.c b/mm/mremap.c index 382e81c33fc4..f4bd0fcf071b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> +#include <linux/userswap.h>
#include <asm/cacheflush.h> #include <asm/tlb.h> @@ -32,7 +33,7 @@
#include "internal.h"
-static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) +pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; @@ -930,6 +931,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, */ addr = untagged_addr(addr);
+#ifdef CONFIG_USERSWAP + if (flags == MREMAP_USWAP_SET_PTE) + return uswap_mremap(addr, old_len, new_addr, new_len); +#endif + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return ret;
diff --git a/mm/userswap.c b/mm/userswap.c index a2f180b4457f..56f7140d5335 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -5,10 +5,390 @@ * userswap core file */
+#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/hugetlb.h> +#include <linux/userswap.h> + #include "internal.h"
DEFINE_STATIC_KEY_FALSE(userswap_enabled);
+static bool vma_uswap_compatible(struct vm_area_struct *vma) +{ + if (!vma || !vma_is_anonymous(vma) || vma->vm_file || + (vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | VM_IO | + VM_PFNMAP | VM_HUGETLB))) + return false; + return true; +} + +/* + * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get + * the reference of the pages and return the pages through input parameters + * 'ppages'. + */ +static unsigned long pages_can_be_swapped(struct mm_struct *mm, + unsigned long addr, + unsigned long len, + struct page ***ppages) +{ + struct vm_area_struct *vma; + struct page *page = NULL; + struct page **pages = NULL; + unsigned long addr_end = addr + len; + unsigned long ret; + unsigned long i, page_num = 0; + *ppages = NULL; + + pages = kvzalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + while (addr < addr_end) { + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || + !(vma->vm_flags & VM_USWAP) || + !vma_uswap_compatible(vma)) { + ret = -EINVAL; + goto out_err; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + ret = -EAGAIN; + goto out_err; + } +get_again: + /* + * follow_page will inc page ref, dec the ref after we remap + * the page. + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) { + ret = -ENODEV; + goto out_err; + } + + pages[page_num++] = page; + if (!PageAnon(page) || !PageSwapBacked(page) || + PageHuge(page) || PageSwapCache(page)) { + ret = -EINVAL; + goto out_err; + } + + if (PageTransCompound(page)) { + if (trylock_page(page)) { + if (!split_huge_page(page)) { + unlock_page(page); + put_page(page); + page_num--; + goto get_again; + } else { + unlock_page(page); + } + } + ret = -EINVAL; + goto out_err; + } + + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) > 1 || + page_mapcount(page) + 1 != page_count(page)) { + ret = -EBUSY; + goto out_err; + } + addr += PAGE_SIZE; + } + + *ppages = pages; + return 0; + +out_err: + for (i = 0; i < page_num; i++) + put_page(pages[i]); + kvfree(pages); + return ret; +} + +static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return false; + else if (pud_huge(*pud)) + return true; + + pmd = pmd_offset(pud, addr); + if (!pmd) + return false; + else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd)) + return true; + + return false; +} + +static int uswap_unmap_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, struct page *page, + pmd_t *pmd, pte_t *old_pte, bool set_to_swp) +{ + struct mmu_notifier_range range; + spinlock_t *ptl; + pte_t *pte, _old_pte; + int ret = 0; + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, addr, + addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_present(*pte)) { + ret = -EINVAL; + goto out_release_unlock; + } + flush_cache_page(vma, addr, pte_pfn(*pte)); + _old_pte = ptep_clear_flush(vma, addr, pte); + if (old_pte) + *old_pte = _old_pte; + if (set_to_swp) + set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( + SWP_USERSWAP_ENTRY, page_to_pfn(page)))); + + dec_mm_counter(mm, MM_ANONPAGES); + page_remove_rmap(page, vma, false); + page->mapping = NULL; + +out_release_unlock: + pte_unmap_unlock(pte, ptl); + mmu_notifier_invalidate_range_end(&range); + return ret; +} + +static unsigned long vm_insert_anon_page(struct 
vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + pte_t *pte, dst_pte; + spinlock_t *ptl; + + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + ret = -EBUSY; + goto out_unlock; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr); + dst_pte = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); + set_pte_at(mm, addr, pte, dst_pte); + +out_unlock: + pte_unmap_unlock(pte, ptl); + return ret; +} + +static void uswap_map_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + struct page *page, + pmd_t *pmd, + pte_t old_pte) +{ + spinlock_t *ptl; + pte_t *pte; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_cache_page(vma, addr, pte_pfn(*pte)); + set_pte_at(mm, addr, pte, old_pte); + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr); + pte_unmap_unlock(pte, ptl); +} + +static void uswapout_recover(struct mm_struct *mm, + unsigned long old_addr_start, unsigned long len, + struct page **pages, unsigned long new_addr_start, + pte_t *ptes) +{ + unsigned long unmap_old_addr = old_addr_start; + unsigned long unmap_new_addr = new_addr_start; + struct page *page; + pmd_t *old_pmd, *new_pmd; + pte_t pte; + unsigned long i; + + for (i = 0; i < len; i++) { + page = pages[i]; + pte = ptes[i]; + new_pmd = mm_find_pmd(mm, new_addr_start); + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr), + unmap_new_addr, page, new_pmd, NULL, + false); + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + unmap_old_addr += PAGE_SIZE; + unmap_new_addr += PAGE_SIZE; + } + if (pte_val(ptes[len]) != 0) { + page = pages[len]; + pte = ptes[len]; + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + get_page(page); + } +} + +/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ +static unsigned long do_user_swap(struct mm_struct *mm, + unsigned long old_addr_start, + unsigned long len, struct page **pages, + unsigned long new_addr_start) +{ + struct vm_area_struct *old_vma, *new_vma; + unsigned long old_addr = old_addr_start; + unsigned long new_addr = new_addr_start; + struct page *page; + pmd_t *pmd; + pte_t old_pte, *ptes; + bool pages_dirty = false; + unsigned long i = 0, j; + int ret; + + ptes = kvzalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL); + if (!ptes) + return -ENOMEM; + lru_add_drain(); + for (j = 0; j < len; j += PAGE_SIZE) { + page = pages[i]; + ret = -EINVAL; + if (!page) + goto out_recover; + if (is_thp_or_huge(mm, new_addr)) + goto out_recover; + old_vma = find_vma(mm, old_addr); + if (!old_vma || old_addr < old_vma->vm_start) + goto out_recover; + new_vma = find_vma(mm, new_addr); + if (!new_vma || new_addr < new_vma->vm_start) + goto out_recover; + if (!vma_uswap_compatible(new_vma)) + goto out_recover; + + ret = -EACCES; + if (!(old_vma->vm_flags & VM_WRITE) && + (new_vma->vm_flags & VM_WRITE)) + goto out_recover; + + ret = -ENXIO; + pmd = mm_find_pmd(mm, old_addr); + if (!pmd) + goto out_recover; + ret = uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd, + &old_pte, true); + if (ret) + goto out_recover; + ptes[i] = old_pte; + if 
(pte_dirty(old_pte) || PageDirty(page)) + pages_dirty = true; + put_page(page); + + ret = vm_insert_anon_page(new_vma, new_addr, page); + if (ret) + goto out_recover; + get_page(page); + + old_addr += PAGE_SIZE; + new_addr += PAGE_SIZE; + i++; + } + + if (pages_dirty) + new_addr_start = new_addr_start | USWAP_PAGES_DIRTY; + kvfree(ptes); + return new_addr_start; + +out_recover: + uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes); + kvfree(ptes); + return ret; +} + + +/* + * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall + * mremap. + * Unmap the pages between 'addr ~ addr + old_len' and remap them to 'new_addr + * ~ new_addr + new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY. + */ +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len) +{ + struct page **pages = NULL; + struct mm_struct *mm = current->mm; + unsigned long len = old_len; + unsigned long ret = -EINVAL; + unsigned long i; + + if (!static_branch_unlikely(&userswap_enabled)) + goto out; + + if (offset_in_page(old_addr)) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + if (!new_len || old_len != new_len || offset_in_page(new_addr)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len || + old_addr > TASK_SIZE - old_len) + goto out; + + /* Ensure the old/new locations do not overlap */ + if (old_addr + old_len > new_addr && new_addr + new_len > old_addr) + goto out; + + lru_add_drain_all(); + mmap_write_lock(mm); + ret = pages_can_be_swapped(mm, old_addr, len, &pages); + if (ret) + goto out_release_unlock; + + ret = do_user_swap(mm, old_addr, len, pages, new_addr); + /* follow_page() above increased the reference */ + for (i = 0; i < len / PAGE_SIZE; i++) + if (pages[i]) + put_page(pages[i]); + + kvfree(pages); + +out_release_unlock: + mmap_write_unlock(mm); +out: + return ret; +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
This patch modifies userfaultfd to support userswap. VM_USWAP is set in userfaultfd_register() and cleared in userfaultfd_unregister() and userfaultfd_release(). Use do_uswap_page() to handle page faults on userswap swap entries in do_swap_page(), and add uswap_must_wait() to handle userswap-type userfaults in userfaultfd_must_wait().
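From userspace, registration might look like the sketch below (error handling omitted; the wrapper name is ours and the USWAP mode define mirrors this patch's uapi addition, since it is not in distro headers):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	#ifndef UFFDIO_REGISTER_MODE_USWAP
	#define UFFDIO_REGISTER_MODE_USWAP	((__u64)1 << 3)
	#endif

	/* Register [addr, addr + len) for userswap and return the uffd. */
	static int uswap_register_range(void *addr, size_t len)
	{
		int uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
		struct uffdio_api api = { .api = UFFD_API };
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			/* USWAP is only accepted together with MISSING */
			.mode  = UFFDIO_REGISTER_MODE_MISSING |
				 UFFDIO_REGISTER_MODE_USWAP,
		};

		ioctl(uffd, UFFDIO_API, &api);
		ioctl(uffd, UFFDIO_REGISTER, &reg);
		/* the kernel may widen reg.range to cover whole VMAs,
		 * see uswap_adjust_uffd_range() below */
		return uffd;
	}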
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 32 +++++++++++++-- include/linux/userswap.h | 16 ++++++++ include/uapi/linux/userfaultfd.h | 2 + mm/memory.c | 5 +++ mm/userswap.c | 70 ++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 3 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..5d5d642a4686 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,7 @@ #include <linux/hugetlb.h> #include <linux/swapops.h> #include <linux/miscdevice.h> +#include <linux/userswap.h>
static int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -373,6 +374,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * ptes here. */ ptent = ptep_get(pte); +#ifdef CONFIG_USERSWAP + uswap_must_wait(reason, ptent, &ret); +#endif if (pte_none_mostly(ptent)) ret = true; if (!pte_write(ptent) && (reason & VM_UFFD_WP)) @@ -442,10 +446,14 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm);
/* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_BUG_ON(reason & ~(__VM_UFFD_FLAGS | VM_USWAP)); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ VM_BUG_ON(!reason || (reason & (reason - 1)));
+ if (IS_ENABLED(CONFIG_USERSWAP) && (reason == VM_UFFD_MISSING) && + (vma->vm_flags & VM_USWAP)) + reason |= VM_USWAP; + if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) @@ -520,6 +528,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, reason, ctx->features); +#ifdef CONFIG_USERSWAP + if ((reason & VM_USWAP) && pte_none(vmf->orig_pte)) + uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FIRST; +#endif uwq.ctx = ctx; uwq.waken = false;
@@ -921,7 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; continue; } - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1326,6 +1338,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; struct vma_iterator vmi; pgoff_t pgoff; +#ifdef CONFIG_USERSWAP + bool uswap_mode = false; +#endif
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1337,6 +1352,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!uffdio_register.mode) goto out; +#ifdef CONFIG_USERSWAP + if (!uswap_register(&uffdio_register, &uswap_mode)) + goto out; +#endif if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; @@ -1359,6 +1378,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, uffdio_register.range.len); if (ret) goto out; +#ifdef CONFIG_USERSWAP + if (unlikely(uswap_mode)) { + ret = -EINVAL; + if (!uswap_adjust_uffd_range(&uffdio_register, &vm_flags, mm)) + goto out; + } +#endif
start = uffdio_register.range.start; end = start + uffdio_register.range.len; @@ -1663,7 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (userfaultfd_wp(vma)) uffd_wp_range(vma, start, vma_end - start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, pgoff, diff --git a/include/linux/userswap.h b/include/linux/userswap.h index f8063185056c..bd6475259a9d 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -21,5 +21,21 @@ extern struct static_key_false userswap_enabled; unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len);
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode); + +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma); + +static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return; + if ((reason & VM_USWAP) && (!pte_present(pte))) + *ret = true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a3..b2cd9fe5a4ca 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -152,6 +152,7 @@ struct uffd_msg { #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ #define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ +#define UFFD_PAGEFAULT_FLAG_FIRST (1<<10) /* USWAP first page fault */
struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -247,6 +248,7 @@ struct uffdio_register { #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) #define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) +#define UFFDIO_REGISTER_MODE_USWAP ((__u64)1<<3) __u64 mode;
/* diff --git a/mm/memory.c b/mm/memory.c index 862e14416027..6569c9e97c9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/userswap.h>
#include <trace/events/kmem.h>
@@ -3778,6 +3779,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out;
entry = pte_to_swp_entry(vmf->orig_pte); +#ifdef CONFIG_USERSWAP + if (is_userswap_entry(entry)) + return do_uswap_page(entry, vmf, vma); +#endif if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, diff --git a/mm/userswap.c b/mm/userswap.c index 56f7140d5335..4f798c8226a1 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -389,6 +389,76 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, return ret; }
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return true; + if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) + return true; + uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; + if (uffdio_register->mode != UFFDIO_REGISTER_MODE_MISSING) + return false; + *uswap_mode = true; + return true; +} + +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma which could reduce fragmentation. + */ +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma, *cur; + unsigned long end; + bool ret = false; + + VMA_ITERATOR(vmi, mm, uffdio_register->range.start); + + end = uffdio_register->range.start + uffdio_register->range.len - 1; + + mmap_read_lock(mm); + vma = find_vma(mm, uffdio_register->range.start); + if (!vma || vma->vm_start >= end) + goto out_unlock; + for_each_vma_range(vmi, cur, end) + if (!vma_uswap_compatible(cur)) + goto out_unlock; + + uffdio_register->range.start = vma->vm_start; + vma = find_vma(mm, end); + if (vma && end >= vma->vm_start) + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + + *vm_flags |= VM_USWAP; + + ret = true; +out_unlock: + mmap_read_unlock(mm); + return ret; +} + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma) +{ + const char *process_prefix = "uswap"; + + /* print error if we come across a nested fault */ + if (!strncmp(current->comm, process_prefix, strlen(process_prefix))) { + pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address, + current->comm); + return VM_FAULT_SIGBUS; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + pr_err("USWAP: addr %lx flags %lx is not a user swap page", + vmf->address, vma->vm_flags); + return VM_FAULT_SIGBUS; + } + + return handle_userfault(vmf, VM_UFFD_MISSING); +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
Add a new UFFDIO_COPY mode, UFFDIO_COPY_MODE_DIRECT_MAP, to map physical pages without copy_from_user(). We use uswap_unmap_anon_page() to unmap an anonymous page and uswap_map_anon_page() to map a page back to the source address. We introduce mfill_atomic_pte_nocopy() to achieve zero copy by unmapping src_addr from its physical page and establishing a mapping from dst_addr to that physical page.
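A swap-in sketch from userspace could then hand a populated page back without copying (illustrative only; 'uffd' comes from the registration step and the DIRECT_MAP define mirrors this patch's uapi addition):

	#include <sys/ioctl.h>
	#include <linux/userfaultfd.h>

	#ifndef UFFDIO_COPY_MODE_DIRECT_MAP
	#define UFFDIO_COPY_MODE_DIRECT_MAP	((__u64)1 << 10)
	#endif

	/* Map the page(s) at 'src' directly into the faulting range at 'dst'. */
	static int uswap_in(int uffd, void *dst, void *src, size_t len)
	{
		struct uffdio_copy copy = {
			.dst  = (unsigned long)dst,
			.src  = (unsigned long)src,
			.len  = len,
			.mode = UFFDIO_COPY_MODE_DIRECT_MAP,
		};

		return ioctl(uffd, UFFDIO_COPY, &copy);
	}

On success the pages are moved rather than copied, so the source buffer no longer maps the data afterwards.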
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 8 ++- include/linux/userfaultfd_k.h | 5 ++ include/linux/userswap.h | 13 +++++ include/uapi/linux/userfaultfd.h | 1 + mm/userfaultfd.c | 12 +++++ mm/userswap.c | 85 ++++++++++++++++++++++++++++++++ 6 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d5d642a4686..207467a46e7a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1797,10 +1797,16 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out;
ret = -EINVAL; - if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE | + UFFDIO_COPY_MODE_WP | + IS_ENABLED(CONFIG_USERSWAP) ? + UFFDIO_COPY_MODE_DIRECT_MAP : 0)) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; + if (IS_ENABLED(CONFIG_USERSWAP) && + (uffdio_copy.mode & UFFDIO_COPY_MODE_DIRECT_MAP)) + flags |= MFILL_ATOMIC_DIRECT_MAP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, &ctx->mmap_changing, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097c..9427d5fccf7b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -47,6 +47,7 @@ enum mfill_atomic_mode { MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, MFILL_ATOMIC_POISON, + MFILL_ATOMIC_DIRECT_MAP, NR_MFILL_ATOMIC_MODES, };
@@ -62,6 +63,10 @@ static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode
static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) { + if (IS_ENABLED(CONFIG_USERSWAP) && (flags & MFILL_ATOMIC_DIRECT_MAP) && + uffd_flags_mode_is(mode, MFILL_ATOMIC_COPY)) + mode = MFILL_ATOMIC_DIRECT_MAP; + flags &= ~MFILL_ATOMIC_MODE_MASK; return flags | ((__force uffd_flags_t) mode); } diff --git a/include/linux/userswap.h b/include/linux/userswap.h index bd6475259a9d..10a7111e9129 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -29,6 +29,10 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, struct vm_area_struct *vma);
+int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr); + static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) { if (!static_branch_unlikely(&userswap_enabled)) @@ -37,5 +41,14 @@ static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) *ret = true; }
+static inline bool uswap_check_copy(struct vm_area_struct *vma, + uffd_flags_t flags) +{ + if (!!uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP) ^ + !!(vma->vm_flags & VM_USWAP)) + return false; + return true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b2cd9fe5a4ca..bda754f43203 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -270,6 +270,7 @@ struct uffdio_copy { * according to the uffdio_register.ioctls. */ #define UFFDIO_COPY_MODE_WP ((__u64)1<<1) +#define UFFDIO_COPY_MODE_DIRECT_MAP ((__u64)1<<10) __u64 mode;
/* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc..32fa1a22c85a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -15,6 +15,7 @@ #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> +#include <linux/userswap.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" @@ -603,6 +604,10 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, goto out_unlock;
err = -EINVAL; +#ifdef CONFIG_USERSWAP + if (!uswap_check_copy(dst_vma, flags)) + goto out_unlock; +#endif /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. @@ -675,6 +680,13 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd));
+#ifdef CONFIG_USERSWAP + if (static_branch_unlikely(&userswap_enabled) && + uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP)) + err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr); + else +#endif err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); cond_resched(); diff --git a/mm/userswap.c b/mm/userswap.c index 4f798c8226a1..18c99c2a0fc7 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -459,6 +459,91 @@ vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, return handle_userfault(vmf, VM_UFFD_MISSING); }
+int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr) +{ + struct vm_area_struct *src_vma; + pte_t dst_pte, *pte, src_pte; + struct page *page; + spinlock_t *ptl; + pmd_t *src_pmd; + int ret; + + src_vma = find_vma(mm, src_addr); + if (!src_vma || src_addr < src_vma->vm_start) + return -EINVAL; + + if (!vma_uswap_compatible(src_vma)) + return -EINVAL; + + page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return -ENODEV; + + ret = -ENXIO; + src_pmd = mm_find_pmd(mm, src_addr); + if (!src_pmd) + goto out_put_page; + + if (!PageLRU(page)) + lru_add_drain_all(); + + ret = -EBUSY; + if (page_mapcount(page) > 1 || + page_mapcount(page) + 1 != page_count(page)) + goto out_put_page; + + ret = uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, + &src_pte, false); + if (ret) + goto out_put_page; + if (dst_vma->vm_flags & VM_USWAP) + ClearPageDirty(page); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); + if (dst_vma->vm_flags & VM_USWAP) + dst_pte = pte_mkclean(dst_pte); + + pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl); + /* + * The userspace may swap in a large area. Part of the area is not + * swapped out. If concurrent execution, PTE may be present. Skip those + * pages (pte_present). + * No other scenes should be handled except first pagefault (pte_none) + * and after userswap out (SWP_USERSWAP_ENTRY). + */ + if (pte_present(*pte) || (!pte_none(*pte) && + !is_userswap_entry(pte_to_swp_entry(*pte)))) { + pte_unmap_unlock(pte, ptl); + uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd, + src_pte); + ret = -EEXIST; + goto out_put_page; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + set_pte_at(mm, dst_addr, pte, dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, pte); + pte_unmap_unlock(pte, ptl); + ret = 0; + +out_put_page: + put_page(page); + return ret; +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
The uffd_msg.reserved3 field is used to report to userspace the CPU on which the page fault occurred.
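In the swapper's fault-handling loop the CPU number can then be read out as in the fragment below (sketch only; 'uffd' is the descriptor from registration and reserved3 only carries the CPU for userswap faults):

	struct uffd_msg msg;

	while (read(uffd, &msg, sizeof(msg)) == sizeof(msg)) {
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			continue;
		unsigned long addr = msg.arg.pagefault.address;
		unsigned int cpu = msg.reserved3;  /* CPU that took the fault */
		/* ... pick a per-CPU staging buffer, then UFFDIO_COPY ... */
	}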
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 3 +++ include/linux/userswap.h | 6 ++++++ 2 files changed, 9 insertions(+)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 207467a46e7a..4db4a6b8a4a3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -269,6 +269,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); +#ifdef CONFIG_USERSWAP + uswap_get_cpu_id(reason, &msg); +#endif return msg; }
diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 10a7111e9129..cecdef09c66f 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -50,5 +50,11 @@ static inline bool uswap_check_copy(struct vm_area_struct *vma, return true; }
+static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) +{ + if (reason & VM_USWAP) + msg->reserved3 = smp_processor_id(); +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */
From: ZhangPeng zhangpeng362@huawei.com
This patch series support userswap feature including registration, unregistration, swap-out and swap-in.
Userswap feature depends on CONFIG_USERSWAP and can be enabled by cmdline enable_userswap.
We tested the concurrent scenario of multi-threaded page fault and multi-threaded swap-in in the uswap demo;and the remapping in the swap-out phase and the copy-free function in the swap-in phase were ok. During the test, related debugging functions including CONFIG_DEBUG_VM, lockdep, slub debug, kasan and kmemleak are enabled.
ZhangPeng (6): mm/userswap: add VM_USWAP and SWP_USERSWAP_ENTRY mm/userswap: add enable_userswap boot option mm/userswap: introduce MREMAP_USWAP_SET_PTE mm/userswap: support userswap via userfaultfd mm/userswap: introduce UFFDIO_COPY_MODE_DIRECT_MAP mm/userswap: provide cpu info in userfault msg
fs/proc/task_mmu.c | 3 + fs/userfaultfd.c | 43 ++- include/linux/mm.h | 8 + include/linux/swap.h | 14 +- include/linux/swapops.h | 16 + include/linux/userfaultfd_k.h | 5 + include/linux/userswap.h | 60 ++++ include/trace/events/mmflags.h | 7 + include/uapi/linux/mman.h | 1 + include/uapi/linux/userfaultfd.h | 3 + mm/Kconfig | 10 + mm/Makefile | 1 + mm/memory.c | 8 + mm/mremap.c | 8 +- mm/userfaultfd.c | 12 + mm/userswap.c | 552 +++++++++++++++++++++++++++++++ 16 files changed, 745 insertions(+), 6 deletions(-) create mode 100644 include/linux/userswap.h create mode 100644 mm/userswap.c
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
VM_USWAP gets set in vma->vm_flags to tell the VM common code that the userswap is registered. SWP_USERSWAP_ENTRY is the swap entry when userswap memory is swapped out. In addition, is_userswap_entry() is introduced to determine whether the entry is a userswap swap entry. Add the userswap entry case in zap_pte_range() to prevent WARN_ON_ONCE(1).
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/proc/task_mmu.c | 3 +++ include/linux/mm.h | 7 +++++++ include/linux/swap.h | 14 +++++++++++++- include/linux/swapops.h | 12 ++++++++++++ include/trace/events/mmflags.h | 7 +++++++ mm/Kconfig | 10 ++++++++++ mm/memory.c | 3 +++ 7 files changed, 55 insertions(+), 1 deletion(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..fe12b057d077 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -700,6 +700,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #ifdef CONFIG_X86_USER_SHADOW_STACK [ilog2(VM_SHADOW_STACK)] = "ss", #endif +#ifdef CONFIG_USERSWAP + [ilog2(VM_USWAP)] = "us", +#endif /* CONFIG_USERSWAP */ }; size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h index 8cf86b56aba5..3592fabc8507 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -313,6 +313,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */
+#ifdef CONFIG_USERSWAP +# define VM_USWAP_BIT 62 +#define VM_USWAP BIT(VM_USWAP_BIT) +#else /* !CONFIG_USERSWAP */ +#define VM_USWAP VM_NONE +#endif /* CONFIG_USERSWAP */ + #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ diff --git a/include/linux/swap.h b/include/linux/swap.h index f6dd6575b905..fe20c462fecb 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -55,6 +55,18 @@ static inline int current_is_kswapd(void) * actions on faults. */
+/* + * Userswap entry type + */ +#ifdef CONFIG_USERSWAP +#define SWP_USERSWAP_NUM 1 +#define SWP_USERSWAP_ENTRY (MAX_SWAPFILES + SWP_HWPOISON_NUM + \ + SWP_MIGRATION_NUM + SWP_DEVICE_NUM + \ + SWP_PTE_MARKER_NUM) +#else +#define SWP_USERSWAP_NUM 0 +#endif + /* * PTE markers are used to persist information onto PTEs that otherwise * should be a none pte. As its name "PTE" hints, it should only be @@ -117,7 +129,7 @@ static inline int current_is_kswapd(void) #define MAX_SWAPFILES \ ((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \ SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - \ - SWP_PTE_MARKER_NUM) + SWP_PTE_MARKER_NUM - SWP_USERSWAP_NUM)
/* * Magic header for a swap area. The first part of the union is diff --git a/include/linux/swapops.h b/include/linux/swapops.h index bff1e8d97de0..6b4ed6bfb67c 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -455,6 +455,18 @@ static inline int pte_none_mostly(pte_t pte) return pte_none(pte) || is_pte_marker(pte); }
+#ifdef CONFIG_USERSWAP +static inline int is_userswap_entry(swp_entry_t entry) +{ + return unlikely(swp_type(entry) == SWP_USERSWAP_ENTRY); +} +#else +static inline int is_userswap_entry(swp_entry_t entry) +{ + return 0; +} +#endif + static inline struct page *pfn_swap_entry_to_page(swp_entry_t entry) { struct page *p = pfn_to_page(swp_offset_pfn(entry)); diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 1478b9dd05fa..18d30581137a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -164,6 +164,12 @@ IF_HAVE_PG_ARCH_X(arch_3) # define IF_HAVE_UFFD_MINOR(flag, name) #endif
+#ifdef CONFIG_USERSWAP +#define IF_HAVE_VM_USWAP(flag, name) {flag, name }, +#else +#define IF_HAVE_VM_USWAP(flag, name) +#endif + #define __def_vmaflag_names \ {VM_READ, "read" }, \ {VM_WRITE, "write" }, \ @@ -196,6 +202,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \ {VM_MIXEDMAP, "mixedmap" }, \ {VM_HUGEPAGE, "hugepage" }, \ {VM_NOHUGEPAGE, "nohugepage" }, \ +IF_HAVE_VM_USWAP(VM_USWAP, "userswap" ) \ {VM_MERGEABLE, "mergeable" } \
#define show_vma_flags(flags) \ diff --git a/mm/Kconfig b/mm/Kconfig index ece4f2847e2b..ca35151fe561 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1227,6 +1227,16 @@ config PTE_MARKER_UFFD_WP purposes. It is required to enable userfaultfd write protection on file-backed memory types like shmem and hugetlbfs.
+config USERSWAP + bool "Enable User Swap" + depends on MMU && USERFAULTFD + depends on X86 || ARM64 + default n + + help + Support for User Swap. This is based on userfaultfd. We can implement + our own swapout and swapin functions in usersapce. + # multi-gen LRU { config LRU_GEN bool "Multi-Gen LRU" diff --git a/mm/memory.c b/mm/memory.c index e1a0eb8b776a..862e14416027 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1516,6 +1516,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, is_poisoned_swp_entry(entry)) { if (!should_zap_cows(details)) continue; + } else if (is_userswap_entry(entry)) { + if (!should_zap_cows(details)) + continue; } else { /* We should have covered all the swap entry types */ WARN_ON_ONCE(1);
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
Add enable_userswap boot option to enable userswap feature. Add static key userswap_enabled to indicate whether the feature is enabled.
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- include/linux/swapops.h | 4 ++++ mm/Makefile | 1 + mm/userswap.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+) create mode 100644 mm/userswap.c
diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 6b4ed6bfb67c..4a7e53612fdb 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -456,8 +456,12 @@ static inline int pte_none_mostly(pte_t pte) }
#ifdef CONFIG_USERSWAP +extern struct static_key_false userswap_enabled; + static inline int is_userswap_entry(swp_entry_t entry) { + if (!static_branch_unlikely(&userswap_enabled)) + return 0; return unlikely(swp_type(entry) == SWP_USERSWAP_ENTRY); } #else diff --git a/mm/Makefile b/mm/Makefile index ec65984e2ade..02b79ca7d0e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -122,6 +122,7 @@ obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o +obj-$(CONFIG_USERSWAP) += userswap.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o obj-$(CONFIG_DEBUG_PAGEALLOC) += debug_page_alloc.o obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o diff --git a/mm/userswap.c b/mm/userswap.c new file mode 100644 index 000000000000..a2f180b4457f --- /dev/null +++ b/mm/userswap.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + * + * userswap core file + */ + +#include "internal.h" + +DEFINE_STATIC_KEY_FALSE(userswap_enabled); + +static int __init enable_userswap_setup(char *str) +{ + static_branch_enable(&userswap_enabled); + return 1; +} +__setup("enable_userswap", enable_userswap_setup);
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
We introduce MREMAP_USWAP_SET_PTE to implement remapping in the swap-out phase. Unmap the pages between 'addr ~ addr+old_len' and remap them to 'new_addr ~ new_addr+new_len'. During unmapping, the PTE of old_addr is set to SWP_USERSWAP_ENTRY.
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- include/linux/mm.h | 1 + include/linux/userswap.h | 25 +++ include/uapi/linux/mman.h | 1 + mm/mremap.c | 8 +- mm/userswap.c | 380 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 414 insertions(+), 1 deletion(-) create mode 100644 include/linux/userswap.h
diff --git a/include/linux/mm.h b/include/linux/mm.h index 3592fabc8507..cea31e552614 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2486,6 +2486,7 @@ int set_page_dirty_lock(struct page *page);
int get_cmdline(struct task_struct *task, char *buffer, int buflen);
+extern pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, diff --git a/include/linux/userswap.h b/include/linux/userswap.h new file mode 100644 index 000000000000..f8063185056c --- /dev/null +++ b/include/linux/userswap.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. + */ + +#ifndef _LINUX_USERSWAP_H +#define _LINUX_USERSWAP_H + +#include <linux/mman.h> + +#ifdef CONFIG_USERSWAP + +extern struct static_key_false userswap_enabled; + +/* + * In uswap situation, we use the bit 0 of the returned address to indicate + * whether the pages are dirty. + */ +#define USWAP_PAGES_DIRTY 1 + +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len); + +#endif /* CONFIG_USERSWAP */ +#endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index a246e11988d5..3984f2133906 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -9,6 +9,7 @@ #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 #define MREMAP_DONTUNMAP 4 +#define MREMAP_USWAP_SET_PTE 64
#define OVERCOMMIT_GUESS 0 #define OVERCOMMIT_ALWAYS 1 diff --git a/mm/mremap.c b/mm/mremap.c index 382e81c33fc4..f4bd0fcf071b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,6 +25,7 @@ #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> +#include <linux/userswap.h>
#include <asm/cacheflush.h> #include <asm/tlb.h> @@ -32,7 +33,7 @@
#include "internal.h"
-static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) +pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; @@ -930,6 +931,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, */ addr = untagged_addr(addr);
+#ifdef CONFIG_USERSWAP + if (flags == MREMAP_USWAP_SET_PTE) + return uswap_mremap(addr, old_len, new_addr, new_len); +#endif + if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return ret;
diff --git a/mm/userswap.c b/mm/userswap.c index a2f180b4457f..56f7140d5335 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -5,10 +5,390 @@ * userswap core file */
+#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> +#include <linux/mmu_notifier.h> +#include <linux/hugetlb.h> +#include <linux/userswap.h> + #include "internal.h"
DEFINE_STATIC_KEY_FALSE(userswap_enabled);
+static bool vma_uswap_compatible(struct vm_area_struct *vma) +{ + if (!vma || !vma_is_anonymous(vma) || vma->vm_file || + (vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | VM_IO | + VM_PFNMAP | VM_HUGETLB))) + return false; + return true; +} + +/* + * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get + * the reference of the pages and return the pages through input parameters + * 'ppages'. + */ +static unsigned long pages_can_be_swapped(struct mm_struct *mm, + unsigned long addr, + unsigned long len, + struct page ***ppages) +{ + struct vm_area_struct *vma; + struct page *page = NULL; + struct page **pages = NULL; + unsigned long addr_end = addr + len; + unsigned long ret; + unsigned long i, page_num = 0; + *ppages = NULL; + + pages = kvzalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + while (addr < addr_end) { + vma = find_vma(mm, addr); + if (!vma || addr < vma->vm_start || + !(vma->vm_flags & VM_USWAP) || + !vma_uswap_compatible(vma)) { + ret = -EINVAL; + goto out_err; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + ret = -EAGAIN; + goto out_err; + } +get_again: + /* + * follow_page will inc page ref, dec the ref after we remap + * the page. + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) { + ret = -ENODEV; + goto out_err; + } + + pages[page_num++] = page; + if (!PageAnon(page) || !PageSwapBacked(page) || + PageHuge(page) || PageSwapCache(page)) { + ret = -EINVAL; + goto out_err; + } + + if (PageTransCompound(page)) { + if (trylock_page(page)) { + if (!split_huge_page(page)) { + unlock_page(page); + put_page(page); + page_num--; + goto get_again; + } else { + unlock_page(page); + } + } + ret = -EINVAL; + goto out_err; + } + + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) > 1 || + page_mapcount(page) + 1 != page_count(page)) { + ret = -EBUSY; + goto out_err; + } + addr += PAGE_SIZE; + } + + *ppages = pages; + return 0; + +out_err: + for (i = 0; i < page_num; i++) + put_page(pages[i]); + kvfree(pages); + return ret; +} + +static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr) +{ + pud_t *pud; + pmd_t *pmd; + + pud = get_old_pud(mm, addr); + if (!pud) + return false; + else if (pud_huge(*pud)) + return true; + + pmd = pmd_offset(pud, addr); + if (!pmd) + return false; + else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd)) + return true; + + return false; +} + +static int uswap_unmap_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, struct page *page, + pmd_t *pmd, pte_t *old_pte, bool set_to_swp) +{ + struct mmu_notifier_range range; + spinlock_t *ptl; + pte_t *pte, _old_pte; + int ret = 0; + + mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, addr, + addr + PAGE_SIZE); + mmu_notifier_invalidate_range_start(&range); + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_present(*pte)) { + ret = -EINVAL; + goto out_release_unlock; + } + flush_cache_page(vma, addr, pte_pfn(*pte)); + _old_pte = ptep_clear_flush(vma, addr, pte); + if (old_pte) + *old_pte = _old_pte; + if (set_to_swp) + set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry( + SWP_USERSWAP_ENTRY, page_to_pfn(page)))); + + dec_mm_counter(mm, MM_ANONPAGES); + page_remove_rmap(page, vma, false); + page->mapping = NULL; + +out_release_unlock: + pte_unmap_unlock(pte, ptl); + mmu_notifier_invalidate_range_end(&range); + return ret; +} + +static unsigned long vm_insert_anon_page(struct 
vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + pte_t *pte, dst_pte; + spinlock_t *ptl; + + if (unlikely(anon_vma_prepare(vma))) + return -ENOMEM; + + flush_dcache_page(page); + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + return -ENOMEM; + if (!pte_none(*pte)) { + ret = -EBUSY; + goto out_unlock; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr); + dst_pte = mk_pte(page, vma->vm_page_prot); + if (vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); + set_pte_at(mm, addr, pte, dst_pte); + +out_unlock: + pte_unmap_unlock(pte, ptl); + return ret; +} + +static void uswap_map_anon_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + struct page *page, + pmd_t *pmd, + pte_t old_pte) +{ + spinlock_t *ptl; + pte_t *pte; + + pte = pte_offset_map_lock(mm, pmd, addr, &ptl); + flush_cache_page(vma, addr, pte_pfn(*pte)); + set_pte_at(mm, addr, pte, old_pte); + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, vma, addr); + pte_unmap_unlock(pte, ptl); +} + +static void uswapout_recover(struct mm_struct *mm, + unsigned long old_addr_start, unsigned long len, + struct page **pages, unsigned long new_addr_start, + pte_t *ptes) +{ + unsigned long unmap_old_addr = old_addr_start; + unsigned long unmap_new_addr = new_addr_start; + struct page *page; + pmd_t *old_pmd, *new_pmd; + pte_t pte; + unsigned long i; + + for (i = 0; i < len; i++) { + page = pages[i]; + pte = ptes[i]; + new_pmd = mm_find_pmd(mm, new_addr_start); + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr), + unmap_new_addr, page, new_pmd, NULL, + false); + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + unmap_old_addr += PAGE_SIZE; + unmap_new_addr += PAGE_SIZE; + } + if (pte_val(ptes[len]) != 0) { + page = pages[len]; + pte = ptes[len]; + old_pmd = mm_find_pmd(mm, unmap_old_addr); + + uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr), + unmap_old_addr, page, old_pmd, pte); + get_page(page); + } +} + +/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */ +static unsigned long do_user_swap(struct mm_struct *mm, + unsigned long old_addr_start, + unsigned long len, struct page **pages, + unsigned long new_addr_start) +{ + struct vm_area_struct *old_vma, *new_vma; + unsigned long old_addr = old_addr_start; + unsigned long new_addr = new_addr_start; + struct page *page; + pmd_t *pmd; + pte_t old_pte, *ptes; + bool pages_dirty = false; + unsigned long i = 0, j; + int ret; + + ptes = kvzalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL); + if (!ptes) + return -ENOMEM; + lru_add_drain(); + for (j = 0; j < len; j += PAGE_SIZE) { + page = pages[i]; + ret = -EINVAL; + if (!page) + goto out_recover; + if (is_thp_or_huge(mm, new_addr)) + goto out_recover; + old_vma = find_vma(mm, old_addr); + if (!old_vma || old_addr < old_vma->vm_start) + goto out_recover; + new_vma = find_vma(mm, new_addr); + if (!new_vma || new_addr < new_vma->vm_start) + goto out_recover; + if (!vma_uswap_compatible(new_vma)) + goto out_recover; + + ret = -EACCES; + if (!(old_vma->vm_flags & VM_WRITE) && + (new_vma->vm_flags & VM_WRITE)) + goto out_recover; + + ret = -ENXIO; + pmd = mm_find_pmd(mm, old_addr); + if (!pmd) + goto out_recover; + ret = uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd, + &old_pte, true); + if (ret) + goto out_recover; + ptes[i] = old_pte; + if 
(pte_dirty(old_pte) || PageDirty(page)) + pages_dirty = true; + put_page(page); + + ret = vm_insert_anon_page(new_vma, new_addr, page); + if (ret) + goto out_recover; + get_page(page); + + old_addr += PAGE_SIZE; + new_addr += PAGE_SIZE; + i++; + } + + if (pages_dirty) + new_addr_start = new_addr_start | USWAP_PAGES_DIRTY; + kvfree(ptes); + return new_addr_start; + +out_recover: + uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes); + kvfree(ptes); + return ret; +} + + +/* + * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall + * mremap. + * Unmap the pages between 'addr ~ addr + old_len' and remap them to 'new_addr + * ~ new_addr + new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY. + */ +unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, + unsigned long new_addr, unsigned long new_len) +{ + struct page **pages = NULL; + struct mm_struct *mm = current->mm; + unsigned long len = old_len; + unsigned long ret = -EINVAL; + unsigned long i; + + if (!static_branch_unlikely(&userswap_enabled)) + goto out; + + if (offset_in_page(old_addr)) + goto out; + + old_len = PAGE_ALIGN(old_len); + new_len = PAGE_ALIGN(new_len); + + if (!new_len || old_len != new_len || offset_in_page(new_addr)) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len || + old_addr > TASK_SIZE - old_len) + goto out; + + /* Ensure the old/new locations do not overlap */ + if (old_addr + old_len > new_addr && new_addr + new_len > old_addr) + goto out; + + lru_add_drain_all(); + mmap_write_lock(mm); + ret = pages_can_be_swapped(mm, old_addr, len, &pages); + if (ret) + goto out_release_unlock; + + ret = do_user_swap(mm, old_addr, len, pages, new_addr); + /* follow_page() above increased the reference */ + for (i = 0; i < len / PAGE_SIZE; i++) + if (pages[i]) + put_page(pages[i]); + + kvfree(pages); + +out_release_unlock: + mmap_write_unlock(mm); +out: + return ret; +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
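For orientation only (not part of the patch), a minimal userspace swap-out sketch against this interface could look as follows. It assumes the patched uapi header that defines MREMAP_USWAP_SET_PTE in <linux/mman.h>, a region at 'addr' already registered for userswap via userfaultfd (see the next patch), and it issues the raw mremap syscall because the exact flag combination the patched mremap() expects is not spelled out here; the helper name is hypothetical.

#include <stddef.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/mman.h>

static void *uswap_out_page(void *addr, size_t page_size)
{
	void *staging;
	long ret;

	/* Fresh anonymous destination; vm_insert_anon_page() needs empty ptes. */
	staging = mmap(NULL, page_size, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (staging == MAP_FAILED)
		return NULL;

	/*
	 * Ask uswap_mremap() to move the page to 'staging' and install a
	 * SWP_USERSWAP_ENTRY pte at the old address. The raw syscall is used
	 * so the new address is always passed through, independent of libc
	 * flag handling.
	 */
	ret = syscall(SYS_mremap, addr, page_size, page_size,
		      MREMAP_USWAP_SET_PTE, staging);
	if (ret == -1) {
		munmap(staging, page_size);
		return NULL;
	}
	/*
	 * Per do_user_swap(), the returned address may have the
	 * USWAP_PAGES_DIRTY bit OR-ed in when a moved page was dirty; mask it
	 * off before dereferencing if the constant is exported to userspace.
	 */
	return staging;
}

After this call, the next access to addr takes a userswap fault that is delivered through userfaultfd, and the page content stays at the staging address until it is mapped back during swap-in.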
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
This patch modifies userfaultfd to support userswap. VM_USWAP is set in userfaultfd_register() and cleared in userfaultfd_unregister() and userfaultfd_release(). Use do_uswap_page() to handle page faults on userswap swap entries in do_swap_page(). Add uswap_must_wait() to handle userswap-type userfaults in userfaultfd_must_wait().
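As a usage illustration only, a registration sketch could look like this. It assumes the patched <linux/userfaultfd.h> that exports UFFDIO_REGISTER_MODE_USWAP and a kernel booted with enable_userswap; the helper name is hypothetical.

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static int uswap_register_range(void *addr, unsigned long len)
{
	struct uffdio_api api = { .api = UFFD_API };
	struct uffdio_register reg;
	int uffd;

	uffd = syscall(SYS_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0)
		return -1;
	if (ioctl(uffd, UFFDIO_API, &api) < 0)
		goto err;

	memset(&reg, 0, sizeof(reg));
	reg.range.start = (unsigned long)addr;
	reg.range.len = len;
	/* uswap_register() only accepts USWAP combined with MISSING. */
	reg.mode = UFFDIO_REGISTER_MODE_MISSING | UFFDIO_REGISTER_MODE_USWAP;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0)
		goto err;
	return uffd;
err:
	close(uffd);
	return -1;
}

Note that uswap_adjust_uffd_range() may widen range.start/range.len so that whole VMAs overlapping the requested range are registered.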
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 32 +++++++++++++-- include/linux/userswap.h | 16 ++++++++ include/uapi/linux/userfaultfd.h | 2 + mm/memory.c | 5 +++ mm/userswap.c | 70 ++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 3 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..5d5d642a4686 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -31,6 +31,7 @@ #include <linux/hugetlb.h> #include <linux/swapops.h> #include <linux/miscdevice.h> +#include <linux/userswap.h>
static int sysctl_unprivileged_userfaultfd __read_mostly;
@@ -373,6 +374,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * ptes here. */ ptent = ptep_get(pte); +#ifdef CONFIG_USERSWAP + uswap_must_wait(reason, ptent, &ret); +#endif if (pte_none_mostly(ptent)) ret = true; if (!pte_write(ptent) && (reason & VM_UFFD_WP)) @@ -442,10 +446,14 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) BUG_ON(ctx->mm != mm);
/* Any unrecognized flag is a bug. */ - VM_BUG_ON(reason & ~__VM_UFFD_FLAGS); + VM_BUG_ON(reason & ~(__VM_UFFD_FLAGS | VM_USWAP)); /* 0 or > 1 flags set is a bug; we expect exactly 1. */ VM_BUG_ON(!reason || (reason & (reason - 1)));
+ if (IS_ENABLED(CONFIG_USERSWAP) && (reason == VM_UFFD_MISSING) && + (vma->vm_flags & VM_USWAP)) + reason |= VM_USWAP; + if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) @@ -520,6 +528,10 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) uwq.wq.private = current; uwq.msg = userfault_msg(vmf->address, vmf->real_address, vmf->flags, reason, ctx->features); +#ifdef CONFIG_USERSWAP + if ((reason & VM_USWAP) && pte_none(vmf->orig_pte)) + uwq.msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_FIRST; +#endif uwq.ctx = ctx; uwq.waken = false;
@@ -921,7 +933,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) prev = vma; continue; } - new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, @@ -1326,6 +1338,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; struct vma_iterator vmi; pgoff_t pgoff; +#ifdef CONFIG_USERSWAP + bool uswap_mode = false; +#endif
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1337,6 +1352,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = -EINVAL; if (!uffdio_register.mode) goto out; +#ifdef CONFIG_USERSWAP + if (!uswap_register(&uffdio_register, &uswap_mode)) + goto out; +#endif if (uffdio_register.mode & ~UFFD_API_REGISTER_MODES) goto out; vm_flags = 0; @@ -1359,6 +1378,13 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, uffdio_register.range.len); if (ret) goto out; +#ifdef CONFIG_USERSWAP + if (unlikely(uswap_mode)) { + ret = -EINVAL; + if (!uswap_adjust_uffd_range(&uffdio_register, &vm_flags, mm)) + goto out; + } +#endif
start = uffdio_register.range.start; end = start + uffdio_register.range.len; @@ -1663,7 +1689,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, if (userfaultfd_wp(vma)) uffd_wp_range(vma, start, vma_end - start, false);
- new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; + new_flags = vma->vm_flags & ~(__VM_UFFD_FLAGS | VM_USWAP); pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, pgoff, diff --git a/include/linux/userswap.h b/include/linux/userswap.h index f8063185056c..bd6475259a9d 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -21,5 +21,21 @@ extern struct static_key_false userswap_enabled; unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len);
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode); + +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm); + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma); + +static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return; + if ((reason & VM_USWAP) && (!pte_present(pte))) + *ret = true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a3..b2cd9fe5a4ca 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -152,6 +152,7 @@ struct uffd_msg { #define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */ #define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */ #define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */ +#define UFFD_PAGEFAULT_FLAG_FIRST (1<<10) /* USWAP first page fault */
struct uffdio_api { /* userland asks for an API number and the features to enable */ @@ -247,6 +248,7 @@ struct uffdio_register { #define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0) #define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1) #define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2) +#define UFFDIO_REGISTER_MODE_USWAP ((__u64)1<<3) __u64 mode;
/* diff --git a/mm/memory.c b/mm/memory.c index 862e14416027..6569c9e97c9d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -77,6 +77,7 @@ #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <linux/sched/sysctl.h> +#include <linux/userswap.h>
#include <trace/events/kmem.h>
@@ -3778,6 +3779,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out;
entry = pte_to_swp_entry(vmf->orig_pte); +#ifdef CONFIG_USERSWAP + if (is_userswap_entry(entry)) + return do_uswap_page(entry, vmf, vma); +#endif if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, diff --git a/mm/userswap.c b/mm/userswap.c index 56f7140d5335..4f798c8226a1 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -389,6 +389,76 @@ unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len, return ret; }
+bool uswap_register(struct uffdio_register *uffdio_register, bool *uswap_mode) +{ + if (!static_branch_unlikely(&userswap_enabled)) + return true; + if (!(uffdio_register->mode & UFFDIO_REGISTER_MODE_USWAP)) + return true; + uffdio_register->mode &= ~UFFDIO_REGISTER_MODE_USWAP; + if (uffdio_register->mode != UFFDIO_REGISTER_MODE_MISSING) + return false; + *uswap_mode = true; + return true; +} + +/* + * register the whole vma overlapping with the address range to avoid splitting + * the vma which could reduce fragmentation. + */ +bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, + unsigned long *vm_flags, struct mm_struct *mm) +{ + struct vm_area_struct *vma, *cur; + unsigned long end; + bool ret = false; + + VMA_ITERATOR(vmi, mm, uffdio_register->range.start); + + end = uffdio_register->range.start + uffdio_register->range.len - 1; + + mmap_read_lock(mm); + vma = find_vma(mm, uffdio_register->range.start); + if (!vma || vma->vm_start >= end) + goto out_unlock; + for_each_vma_range(vmi, cur, end) + if (!vma_uswap_compatible(cur)) + goto out_unlock; + + uffdio_register->range.start = vma->vm_start; + vma = find_vma(mm, end); + if (vma && end >= vma->vm_start) + uffdio_register->range.len = vma->vm_end - uffdio_register->range.start; + + *vm_flags |= VM_USWAP; + + ret = true; +out_unlock: + mmap_read_unlock(mm); + return ret; +} + +vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, + struct vm_area_struct *vma) +{ + const char *process_prefix = "uswap"; + + /* print error if we come across a nested fault */ + if (!strncmp(current->comm, process_prefix, strlen(process_prefix))) { + pr_err("USWAP: fault %lx is triggered by %s\n", vmf->address, + current->comm); + return VM_FAULT_SIGBUS; + } + + if (!(vma->vm_flags & VM_UFFD_MISSING)) { + pr_err("USWAP: addr %lx flags %lx is not a user swap page", + vmf->address, vma->vm_flags); + return VM_FAULT_SIGBUS; + } + + return handle_userfault(vmf, VM_UFFD_MISSING); +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
Add a new UFFDIO_COPY mode, UFFDIO_COPY_MODE_DIRECT_MAP, to map physical pages without copy_from_user(). We use uswap_unmap_anon_page() to unmap an anonymous page and uswap_map_anon_page() to map a page at the src addr. We introduce mfill_atomic_pte_nocopy() to achieve zero copy by unmapping the physical page from src_addr and establishing the mapping from dst_addr to that page.
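For illustration only, the userspace swap-in side could then look like the sketch below, assuming the patched <linux/userfaultfd.h> that defines UFFDIO_COPY_MODE_DIRECT_MAP; the helper name is hypothetical and error handling is minimal.

#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

static int uswap_direct_map(int uffd, unsigned long dst, unsigned long src,
			    unsigned long len)
{
	struct uffdio_copy copy = {
		.dst = dst,
		.src = src,
		.len = len,
		/* Remap the page at src to dst instead of copying it. */
		.mode = UFFDIO_COPY_MODE_DIRECT_MAP,
	};

	if (ioctl(uffd, UFFDIO_COPY, &copy) < 0)
		return -1;
	/* copy.copy reports how many bytes were actually mapped. */
	return copy.copy == (long long)len ? 0 : -1;
}

Since mfill_atomic_pte_nocopy() returns -EEXIST when the destination pte is already present or holds something other than a userswap entry, callers racing on the same range can treat EEXIST as "already swapped in".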
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 8 ++- include/linux/userfaultfd_k.h | 5 ++ include/linux/userswap.h | 13 +++++ include/uapi/linux/userfaultfd.h | 1 + mm/userfaultfd.c | 12 +++++ mm/userswap.c | 85 ++++++++++++++++++++++++++++++++ 6 files changed, 123 insertions(+), 1 deletion(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5d5d642a4686..207467a46e7a 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1797,10 +1797,16 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out;
ret = -EINVAL; - if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP)) + if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE | + UFFDIO_COPY_MODE_WP | + (IS_ENABLED(CONFIG_USERSWAP) ? + UFFDIO_COPY_MODE_DIRECT_MAP : 0))) goto out; if (uffdio_copy.mode & UFFDIO_COPY_MODE_WP) flags |= MFILL_ATOMIC_WP; + if (IS_ENABLED(CONFIG_USERSWAP) && + (uffdio_copy.mode & UFFDIO_COPY_MODE_DIRECT_MAP)) + flags |= MFILL_ATOMIC_DIRECT_MAP; if (mmget_not_zero(ctx->mm)) { ret = mfill_atomic_copy(ctx->mm, uffdio_copy.dst, uffdio_copy.src, uffdio_copy.len, &ctx->mmap_changing, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097c..9427d5fccf7b 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -47,6 +47,7 @@ enum mfill_atomic_mode { MFILL_ATOMIC_ZEROPAGE, MFILL_ATOMIC_CONTINUE, MFILL_ATOMIC_POISON, + MFILL_ATOMIC_DIRECT_MAP, NR_MFILL_ATOMIC_MODES, };
@@ -62,6 +63,10 @@ static inline bool uffd_flags_mode_is(uffd_flags_t flags, enum mfill_atomic_mode
static inline uffd_flags_t uffd_flags_set_mode(uffd_flags_t flags, enum mfill_atomic_mode mode) { + if (IS_ENABLED(CONFIG_USERSWAP) && (flags & MFILL_ATOMIC_DIRECT_MAP) && + uffd_flags_mode_is(mode, MFILL_ATOMIC_COPY)) + mode = MFILL_ATOMIC_DIRECT_MAP; + flags &= ~MFILL_ATOMIC_MODE_MASK; return flags | ((__force uffd_flags_t) mode); } diff --git a/include/linux/userswap.h b/include/linux/userswap.h index bd6475259a9d..10a7111e9129 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -29,6 +29,10 @@ bool uswap_adjust_uffd_range(struct uffdio_register *uffdio_register, vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, struct vm_area_struct *vma);
+int mfill_atomic_pte_nocopy(struct mm_struct *dst_mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr); + static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) { if (!static_branch_unlikely(&userswap_enabled)) @@ -37,5 +41,14 @@ static inline void uswap_must_wait(unsigned long reason, pte_t pte, bool *ret) *ret = true; }
+static inline bool uswap_check_copy(struct vm_area_struct *vma, + uffd_flags_t flags) +{ + if (!!uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP) ^ + !!(vma->vm_flags & VM_USWAP)) + return false; + return true; +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */ diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index b2cd9fe5a4ca..bda754f43203 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -270,6 +270,7 @@ struct uffdio_copy { * according to the uffdio_register.ioctls. */ #define UFFDIO_COPY_MODE_WP ((__u64)1<<1) +#define UFFDIO_COPY_MODE_DIRECT_MAP ((__u64)1<<10) __u64 mode;
/* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 96d9eae5c7cc..32fa1a22c85a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -15,6 +15,7 @@ #include <linux/mmu_notifier.h> #include <linux/hugetlb.h> #include <linux/shmem_fs.h> +#include <linux/userswap.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include "internal.h" @@ -603,6 +604,10 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, goto out_unlock;
err = -EINVAL; +#ifdef CONFIG_USERSWAP + if (!uswap_check_copy(dst_vma, flags)) + goto out_unlock; +#endif /* * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but * it will overwrite vm_ops, so vma_is_anonymous must return false. @@ -675,6 +680,13 @@ static __always_inline ssize_t mfill_atomic(struct mm_struct *dst_mm, BUG_ON(pmd_none(*dst_pmd)); BUG_ON(pmd_trans_huge(*dst_pmd));
+#ifdef CONFIG_USERSWAP + if (static_branch_unlikely(&userswap_enabled) && + uffd_flags_mode_is(flags, MFILL_ATOMIC_DIRECT_MAP)) + err = mfill_atomic_pte_nocopy(dst_mm, dst_pmd, dst_vma, + dst_addr, src_addr); + else +#endif err = mfill_atomic_pte(dst_pmd, dst_vma, dst_addr, src_addr, flags, &folio); cond_resched(); diff --git a/mm/userswap.c b/mm/userswap.c index 4f798c8226a1..18c99c2a0fc7 100644 --- a/mm/userswap.c +++ b/mm/userswap.c @@ -459,6 +459,91 @@ vm_fault_t do_uswap_page(swp_entry_t entry, struct vm_fault *vmf, return handle_userfault(vmf, VM_UFFD_MISSING); }
+int mfill_atomic_pte_nocopy(struct mm_struct *mm, pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, unsigned long src_addr) +{ + struct vm_area_struct *src_vma; + pte_t dst_pte, *pte, src_pte; + struct page *page; + spinlock_t *ptl; + pmd_t *src_pmd; + int ret; + + src_vma = find_vma(mm, src_addr); + if (!src_vma || src_addr < src_vma->vm_start) + return -EINVAL; + + if (!vma_uswap_compatible(src_vma)) + return -EINVAL; + + page = follow_page(src_vma, src_addr, FOLL_GET | FOLL_DUMP); + if (IS_ERR_OR_NULL(page)) + return -ENODEV; + + ret = -ENXIO; + src_pmd = mm_find_pmd(mm, src_addr); + if (!src_pmd) + goto out_put_page; + + if (!PageLRU(page)) + lru_add_drain_all(); + + ret = -EBUSY; + if (page_mapcount(page) > 1 || + page_mapcount(page) + 1 != page_count(page)) + goto out_put_page; + + ret = uswap_unmap_anon_page(mm, src_vma, src_addr, page, src_pmd, + &src_pte, false); + if (ret) + goto out_put_page; + if (dst_vma->vm_flags & VM_USWAP) + ClearPageDirty(page); + /* + * The memory barrier inside __SetPageUptodate makes sure that + * preceding stores to the page contents become visible before + * the set_pte_at() write. + */ + __SetPageUptodate(page); + + dst_pte = mk_pte(page, dst_vma->vm_page_prot); + if (dst_vma->vm_flags & VM_WRITE) + dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte)); + if (dst_vma->vm_flags & VM_USWAP) + dst_pte = pte_mkclean(dst_pte); + + pte = pte_offset_map_lock(mm, dst_pmd, dst_addr, &ptl); + /* + * The userspace may swap in a large area. Part of the area is not + * swapped out. If concurrent execution, PTE may be present. Skip those + * pages (pte_present). + * No other scenes should be handled except first pagefault (pte_none) + * and after userswap out (SWP_USERSWAP_ENTRY). + */ + if (pte_present(*pte) || (!pte_none(*pte) && + !is_userswap_entry(pte_to_swp_entry(*pte)))) { + pte_unmap_unlock(pte, ptl); + uswap_map_anon_page(mm, src_vma, src_addr, page, src_pmd, + src_pte); + ret = -EEXIST; + goto out_put_page; + } + + inc_mm_counter(mm, MM_ANONPAGES); + page_add_new_anon_rmap(page, dst_vma, dst_addr); + set_pte_at(mm, dst_addr, pte, dst_pte); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(dst_vma, dst_addr, pte); + pte_unmap_unlock(pte, ptl); + ret = 0; + +out_put_page: + put_page(page); + return ret; +} + static int __init enable_userswap_setup(char *str) { static_branch_enable(&userswap_enabled);
From: ZhangPeng zhangpeng362@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX CVE: NA
--------------------------------
Use the uffd_msg.reserved3 field to report to userspace the CPU on which the page fault occurred.
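For illustration, a fault-handling loop could pick the CPU up as sketched below; this relies only on the existing uapi struct uffd_msg, where reserved3 is a __u32, and the helper name is hypothetical.

#include <poll.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

static void uswap_handle_one_fault(int uffd)
{
	struct pollfd pfd = { .fd = uffd, .events = POLLIN };
	struct uffd_msg msg;

	if (poll(&pfd, 1, -1) <= 0)
		return;
	if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
		return;
	if (msg.event != UFFD_EVENT_PAGEFAULT)
		return;

	/* reserved3 carries the CPU that took the fault. */
	printf("uswap fault at 0x%llx on cpu %u, flags 0x%llx\n",
	       (unsigned long long)msg.arg.pagefault.address,
	       (unsigned int)msg.reserved3,
	       (unsigned long long)msg.arg.pagefault.flags);
}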
Signed-off-by: ZhangPeng zhangpeng362@huawei.com --- fs/userfaultfd.c | 3 +++ include/linux/userswap.h | 6 ++++++ 2 files changed, 9 insertions(+)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 207467a46e7a..4db4a6b8a4a3 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -269,6 +269,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address, msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR; if (features & UFFD_FEATURE_THREAD_ID) msg.arg.pagefault.feat.ptid = task_pid_vnr(current); +#ifdef CONFIG_USERSWAP + uswap_get_cpu_id(reason, &msg); +#endif return msg; }
diff --git a/include/linux/userswap.h b/include/linux/userswap.h index 10a7111e9129..cecdef09c66f 100644 --- a/include/linux/userswap.h +++ b/include/linux/userswap.h @@ -50,5 +50,11 @@ static inline bool uswap_check_copy(struct vm_area_struct *vma, return true; }
+static inline void uswap_get_cpu_id(unsigned long reason, struct uffd_msg *msg) +{ + if (reason & VM_USWAP) + msg->reserved3 = smp_processor_id(); +} + #endif /* CONFIG_USERSWAP */ #endif /* _LINUX_USERSWAP_H */
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3318 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/K...