From: Guo Fan <guofan5@huawei.com>
hulk inclusion
category: feature
bugzilla: 47439
CVE: NA
-------------------------------------------------
To make sure no other userspace threads can access the memory region we are swapping out, we need to unmap the memory region, map it to a new address, and use the new address to perform the swapout. We add a new mmap() flag 'MAP_REPLACE' to unmap the pages at the input address 'VA' and remap them to a new 'tmpVA'.
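For illustration only (not part of this patch), the intended userspace flow
might look like the sketch below. The helper name replace_for_swapout() is
hypothetical; it assumes the MAP_REPLACE flag value added by this patch and
that bit 0 of the returned address carries the dirty flag (USWAP_PAGES_DIRTY
in do_user_swap() below):

  #include <stdint.h>
  #include <sys/mman.h>

  #ifndef MAP_REPLACE
  #define MAP_REPLACE		0x1000000	/* value added by this patch */
  #endif
  #define USWAP_PAGES_DIRTY	1UL		/* bit 0 of the returned address */

  /*
   * Hypothetical helper: remap the pages at 'va' to a kernel-chosen tmpVA
   * so that no other thread can touch them, then swap out from tmpVA.
   */
  static void *replace_for_swapout(void *va, size_t len, int *dirty)
  {
  	uintptr_t ret;

  	/* 'va' and 'len' must be page aligned; the VMA must be
  	 * registered with userfaultfd beforehand. */
  	ret = (uintptr_t)mmap(va, len, PROT_READ | PROT_WRITE,
  			      MAP_PRIVATE | MAP_ANONYMOUS | MAP_REPLACE,
  			      -1, 0);
  	if (ret == (uintptr_t)MAP_FAILED)
  		return MAP_FAILED;

  	*dirty = !!(ret & USWAP_PAGES_DIRTY);	/* decode the dirty flag */
  	return (void *)(ret & ~USWAP_PAGES_DIRTY);	/* the real tmpVA */
  }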
Signed-off-by: Guo Fan <guofan5@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Jing Xiangfeng <jingxiangfeng@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 fs/proc/task_mmu.c              |   3 +
 include/linux/mm.h              |   5 +
 include/linux/swap.h            |  12 +-
 include/trace/events/mmflags.h  |   7 ++
 include/uapi/asm-generic/mman.h |   4 +
 mm/Kconfig                      |   9 ++
 mm/mmap.c                       | 207 ++++++++++++++++++++++++++++++++
 7 files changed, 246 insertions(+), 1 deletion(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ac7f57badcfd..66939a7998ab 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -665,6 +665,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_PKEY_BIT4)]	= "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_USERSWAP
+		[ilog2(VM_USWAP)]	= "us",
+#endif
 	};
 	size_t i;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 073295cc94f3..61734ef3c184 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -236,6 +236,11 @@ extern unsigned int kobjsize(const void *objp);

 #define VM_CHECKNODE	0x200000000

+#ifdef CONFIG_USERSWAP
+/* bits [32:36] hold Intel's protection keys, so use a high bit for VM_USWAP */
+#define VM_USWAP	0x2000000000000000
+#endif
+
 #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
 #define VM_HIGH_ARCH_BIT_0	32	/* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_1	33	/* bit only usable on 64-bit architectures */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index c6f9dba6d713..b7cfad35987a 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -52,6 +52,16 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */
+/*
+ * Userswap entry type
+ */
+#ifdef CONFIG_USERSWAP
+#define SWP_USERSWAP_NUM 1
+#define SWP_USERSWAP_ENTRY	(MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+SWP_DEVICE_NUM)
+#else
+#define SWP_USERSWAP_NUM 0
+#endif
+
 /*
  * Unaddressable device memory support. See include/linux/hmm.h and
  * Documentation/vm/hmm.rst. Short description is we need struct pages for
@@ -92,7 +102,7 @@ static inline int current_is_kswapd(void)
 #define MAX_SWAPFILES \
 	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
-	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM - SWP_USERSWAP_NUM)
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 2994f1c86a46..b817bf1885a0 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -130,6 +130,12 @@ IF_HAVE_PG_IDLE(PG_idle, "idle"), \
 #define IF_HAVE_VM_SOFTDIRTY(flag,name)
 #endif
+#ifdef CONFIG_USERSWAP
+#define IF_HAVE_VM_USWAP(flag,name) {flag, name },
+#else
+#define IF_HAVE_VM_USWAP(flag,name)
+#endif
+
 #define __def_vmaflag_names						\
 	{VM_READ,			"read"		},		\
 	{VM_WRITE,			"write"		},		\
@@ -161,6 +167,7 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
 	{VM_MIXEDMAP,			"mixedmap"	},		\
 	{VM_HUGEPAGE,			"hugepage"	},		\
 	{VM_NOHUGEPAGE,			"nohugepage"	},		\
+IF_HAVE_VM_USWAP(VM_USWAP,	"userswap"	)			\
 	{VM_MERGEABLE,			"mergeable"	}		\
 #define show_vma_flags(flags)						\
diff --git a/include/uapi/asm-generic/mman.h b/include/uapi/asm-generic/mman.h
index 233a5e82407c..defdf92911c3 100644
--- a/include/uapi/asm-generic/mman.h
+++ b/include/uapi/asm-generic/mman.h
@@ -17,6 +17,10 @@
 #define MAP_SYNC	0x80000		/* perform synchronous page faults for the mapping */
 #define MAP_PA32BIT	0x400000	/* physical address is within 4G */
+#ifdef CONFIG_USERSWAP
+#define MAP_REPLACE	0x1000000
+#endif
+
 /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */
 #define MCL_CURRENT	1	/* lock all current mappings */
diff --git a/mm/Kconfig b/mm/Kconfig
index 7ebd52dc1e40..4e075a27d737 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -503,6 +503,15 @@ config SHRINK_PAGECACHE
 	  if unsure, say N to disable the SHRINK_PAGECACHE.

+config USERSWAP
+	bool "Enable User Swap"
+	depends on MMU && USERFAULTFD
+	depends on X86 || ARM64
+	default n
+	help
+	  Support for User Swap. This is based on userfaultfd. We can implement
+	  our own swapout and swapin functions in userspace.
+
 config CMA
 	bool "Contiguous Memory Allocator"
 	depends on HAVE_MEMBLOCK && MMU
diff --git a/mm/mmap.c b/mm/mmap.c
index 3fcfed26d298..1b0eda02dc7f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -46,6 +46,7 @@
 #include <linux/pkeys.h>
 #include <linux/oom.h>
 #include <linux/sched/mm.h>
+#include <linux/swapops.h>
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1372,6 +1373,169 @@ int unregister_mmap_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL_GPL(unregister_mmap_notifier);
 #endif
+#ifdef CONFIG_USERSWAP
+/*
+ * Check whether the pages in the range [addr, addr + len) can be user
+ * swapped. If so, take a reference on each page and return the pages
+ * through the output parameter 'ppages'.
+ */
+int pages_can_be_swapped(struct mm_struct *mm, unsigned long addr,
+			 unsigned long len, struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_end = addr + len;
+	int i, page_num = 0;
+	int ret;
+
+	pages = kmalloc_array(len / PAGE_SIZE, sizeof(struct page *),
+			      GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
+		    (vma->vm_flags & (VM_LOCKED | VM_STACK | VM_IO | VM_PFNMAP))) {
+			ret = -EINVAL;
+			goto out;
+		}
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out;
+		}
+get_again:
+		/* follow_page() takes a page reference; drop it after remapping */
+		page = follow_page(vma, addr, FOLL_GET);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out;
+		}
+		pages[page_num++] = page;
+		if (!PageAnon(page) || !PageSwapBacked(page) ||
+		    PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out;
+		} else if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					/* retry with the now-split base page */
+					unlock_page(page);
+					put_page(page);
+					page_num--;
+					goto get_again;
+				} else {
+					unlock_page(page);
+					ret = -EINVAL;
+					goto out;
+				}
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+		if (page_mapcount(page) > 1 ||
+		    page_mapcount(page) + 1 != page_count(page)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out:
+	/* drop the references taken above; kfree(NULL) is a no-op */
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	*ppages = NULL;
+	return ret;
+}
+
+/*
+ * In the userswap case, bit 0 of the returned address indicates whether
+ * the pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY	1
+
+/* unmap the pages in [addr, addr + len) and remap them to a new address */
+unsigned long do_user_swap(struct mm_struct *mm, unsigned long addr_start,
+			   unsigned long len, struct page **pages,
+			   unsigned long new_addr)
+{
+	struct vm_area_struct *vma;
+	struct page *page;
+	pmd_t *pmd;
+	pte_t *pte, old_pte;
+	spinlock_t *ptl;
+	unsigned long addr, addr_end;
+	bool pages_dirty = false;
+	int i, err;
+
+	addr_end = addr_start + len;
+	lru_add_drain();
+	mmu_notifier_invalidate_range_start(mm, addr_start, addr_end);
+	addr = addr_start;
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		vma = find_vma(mm, addr);
+		if (!vma) {
+			mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+			WARN(1, "find_vma failed, addr: 0x%lx\n", addr);
+			return -EINVAL;
+		}
+		pmd = mm_find_pmd(mm, addr);
+		if (!pmd) {
+			mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+			WARN(1, "mm_find_pmd failed, addr: 0x%lx\n", addr);
+			return -ENXIO;
+		}
+		pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+		flush_cache_page(vma, addr, pte_pfn(*pte));
+		old_pte = ptep_clear_flush(vma, addr, pte);
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		set_pte_at(mm, addr, pte,
+			   swp_entry_to_pte(swp_entry(SWP_USERSWAP_ENTRY,
+						      page_to_pfn(page))));
+		dec_mm_counter(mm, MM_ANONPAGES);
+		page_remove_rmap(page, false);
+		put_page(page);
+
+		pte_unmap_unlock(pte, ptl);
+		vma->vm_flags |= VM_USWAP;
+		page->mapping = NULL;
+		addr += PAGE_SIZE;
+		i++;
+	}
+	mmu_notifier_invalidate_range_end(mm, addr_start, addr_end);
+
+	addr_start = new_addr;
+	addr_end = new_addr + len;
+	addr = addr_start;
+	vma = find_vma(mm, addr);
+	i = 0;
+	while (addr < addr_end) {
+		page = pages[i];
+		if (addr >= vma->vm_end)
+			vma = find_vma(mm, addr);
+		err = vm_insert_page(vma, addr, page);
+		if (err)
+			pr_err("vm_insert_page failed: %d\n", err);
+		i++;
+		addr += PAGE_SIZE;
+	}
+	vma->vm_flags |= VM_USWAP;
+
+	if (pages_dirty)
+		new_addr = new_addr | USWAP_PAGES_DIRTY;
+
+	return new_addr;
+}
+#endif
+
 /*
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
@@ -1383,6 +1547,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 {
 	struct mm_struct *mm = current->mm;
 	int pkey = 0;
+#ifdef CONFIG_USERSWAP
+	struct page **pages = NULL;
+	unsigned long addr_start = addr;
+	int i, page_num = 0;
+	unsigned long ret;
+#endif
 	*populate = 0;
@@ -1399,6 +1569,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 		if (!(file && path_noexec(&file->f_path)))
 			prot |= PROT_EXEC;
+#ifdef CONFIG_USERSWAP
+	if (flags & MAP_REPLACE) {
+		if (offset_in_page(addr) || (len % PAGE_SIZE))
+			return -EINVAL;
+		page_num = len / PAGE_SIZE;
+		ret = pages_can_be_swapped(mm, addr, len, &pages);
+		if (ret)
+			return ret;
+	}
+#endif
+
 	/* force arch specific MAP_FIXED handling in get_unmapped_area */
 	if (flags & MAP_FIXED_NOREPLACE)
 		flags |= MAP_FIXED;
@@ -1571,12 +1752,38 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	if (flags & MAP_CHECKNODE)
 		set_vm_checknode(&vm_flags, flags);
+#ifdef CONFIG_USERSWAP
+	/* mark the vma as special to avoid merging with other vmas */
+	if (flags & MAP_REPLACE)
+		vm_flags |= VM_SPECIAL;
+#endif
+
 	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
 	if (!IS_ERR_VALUE(addr) &&
 	    ((vm_flags & VM_LOCKED) ||
 	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
 		*populate = len;
+#ifndef CONFIG_USERSWAP
 	return addr;
+#else
+	if (!(flags & MAP_REPLACE))
+		return addr;
+
+	if (IS_ERR_VALUE(addr)) {
+		pr_info("mmap_region failed, return addr: 0x%lx\n", addr);
+		ret = addr;
+		goto out;
+	}
+
+	ret = do_user_swap(mm, addr_start, len, pages, addr);
+out:
+	/* drop the references taken by follow_page() above */
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kfree(pages);
+	return ret;
+#endif
 }
 unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
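
Note: for illustration only (not part of the patch), the swapin side can be
served with the existing userfaultfd UFFDIO_COPY ioctl once the monitor
thread receives a missing-page fault for the swapped-out VA. A minimal
sketch, assuming the region was registered with UFFDIO_REGISTER_MODE_MISSING
beforehand; the helper name uswap_swapin() is hypothetical:

  #include <linux/userfaultfd.h>
  #include <stddef.h>
  #include <sys/ioctl.h>

  /* Resolve a missing-page fault by copying the saved page content back. */
  static int uswap_swapin(int uffd, void *fault_va, void *saved_page,
  			size_t page_size)
  {
  	struct uffdio_copy copy = {
  		.dst = (unsigned long)fault_va,		/* page-aligned faulting address */
  		.src = (unsigned long)saved_page,	/* buffer with the saved page data */
  		.len = page_size,
  		.mode = 0,
  	};

  	return ioctl(uffd, UFFDIO_COPY, &copy);	/* 0 on success */
  }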