From: ZhangPeng <zhangpeng362@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8KESX
CVE: NA
--------------------------------
We introduce MREMAP_USWAP_SET_PTE to implement the remapping step of the
swap-out phase. Unmap the pages between 'addr ~ addr+old_len' and remap
them to 'new_addr ~ new_addr+new_len'. During unmapping, the PTEs
covering the old range are set to SWP_USERSWAP_ENTRY.
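A minimal userspace sketch of the intended call pattern (illustrative
only: it assumes the kernel has userswap enabled and that 'old' lies in
a VM_USWAP region already registered through userfaultfd; the helper
name and the error/dirty handling are made up for this example):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef MREMAP_USWAP_SET_PTE
#define MREMAP_USWAP_SET_PTE 64	/* uapi flag added by this patch */
#endif
#define USWAP_PAGES_DIRTY 1	/* bit 0 of the returned address */

/* Swap out 'len' bytes at 'old': the kernel moves the pages to 'new_buf'
 * and leaves SWP_USERSWAP_ENTRY ptes in the old range; the range must
 * already have userfaultfd MISSING handling registered (see
 * pages_can_be_swapped() below). */
static int uswap_swap_out(void *old, void *new_buf, size_t len)
{
	unsigned long ret = syscall(SYS_mremap, old, len, len,
				    MREMAP_USWAP_SET_PTE, new_buf);

	if (ret >= (unsigned long)-4095L)	/* raw syscall error range */
		return -1;
	if (ret & USWAP_PAGES_DIRTY)
		printf("pages were dirty; flush them to the swap backend\n");
	return 0;
}

The raw syscall is used here so the return value, including the dirty
bit, is seen unmodified; in real code the flag would come from the
updated uapi headers rather than a local define.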
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
---
 include/linux/mm.h        |   1 +
 include/linux/userswap.h  |  25 +++
 include/uapi/linux/mman.h |   1 +
 mm/mremap.c               |   8 +-
 mm/userswap.c             | 380 ++++++++++++++++++++++++++++++++++++++
 5 files changed, 414 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/userswap.h
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abafc9efc30f..80bacc4da324 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2492,6 +2492,7 @@ int set_page_dirty_lock(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
 
+extern pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr);
 extern unsigned long move_page_tables(struct vm_area_struct *vma,
 		unsigned long old_addr, struct vm_area_struct *new_vma,
 		unsigned long new_addr, unsigned long len,
diff --git a/include/linux/userswap.h b/include/linux/userswap.h
new file mode 100644
index 000000000000..f8063185056c
--- /dev/null
+++ b/include/linux/userswap.h
@@ -0,0 +1,25 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ */
+
+#ifndef _LINUX_USERSWAP_H
+#define _LINUX_USERSWAP_H
+
+#include <linux/mman.h>
+
+#ifdef CONFIG_USERSWAP
+
+extern struct static_key_false userswap_enabled;
+
+/*
+ * In uswap situation, we use the bit 0 of the returned address to indicate
+ * whether the pages are dirty.
+ */
+#define USWAP_PAGES_DIRTY 1
+
+unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
+			   unsigned long new_addr, unsigned long new_len);
+
+#endif /* CONFIG_USERSWAP */
+#endif /* _LINUX_USERSWAP_H */
diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h
index a246e11988d5..3984f2133906 100644
--- a/include/uapi/linux/mman.h
+++ b/include/uapi/linux/mman.h
@@ -9,6 +9,7 @@
 #define MREMAP_MAYMOVE 1
 #define MREMAP_FIXED 2
 #define MREMAP_DONTUNMAP 4
+#define MREMAP_USWAP_SET_PTE 64
 
 #define OVERCOMMIT_GUESS 0
 #define OVERCOMMIT_ALWAYS 1
diff --git a/mm/mremap.c b/mm/mremap.c
index b6979f9d687c..5d701d3c4f6b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -26,6 +26,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/mempolicy.h>
 #include <linux/share_pool.h>
+#include <linux/userswap.h>
 
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
@@ -33,7 +34,7 @@
 
 #include "internal.h"
 
-static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
+pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -931,6 +932,11 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	 */
 	addr = untagged_addr(addr);
 
+#ifdef CONFIG_USERSWAP
+	if (flags == MREMAP_USWAP_SET_PTE)
+		return uswap_mremap(addr, old_len, new_addr, new_len);
+#endif
+
 	if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP))
 		return ret;
 
diff --git a/mm/userswap.c b/mm/userswap.c
index a2f180b4457f..56f7140d5335 100644
--- a/mm/userswap.c
+++ b/mm/userswap.c
@@ -5,10 +5,390 @@
  * userswap core file
  */
 
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/hugetlb.h>
+#include <linux/userswap.h>
+
 #include "internal.h"
 
 DEFINE_STATIC_KEY_FALSE(userswap_enabled);
 
+static bool vma_uswap_compatible(struct vm_area_struct *vma)
+{
+	if (!vma || !vma_is_anonymous(vma) || vma->vm_file ||
+	    (vma->vm_flags & (VM_SHARED | VM_LOCKED | VM_STACK | VM_IO |
+	     VM_PFNMAP | VM_HUGETLB)))
+		return false;
+	return true;
+}
+
+/*
+ * Check if pages between 'addr ~ addr+len' can be user swapped. If so, get
+ * the reference of the pages and return the pages through input parameters
+ * 'ppages'.
+ */
+static unsigned long pages_can_be_swapped(struct mm_struct *mm,
+					  unsigned long addr,
+					  unsigned long len,
+					  struct page ***ppages)
+{
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	struct page **pages = NULL;
+	unsigned long addr_end = addr + len;
+	unsigned long ret;
+	unsigned long i, page_num = 0;
+	*ppages = NULL;
+
+	pages = kvzalloc(sizeof(struct page *) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!pages)
+		return -ENOMEM;
+
+	while (addr < addr_end) {
+		vma = find_vma(mm, addr);
+		if (!vma || addr < vma->vm_start ||
+		    !(vma->vm_flags & VM_USWAP) ||
+		    !vma_uswap_compatible(vma)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		if (!(vma->vm_flags & VM_UFFD_MISSING)) {
+			ret = -EAGAIN;
+			goto out_err;
+		}
+get_again:
+		/*
+		 * follow_page will inc page ref, dec the ref after we remap
+		 * the page.
+		 */
+		page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
+		if (IS_ERR_OR_NULL(page)) {
+			ret = -ENODEV;
+			goto out_err;
+		}
+
+		pages[page_num++] = page;
+		if (!PageAnon(page) || !PageSwapBacked(page) ||
+		    PageHuge(page) || PageSwapCache(page)) {
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		if (PageTransCompound(page)) {
+			if (trylock_page(page)) {
+				if (!split_huge_page(page)) {
+					unlock_page(page);
+					put_page(page);
+					page_num--;
+					goto get_again;
+				} else {
+					unlock_page(page);
+				}
+			}
+			ret = -EINVAL;
+			goto out_err;
+		}
+
+		/*
+		 * Check that no O_DIRECT or similar I/O is in progress on the
+		 * page
+		 */
+		if (page_mapcount(page) > 1 ||
+		    page_mapcount(page) + 1 != page_count(page)) {
+			ret = -EBUSY;
+			goto out_err;
+		}
+		addr += PAGE_SIZE;
+	}
+
+	*ppages = pages;
+	return 0;
+
+out_err:
+	for (i = 0; i < page_num; i++)
+		put_page(pages[i]);
+	kvfree(pages);
+	return ret;
+}
+
+static bool is_thp_or_huge(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = get_old_pud(mm, addr);
+	if (!pud)
+		return false;
+	else if (pud_huge(*pud))
+		return true;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd)
+		return false;
+	else if (pmd_huge(*pmd) || pmd_trans_huge(*pmd))
+		return true;
+
+	return false;
+}
+
+static int uswap_unmap_anon_page(struct mm_struct *mm,
+				 struct vm_area_struct *vma,
+				 unsigned long addr, struct page *page,
+				 pmd_t *pmd, pte_t *old_pte, bool set_to_swp)
+{
+	struct mmu_notifier_range range;
+	spinlock_t *ptl;
+	pte_t *pte, _old_pte;
+	int ret = 0;
+
+	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, addr,
+				addr + PAGE_SIZE);
+	mmu_notifier_invalidate_range_start(&range);
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_present(*pte)) {
+		ret = -EINVAL;
+		goto out_release_unlock;
+	}
+	flush_cache_page(vma, addr, pte_pfn(*pte));
+	_old_pte = ptep_clear_flush(vma, addr, pte);
+	if (old_pte)
+		*old_pte = _old_pte;
+	if (set_to_swp)
+		set_pte_at(mm, addr, pte, swp_entry_to_pte(swp_entry(
+			   SWP_USERSWAP_ENTRY, page_to_pfn(page))));
+
+	dec_mm_counter(mm, MM_ANONPAGES);
+	page_remove_rmap(page, vma, false);
+	page->mapping = NULL;
+
+out_release_unlock:
+	pte_unmap_unlock(pte, ptl);
+	mmu_notifier_invalidate_range_end(&range);
+	return ret;
+}
+
+static unsigned long vm_insert_anon_page(struct vm_area_struct *vma,
+					 unsigned long addr, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int ret = 0;
+	pte_t *pte, dst_pte;
+	spinlock_t *ptl;
+
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
+
+	flush_dcache_page(page);
+	pte = get_locked_pte(mm, addr, &ptl);
+	if (!pte)
+		return -ENOMEM;
+	if (!pte_none(*pte)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	inc_mm_counter(mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, vma, addr);
+	dst_pte = mk_pte(page, vma->vm_page_prot);
+	if (vma->vm_flags & VM_WRITE)
+		dst_pte = pte_mkwrite_novma(pte_mkdirty(dst_pte));
+	set_pte_at(mm, addr, pte, dst_pte);
+
+out_unlock:
+	pte_unmap_unlock(pte, ptl);
+	return ret;
+}
+
+static void uswap_map_anon_page(struct mm_struct *mm,
+				struct vm_area_struct *vma,
+				unsigned long addr,
+				struct page *page,
+				pmd_t *pmd,
+				pte_t old_pte)
+{
+	spinlock_t *ptl;
+	pte_t *pte;
+
+	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	flush_cache_page(vma, addr, pte_pfn(*pte));
+	set_pte_at(mm, addr, pte, old_pte);
+	inc_mm_counter(mm, MM_ANONPAGES);
+	page_add_new_anon_rmap(page, vma, addr);
+	pte_unmap_unlock(pte, ptl);
+}
+
+static void uswapout_recover(struct mm_struct *mm,
+			     unsigned long old_addr_start, unsigned long len,
+			     struct page **pages, unsigned long new_addr_start,
+			     pte_t *ptes)
+{
+	unsigned long unmap_old_addr = old_addr_start;
+	unsigned long unmap_new_addr = new_addr_start;
+	struct page *page;
+	pmd_t *old_pmd, *new_pmd;
+	pte_t pte;
+	unsigned long i;
+
+	for (i = 0; i < len; i++) {
+		page = pages[i];
+		pte = ptes[i];
+		new_pmd = mm_find_pmd(mm, new_addr_start);
+		old_pmd = mm_find_pmd(mm, unmap_old_addr);
+
+		uswap_unmap_anon_page(mm, find_vma(mm, unmap_new_addr),
+				      unmap_new_addr, page, new_pmd, NULL,
+				      false);
+		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
+				    unmap_old_addr, page, old_pmd, pte);
+		unmap_old_addr += PAGE_SIZE;
+		unmap_new_addr += PAGE_SIZE;
+	}
+	if (pte_val(ptes[len]) != 0) {
+		page = pages[len];
+		pte = ptes[len];
+		old_pmd = mm_find_pmd(mm, unmap_old_addr);
+
+		uswap_map_anon_page(mm, find_vma(mm, unmap_old_addr),
+				    unmap_old_addr, page, old_pmd, pte);
+		get_page(page);
+	}
+}
+
+/* unmap the pages between 'addr ~ addr+len' and remap them to a new address */
+static unsigned long do_user_swap(struct mm_struct *mm,
+				  unsigned long old_addr_start,
+				  unsigned long len, struct page **pages,
+				  unsigned long new_addr_start)
+{
+	struct vm_area_struct *old_vma, *new_vma;
+	unsigned long old_addr = old_addr_start;
+	unsigned long new_addr = new_addr_start;
+	struct page *page;
+	pmd_t *pmd;
+	pte_t old_pte, *ptes;
+	bool pages_dirty = false;
+	unsigned long i = 0, j;
+	int ret;
+
+	ptes = kvzalloc(sizeof(pte_t) * (len / PAGE_SIZE), GFP_KERNEL);
+	if (!ptes)
+		return -ENOMEM;
+	lru_add_drain();
+	for (j = 0; j < len; j += PAGE_SIZE) {
+		page = pages[i];
+		ret = -EINVAL;
+		if (!page)
+			goto out_recover;
+		if (is_thp_or_huge(mm, new_addr))
+			goto out_recover;
+		old_vma = find_vma(mm, old_addr);
+		if (!old_vma || old_addr < old_vma->vm_start)
+			goto out_recover;
+		new_vma = find_vma(mm, new_addr);
+		if (!new_vma || new_addr < new_vma->vm_start)
+			goto out_recover;
+		if (!vma_uswap_compatible(new_vma))
+			goto out_recover;
+
+		ret = -EACCES;
+		if (!(old_vma->vm_flags & VM_WRITE) &&
+		    (new_vma->vm_flags & VM_WRITE))
+			goto out_recover;
+
+		ret = -ENXIO;
+		pmd = mm_find_pmd(mm, old_addr);
+		if (!pmd)
+			goto out_recover;
+		ret = uswap_unmap_anon_page(mm, old_vma, old_addr, page, pmd,
+					    &old_pte, true);
+		if (ret)
+			goto out_recover;
+		ptes[i] = old_pte;
+		if (pte_dirty(old_pte) || PageDirty(page))
+			pages_dirty = true;
+		put_page(page);
+
+		ret = vm_insert_anon_page(new_vma, new_addr, page);
+		if (ret)
+			goto out_recover;
+		get_page(page);
+
+		old_addr += PAGE_SIZE;
+		new_addr += PAGE_SIZE;
+		i++;
+	}
+
+	if (pages_dirty)
+		new_addr_start = new_addr_start | USWAP_PAGES_DIRTY;
+	kvfree(ptes);
+	return new_addr_start;
+
+out_recover:
+	uswapout_recover(mm, old_addr_start, i, pages, new_addr_start, ptes);
+	kvfree(ptes);
+	return ret;
+}
+
+
+/*
+ * When flags is MREMAP_USWAP_SET_PTE, uswap_mremap() is called in syscall
+ * mremap.
+ * Unmap the pages between 'addr ~ addr + old_len' and remap them to 'new_addr
+ * ~ new_addr + new_len'. Set the pte of old_addr to SWP_USERSWAP_ENTRY.
+ */
+unsigned long uswap_mremap(unsigned long old_addr, unsigned long old_len,
+			   unsigned long new_addr, unsigned long new_len)
+{
+	struct page **pages = NULL;
+	struct mm_struct *mm = current->mm;
+	unsigned long len = old_len;
+	unsigned long ret = -EINVAL;
+	unsigned long i;
+
+	if (!static_branch_unlikely(&userswap_enabled))
+		goto out;
+
+	if (offset_in_page(old_addr))
+		goto out;
+
+	old_len = PAGE_ALIGN(old_len);
+	new_len = PAGE_ALIGN(new_len);
+
+	if (!new_len || old_len != new_len || offset_in_page(new_addr))
+		goto out;
+
+	if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len ||
+	    old_addr > TASK_SIZE - old_len)
+		goto out;
+
+	/* Ensure the old/new locations do not overlap */
+	if (old_addr + old_len > new_addr && new_addr + new_len > old_addr)
+		goto out;
+
+	lru_add_drain_all();
+	mmap_write_lock(mm);
+	ret = pages_can_be_swapped(mm, old_addr, len, &pages);
+	if (ret)
+		goto out_release_unlock;
+
+	ret = do_user_swap(mm, old_addr, len, pages, new_addr);
+	/* follow_page() above increased the reference */
+	for (i = 0; i < len / PAGE_SIZE; i++)
+		if (pages[i])
+			put_page(pages[i]);
+
+	kvfree(pages);
+
+out_release_unlock:
+	mmap_write_unlock(mm);
+out:
+	return ret;
+}
+
 static int __init enable_userswap_setup(char *str)
 {
 	static_branch_enable(&userswap_enabled);