From: Wang Wensheng <wangwensheng4@huawei.com>
ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA
-------------------
This is a simple wrapper around walk_page_range() to get all the pages of a spa. It does not support holes.
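For illustration only, a rough usage sketch of the new interface (not part of this patch): do_something(), uva, size and tsk are placeholders, and struct sp_walk_data is assumed to come from the share pool header.

	struct sp_walk_data data = {0};
	unsigned long i;
	int ret;

	/* pins every page backing [uva, uva + size) of tsk's mm */
	ret = sp_walk_page_range(uva, size, tsk, &data);
	if (ret)
		return ret;

	for (i = 0; i < data.page_count; i++)
		do_something(data.pages[i]);	/* placeholder consumer */

	/* drops the page references and frees the page array */
	sp_walk_page_free(&data);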
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 241 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index fe0e36a2214e..28bf0de8813b 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -51,6 +51,7 @@
 #include <linux/mmzone.h>
 #include <linux/timekeeping.h>
 #include <linux/time64.h>
+#include <linux/pagewalk.h>
 
 /* access control mode macros */
 #define AC_NONE 0
@@ -494,6 +495,12 @@ static struct file *spa_file(struct sp_area *spa)
 	return spa->spg->file;
 }
 
+static inline void check_interrupt_context(void)
+{
+	if (unlikely(in_interrupt()))
+		panic("function can't be used in interrupt context\n");
+}
+
 static struct sp_group *create_spg(int spg_id)
 {
 	return NULL;
@@ -664,6 +671,201 @@ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);
 
+static int sp_pmd_entry(pmd_t *pmd, unsigned long addr,
+			unsigned long next, struct mm_walk *walk)
+{
+	struct sp_walk_data *sp_walk_data = walk->private;
+
+	sp_walk_data->pmd = pmd;
+	return 0;
+}
+
+static int sp_pte_entry(pte_t *pte, unsigned long addr,
+			unsigned long next, struct mm_walk *walk)
+{
+	struct page *page;
+	struct sp_walk_data *sp_walk_data = walk->private;
+	pmd_t *pmd = sp_walk_data->pmd;
+
+retry:
+	if (unlikely(!pte_present(*pte))) {
+		swp_entry_t entry;
+
+		if (pte_none(*pte))
+			goto no_page;
+		entry = pte_to_swp_entry(*pte);
+		if (!is_migration_entry(entry))
+			goto no_page;
+		migration_entry_wait(walk->mm, pmd, addr);
+		goto retry;
+	}
+
+	page = pte_page(*pte);
+	get_page(page);
+	sp_walk_data->pages[sp_walk_data->page_count++] = page;
+	return 0;
+
+no_page:
+	pr_debug("the page of addr %lx unexpectedly not in RAM\n",
+		 (unsigned long)addr);
+	return -EFAULT;
+}
+
+static int sp_test_walk(unsigned long addr, unsigned long next,
+			struct mm_walk *walk)
+{
+	/*
+	 * FIXME: The devmm driver uses remap_pfn_range() but actually there
+	 * are associated struct pages, so they should use vm_map_pages() or
+	 * similar APIs. Before the driver has been converted to correct APIs
+	 * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as
+	 * normal VMAs.
+	 */
+	return 0;
+}
+
+static int sp_pte_hole(unsigned long start, unsigned long end,
+		       int depth, struct mm_walk *walk)
+{
+	pr_debug("hole [%lx, %lx) appeared unexpectedly\n", (unsigned long)start, (unsigned long)end);
+	return -EFAULT;
+}
+
+static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long next,
+			    struct mm_walk *walk)
+{
+	pte_t pte = huge_ptep_get(ptep);
+	struct page *page = pte_page(pte);
+	struct sp_walk_data *sp_walk_data;
+
+	if (unlikely(!pte_present(pte))) {
+		pr_debug("the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr);
+		return -EFAULT;
+	}
+
+	sp_walk_data = walk->private;
+	get_page(page);
+	sp_walk_data->pages[sp_walk_data->page_count++] = page;
+	return 0;
+}
+
+/*
+ * __sp_walk_page_range() - Walk page table with caller specific callbacks.
+ * @uva: the start VA of user memory.
+ * @size: the size of user memory.
+ * @mm: mm struct of the target task.
+ * @sp_walk_data: a structure containing the page pointer array.
+ *
+ * The caller must hold mm->mmap_lock.
+ *
+ * Notes for parameter alignment:
+ * When size == 0, let it be page_size, so that at least one page is walked.
+ *
+ * When size > 0, for convenience, usually the parameters of uva and
+ * size are not page aligned. There are four different alignment scenarios and
+ * we must handle all of them correctly.
+ *
+ * The basic idea is to align down uva and align up size so all the pages
+ * in range [uva, uva + size) are walked. However, there are special cases.
+ *
+ * Consider a 2M-hugepage addr scenario. Assume the caller wants to
+ * traverse range [1001M, 1004.5M), so uva and size are 1001M and 3.5M
+ * respectively. The aligned-down uva is 1000M and the aligned-up size is 4M.
+ * The traverse range will be [1000M, 1004M). Obviously, the final page for
+ * [1004M, 1004.5M) is not covered.
+ *
+ * To fix this problem, we need to walk an additional page: size should be
+ * ALIGN(uva + size, page_size) - uva_aligned.
+ */
+static int __sp_walk_page_range(unsigned long uva, unsigned long size,
+				struct mm_struct *mm, struct sp_walk_data *sp_walk_data)
+{
+	int ret = 0;
+	struct vm_area_struct *vma;
+	unsigned long page_nr;
+	struct page **pages = NULL;
+	bool is_hugepage = false;
+	unsigned long uva_aligned;
+	unsigned long size_aligned;
+	unsigned int page_size = PAGE_SIZE;
+	struct mm_walk_ops sp_walk = {};
+
+	/*
+	 * Here we also support non-share-pool memory in this interface
+	 * because the caller can't distinguish whether a uva is from the
+	 * share pool or not. It is not the best idea to do so, but currently
+	 * it simplifies the overall design.
+	 *
+	 * In this situation, the correctness of the parameters is mainly
+	 * guaranteed by the caller.
+	 */
+	vma = find_vma(mm, uva);
+	if (!vma) {
+		pr_debug("u2k input uva %lx is invalid\n", (unsigned long)uva);
+		return -EINVAL;
+	}
+	if (is_vm_hugetlb_page(vma))
+		is_hugepage = true;
+
+	sp_walk.pte_hole = sp_pte_hole;
+	sp_walk.test_walk = sp_test_walk;
+	if (is_hugepage) {
+		sp_walk_data->is_hugepage = true;
+		sp_walk.hugetlb_entry = sp_hugetlb_entry;
+		page_size = PMD_SIZE;
+	} else {
+		sp_walk_data->is_hugepage = false;
+		sp_walk.pte_entry = sp_pte_entry;
+		sp_walk.pmd_entry = sp_pmd_entry;
+	}
+
+	sp_walk_data->page_size = page_size;
+	uva_aligned = ALIGN_DOWN(uva, page_size);
+	sp_walk_data->uva_aligned = uva_aligned;
+	if (size == 0)
+		size_aligned = page_size;
+	else
+		/* special alignment handling */
+		size_aligned = ALIGN(uva + size, page_size) - uva_aligned;
+
+	if (uva_aligned + size_aligned < uva_aligned) {
+		pr_err_ratelimited("overflow happened in walk page range\n");
+		return -EINVAL;
+	}
+
+	page_nr = size_aligned / page_size;
+	pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		pr_err_ratelimited("alloc page array failed in walk page range\n");
+		return -ENOMEM;
+	}
+	sp_walk_data->pages = pages;
+
+	ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned,
+			      &sp_walk, sp_walk_data);
+	if (ret)
+		kvfree(pages);
+
+	return ret;
+}
+
+static void __sp_walk_page_free(struct sp_walk_data *data)
+{
+	int i = 0;
+	struct page *page;
+
+	while (i < data->page_count) {
+		page = data->pages[i++];
+		put_page(page);
+	}
+
+	kvfree(data->pages);
+	/* prevent repeated release */
+	data->page_count = 0;
+	data->pages = NULL;
+}
+
 /**
  * sp_make_share_u2k() - Share user memory of a specified process to kernel.
  * @uva: the VA of shared user memory
@@ -723,7 +925,39 @@ EXPORT_SYMBOL_GPL(mg_sp_unshare);
 int sp_walk_page_range(unsigned long uva, unsigned long size,
 		       struct task_struct *tsk, struct sp_walk_data *sp_walk_data)
 {
-	return 0;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	check_interrupt_context();
+
+	if (unlikely(!sp_walk_data)) {
+		pr_err_ratelimited("null pointer when walk page range\n");
+		return -EINVAL;
+	}
+	if (!tsk || (tsk->flags & PF_EXITING))
+		return -ESRCH;
+
+	get_task_struct(tsk);
+	mm = get_task_mm(tsk);
+	if (!mm) {
+		put_task_struct(tsk);
+		return -ESRCH;
+	}
+
+	sp_walk_data->page_count = 0;
+	down_write(&mm->mmap_lock);
+	if (likely(!mm->core_state))
+		ret = __sp_walk_page_range(uva, size, mm, sp_walk_data);
+	else {
+		pr_err("walk page range: encountered coredump\n");
+		ret = -ESRCH;
+	}
+	up_write(&mm->mmap_lock);
+
+	mmput(mm);
+	put_task_struct(tsk);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sp_walk_page_range);
@@ -740,7 +974,12 @@ EXPORT_SYMBOL_GPL(mg_sp_walk_page_range);
  */
 void sp_walk_page_free(struct sp_walk_data *sp_walk_data)
 {
-	return;
+	check_interrupt_context();
+
+	if (!sp_walk_data)
+		return;
+
+	__sp_walk_page_free(sp_walk_data);
 }
 EXPORT_SYMBOL_GPL(sp_walk_page_free);