From: Wang Wensheng <wangwensheng4@huawei.com>
ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA
-------------------
This is a simple wrapper around walk_page_range() to get all the pages of a spa. It does not support holes.
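For illustration only, a rough usage sketch of the new interface (not part of this patch): do_something(), uva, size and tsk are placeholders, and struct sp_walk_data is assumed to come from the share pool header.

	struct sp_walk_data data = {0};
	unsigned long i;
	int ret;

	/* pins every page backing [uva, uva + size) of tsk's mm */
	ret = sp_walk_page_range(uva, size, tsk, &data);
	if (ret)
		return ret;

	for (i = 0; i < data.page_count; i++)
		do_something(data.pages[i]);	/* placeholder consumer */

	/* drops the page references and frees the page array */
	sp_walk_page_free(&data);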
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 241 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index fe0e36a2214e..28bf0de8813b 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -51,6 +51,7 @@
 #include <linux/mmzone.h>
 #include <linux/timekeeping.h>
 #include <linux/time64.h>
+#include <linux/pagewalk.h>
 
 /* access control mode macros */
 #define AC_NONE 0
@@ -494,6 +495,12 @@ static struct file *spa_file(struct sp_area *spa)
 	return spa->spg->file;
 }
 
+static inline void check_interrupt_context(void)
+{
+	if (unlikely(in_interrupt()))
+		panic("function can't be used in interrupt context\n");
+}
+
 static struct sp_group *create_spg(int spg_id)
 {
 	return NULL;
@@ -664,6 +671,201 @@ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size,
 }
 EXPORT_SYMBOL_GPL(mg_sp_make_share_k2u);
 
+static int sp_pmd_entry(pmd_t *pmd, unsigned long addr,
+			unsigned long next, struct mm_walk *walk)
+{
+	struct sp_walk_data *sp_walk_data = walk->private;
+
+	sp_walk_data->pmd = pmd;
+	return 0;
+}
+
+static int sp_pte_entry(pte_t *pte, unsigned long addr,
+			unsigned long next, struct mm_walk *walk)
+{
+	struct page *page;
+	struct sp_walk_data *sp_walk_data = walk->private;
+	pmd_t *pmd = sp_walk_data->pmd;
+
+retry:
+	if (unlikely(!pte_present(*pte))) {
+		swp_entry_t entry;
+
+		if (pte_none(*pte))
+			goto no_page;
+		entry = pte_to_swp_entry(*pte);
+		if (!is_migration_entry(entry))
+			goto no_page;
+		migration_entry_wait(walk->mm, pmd, addr);
+		goto retry;
+	}
+
+	page = pte_page(*pte);
+	get_page(page);
+	sp_walk_data->pages[sp_walk_data->page_count++] = page;
+	return 0;
+
+no_page:
+	pr_debug("the page of addr %lx unexpectedly not in RAM\n",
+		 (unsigned long)addr);
+	return -EFAULT;
+}
+
+static int sp_test_walk(unsigned long addr, unsigned long next,
+			struct mm_walk *walk)
+{
+	/*
+	 * FIXME: The devmm driver uses remap_pfn_range() but actually there
+	 * are associated struct pages, so they should use vm_map_pages() or
+	 * similar APIs. Before the driver has been converted to correct APIs
+	 * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as
+	 * normal VMAs.
+	 */
+	return 0;
+}
+
+static int sp_pte_hole(unsigned long start, unsigned long end,
+		       int depth, struct mm_walk *walk)
+{
+	pr_debug("hole [%lx, %lx) appeared unexpectedly\n", (unsigned long)start, (unsigned long)end);
+	return -EFAULT;
+}
+
+static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask,
+			    unsigned long addr, unsigned long next,
+			    struct mm_walk *walk)
+{
+	pte_t pte = huge_ptep_get(ptep);
+	struct page *page = pte_page(pte);
+	struct sp_walk_data *sp_walk_data;
+
+	if (unlikely(!pte_present(pte))) {
+		pr_debug("the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr);
+		return -EFAULT;
+	}
+
+	sp_walk_data = walk->private;
+	get_page(page);
+	sp_walk_data->pages[sp_walk_data->page_count++] = page;
+	return 0;
+}
+
+/*
+ * __sp_walk_page_range() - Walk page table with caller specific callbacks.
+ * @uva: the start VA of user memory.
+ * @size: the size of user memory.
+ * @mm: mm struct of the target task.
+ * @sp_walk_data: a structure containing the page pointer array.
+ *
+ * The caller must hold mm->mmap_lock.
+ *
+ * Notes for parameter alignment:
+ * When size == 0, let it be page_size, so that at least one page is walked.
+ *
+ * When size > 0, for convenience, usually the parameters of uva and
+ * size are not page aligned. There are four different alignment scenarios and
+ * we must handle all of them correctly.
+ *
+ * The basic idea is to align down uva and align up size so all the pages
+ * in range [uva, uva + size) are walked. However, there are special cases.
+ *
+ * Consider a 2M-hugepage addr scenario. Assume the caller wants to
+ * traverse range [1001M, 1004.5M), so uva and size are 1001M and 3.5M
+ * respectively. The aligned-down uva is 1000M and the aligned-up size is 4M.
+ * The traverse range will be [1000M, 1004M). Obviously, the final page for
+ * [1004M, 1004.5M) is not covered.
+ *
+ * To fix this problem, we need to walk an additional page: size should be
+ * ALIGN(uva + size, page_size) - uva_aligned.
+ */
+static int __sp_walk_page_range(unsigned long uva, unsigned long size,
+				struct mm_struct *mm, struct sp_walk_data *sp_walk_data)
+{
+	int ret = 0;
+	struct vm_area_struct *vma;
+	unsigned long page_nr;
+	struct page **pages = NULL;
+	bool is_hugepage = false;
+	unsigned long uva_aligned;
+	unsigned long size_aligned;
+	unsigned int page_size = PAGE_SIZE;
+	struct mm_walk_ops sp_walk = {};
+
+	/*
+	 * Here we also support non-share-pool memory in this interface
+	 * because the caller can't distinguish whether a uva is from the
+	 * share pool or not. It is not the best idea to do so, but currently
+	 * it simplifies the overall design.
+	 *
+	 * In this situation, the correctness of the parameters is mainly
+	 * guaranteed by the caller.
+	 */
+	vma = find_vma(mm, uva);
+	if (!vma) {
+		pr_debug("u2k input uva %lx is invalid\n", (unsigned long)uva);
+		return -EINVAL;
+	}
+	if (is_vm_hugetlb_page(vma))
+		is_hugepage = true;
+
+	sp_walk.pte_hole = sp_pte_hole;
+	sp_walk.test_walk = sp_test_walk;
+	if (is_hugepage) {
+		sp_walk_data->is_hugepage = true;
+		sp_walk.hugetlb_entry = sp_hugetlb_entry;
+		page_size = PMD_SIZE;
+	} else {
+		sp_walk_data->is_hugepage = false;
+		sp_walk.pte_entry = sp_pte_entry;
+		sp_walk.pmd_entry = sp_pmd_entry;
+	}
+
+	sp_walk_data->page_size = page_size;
+	uva_aligned = ALIGN_DOWN(uva, page_size);
+	sp_walk_data->uva_aligned = uva_aligned;
+	if (size == 0)
+		size_aligned = page_size;
+	else
+		/* special alignment handling */
+		size_aligned = ALIGN(uva + size, page_size) - uva_aligned;
+
+	if (uva_aligned + size_aligned < uva_aligned) {
+		pr_err_ratelimited("overflow happened in walk page range\n");
+		return -EINVAL;
+	}
+
+	page_nr = size_aligned / page_size;
+	pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL);
+	if (!pages) {
+		pr_err_ratelimited("alloc page array failed in walk page range\n");
+		return -ENOMEM;
+	}
+	sp_walk_data->pages = pages;
+
+	ret = walk_page_range(mm, uva_aligned, uva_aligned + size_aligned,
+			      &sp_walk, sp_walk_data);
+	if (ret)
+		kvfree(pages);
+
+	return ret;
+}
+
+static void __sp_walk_page_free(struct sp_walk_data *data)
+{
+	int i = 0;
+	struct page *page;
+
+	while (i < data->page_count) {
+		page = data->pages[i++];
+		put_page(page);
+	}
+
+	kvfree(data->pages);
+	/* prevent repeated release */
+	data->page_count = 0;
+	data->pages = NULL;
+}
+
 /**
  * sp_make_share_u2k() - Share user memory of a specified process to kernel.
  * @uva: the VA of shared user memory
@@ -723,7 +925,39 @@ EXPORT_SYMBOL_GPL(mg_sp_unshare);
 int sp_walk_page_range(unsigned long uva, unsigned long size,
 		       struct task_struct *tsk, struct sp_walk_data *sp_walk_data)
 {
-	return 0;
+	struct mm_struct *mm;
+	int ret = 0;
+
+	check_interrupt_context();
+
+	if (unlikely(!sp_walk_data)) {
+		pr_err_ratelimited("null pointer when walk page range\n");
+		return -EINVAL;
+	}
+	if (!tsk || (tsk->flags & PF_EXITING))
+		return -ESRCH;
+
+	get_task_struct(tsk);
+	mm = get_task_mm(tsk);
+	if (!mm) {
+		put_task_struct(tsk);
+		return -ESRCH;
+	}
+
+	sp_walk_data->page_count = 0;
+	down_write(&mm->mmap_lock);
+	if (likely(!mm->core_state))
+		ret = __sp_walk_page_range(uva, size, mm, sp_walk_data);
+	else {
+		pr_err("walk page range: encountered coredump\n");
+		ret = -ESRCH;
+	}
+	up_write(&mm->mmap_lock);
+
+	mmput(mm);
+	put_task_struct(tsk);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sp_walk_page_range);
@@ -740,7 +974,12 @@ EXPORT_SYMBOL_GPL(mg_sp_walk_page_range);
  */
 void sp_walk_page_free(struct sp_walk_data *sp_walk_data)
 {
-	return;
+	check_interrupt_context();
+
+	if (!sp_walk_data)
+		return;
+
+	__sp_walk_page_free(sp_walk_data);
 }
 EXPORT_SYMBOL_GPL(sp_walk_page_free);