From: Chen Jun <chenjun102@huawei.com>
hulk inclusion
category: feature
bugzilla: N/A
--------------------------------
Support allocating memory from a set of nodes.

mg_sp_alloc() only allows allocating memory from a single node. If that
node does not have enough memory, the caller has to retry with the next
node, which incurs a lot of overhead.

To improve performance, add a new interface, mg_sp_alloc_nodemask(), to
allocate memory from a set of nodes in one call.
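For illustration only (not part of this patch), a kernel-side caller
could use the new interface roughly as in the sketch below. The helper
name, group id and node numbers are made up; SZ_2M comes from
<linux/sizes.h> and SP_HUGEPAGE from <linux/share_pool.h>.

  #include <linux/sizes.h>
  #include <linux/nodemask.h>
  #include <linux/err.h>
  #include <linux/share_pool.h>

  /* Hypothetical helper: allocate 2MB of share-pool memory from nodes
   * 0 and 1 in a single call, instead of calling mg_sp_alloc() and
   * retrying node by node on failure.
   */
  static void *example_alloc_from_nodes(int spg_id)
  {
  	nodemask_t nodes;
  	void *addr;

  	nodes_clear(nodes);
  	node_set(0, nodes);
  	node_set(1, nodes);

  	addr = mg_sp_alloc_nodemask(SZ_2M, SP_HUGEPAGE, spg_id, nodes);
  	if (IS_ERR(addr))
  		return NULL;

  	/* use the memory; later free it with mg_sp_free((unsigned long)addr, spg_id) */
  	return addr;
  }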
Signed-off-by: Chen Jun <chenjun102@huawei.com>
---
 include/linux/hugetlb.h              | 15 +++++
 include/linux/share_pool.h           | 10 ++--
 include/linux/share_pool_interface.h | 19 +++++++
 mm/hugetlb.c                         | 30 ++++++++--
 mm/mempolicy.c                       |  4 +-
 mm/share_pool.c                      | 84 +++++++++++++++++-----------
 6 files changed, 118 insertions(+), 44 deletions(-)
 create mode 100644 include/linux/share_pool_interface.h
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1c780d188e8c..366777ab3ba1 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -629,6 +629,9 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 
 const struct hstate *hugetlb_get_hstate(void);
 struct page *hugetlb_alloc_hugepage(int nid, int flag);
+struct page *hugetlb_alloc_hugepage_vma(struct vm_area_struct *vma,
+					unsigned long address, int flag);
+
 int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr,
 				pgprot_t prot, struct page *hpage);
 int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm,
@@ -645,6 +648,12 @@ static inline struct page *hugetlb_alloc_hugepage(int nid, int flag)
 	return NULL;
 }
+static inline struct page *hugetlb_alloc_hugepage_vma(struct vm_area_struct *vma,
+					unsigned long address, int flag)
+{
+	return NULL;
+}
+
 static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm,
 			unsigned long addr, pgprot_t prot, struct page *hpage)
 {
@@ -1091,6 +1100,12 @@ static inline struct page *hugetlb_alloc_hugepage(int nid, int flag)
 	return NULL;
 }
+static inline struct page *hugetlb_alloc_hugepage_vma(struct vm_area_struct *vma,
+					unsigned long address, int flag)
+{
+	return NULL;
+}
+
 static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm,
 			unsigned long addr, pgprot_t prot, struct page *hpage)
 {
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 8190c8d82439..04feea9b924d 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -12,6 +12,8 @@
 #include <linux/jump_label.h>
 #include <linux/kabi.h>
+#include <linux/share_pool_interface.h>
+
 #define SP_HUGEPAGE		(1 << 0)
 #define SP_HUGEPAGE_ONLY	(1 << 1)
 #define SP_DVPP			(1 << 2)
@@ -256,6 +258,8 @@ extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
 			       struct pid *pid, struct task_struct *task);
 extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id);
+extern void *mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id,
+				  nodemask_t nodemask);
 extern int mg_sp_free(unsigned long addr, int id);
 extern void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size,
@@ -286,7 +290,6 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 			     unsigned long address, pte_t *ptep, unsigned int flags);
 extern bool sp_check_addr(unsigned long addr);
 extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags);
-extern int sp_node_id(struct vm_area_struct *vma);
 static inline bool sp_is_enabled(void)
 {
@@ -452,11 +455,6 @@ static inline bool is_vmalloc_sharepool(unsigned long vm_flags)
 	return NULL;
 }
-static inline int sp_node_id(struct vm_area_struct *vma)
-{
-	return numa_node_id();
-}
-
 static inline bool sp_check_addr(unsigned long addr)
 {
 	return false;
diff --git a/include/linux/share_pool_interface.h b/include/linux/share_pool_interface.h
new file mode 100644
index 000000000000..8cd82859902f
--- /dev/null
+++ b/include/linux/share_pool_interface.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_SHARE_POOL_INTERFACE_H
+#define LINUX_SHARE_POOL_INTERFACE_H
+
+#include <linux/mman.h>
+#include <linux/mm_types.h>
+#include <linux/numa.h>
+#include <linux/kabi.h>
+
+#ifdef CONFIG_ASCEND_SHARE_POOL
+extern int sp_node_id(struct vm_area_struct *vma);
+#else
+static inline int sp_node_id(struct vm_area_struct *vma)
+{
+	return numa_node_id();
+}
+#endif /* !CONFIG_ASCEND_SHARE_POOL */
+
+#endif /* LINUX_SHARE_POOL_INTERFACE_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a55197135afa..a1cf6a1e9cec 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6312,7 +6312,7 @@ static struct page *hugetlb_alloc_hugepage_normal(struct hstate *h,
 /*
  * Allocate hugepage without reserve
  */
-struct page *hugetlb_alloc_hugepage(int nid, int flag)
+struct page *hugetlb_alloc_hugepage_nodemask(int nid, int flag, nodemask_t *nodemask)
 {
 	struct hstate *h = &default_hstate;
 	gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -6327,7 +6327,6 @@ struct page *hugetlb_alloc_hugepage(int nid, int flag)
 	if (flag & ~HUGETLB_ALLOC_MASK)
 		return NULL;
 
-	gfp_mask |= __GFP_THISNODE;
 	if (enable_charge_mighp)
 		gfp_mask |= __GFP_ACCOUNT;
 
@@ -6337,12 +6336,22 @@ struct page *hugetlb_alloc_hugepage(int nid, int flag)
 	if (flag & HUGETLB_ALLOC_NORMAL)
 		page = hugetlb_alloc_hugepage_normal(h, gfp_mask, nid);
 	else if (flag & HUGETLB_ALLOC_BUDDY)
-		page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+		page = alloc_migrate_huge_page(h, gfp_mask, nid, nodemask);
 	else
-		page = alloc_huge_page_nodemask(h, nid, NULL, gfp_mask);
+		page = alloc_huge_page_nodemask(h, nid, nodemask, gfp_mask);
 
 	return page;
 }
+
+struct page *hugetlb_alloc_hugepage(int nid, int flag)
+{
+	nodemask_t nodemask;
+
+	nodes_clear(nodemask);
+	node_set(nid, nodemask);
+
+	return hugetlb_alloc_hugepage_nodemask(nid, flag, &nodemask);
+}
 EXPORT_SYMBOL_GPL(hugetlb_alloc_hugepage);
 
 static pte_t *hugetlb_huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
@@ -6364,6 +6373,19 @@ static pte_t *hugetlb_huge_pte_alloc(struct mm_struct *mm, unsigned long addr,
 	return ptep;
 }
+struct page *hugetlb_alloc_hugepage_vma(struct vm_area_struct *vma, unsigned long address, int flag)
+{
+	int nid;
+	struct hstate *h = hstate_vma(vma);
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	gfp_t gfp_mask;
+
+	gfp_mask = htlb_alloc_mask(h);
+	nid = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	return hugetlb_alloc_hugepage_nodemask(nid, flag, nodemask);
+}
+
 static int __hugetlb_insert_hugepage(struct mm_struct *mm, unsigned long addr,
 				     pgprot_t prot, unsigned long pfn)
 {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b58ec3f98896..c43df3206ab6 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -103,6 +103,8 @@
 #include <linux/printk.h>
 #include <linux/swapops.h>
+#include <linux/share_pool_interface.h>
+
 #include <asm/tlbflush.h>
 #include <linux/uaccess.h>
@@ -2198,7 +2200,7 @@ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags,
 		nid = interleave_nid(*mpol, vma, addr,
 					huge_page_shift(hstate_vma(vma)));
 	} else {
-		nid = policy_node(gfp_flags, *mpol, numa_node_id());
+		nid = policy_node(gfp_flags, *mpol, sp_node_id(vma));
 		if ((*mpol)->mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY)
 			*nodemask = &(*mpol)->v.nodes;
 	}
diff --git a/mm/share_pool.c b/mm/share_pool.c
index ce4837da8a9a..8b46af5fae47 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -700,7 +700,7 @@ struct sp_area {
 	struct mm_struct *mm;		/* owner of k2u(task) */
 	unsigned long kva;		/* shared kva */
 	pid_t applier;			/* the original applier process */
-	int node_id;			/* memory node */
+	int preferred_node_id;		/* memory node */
 	int device_id;
 };
 static DEFINE_SPINLOCK(sp_area_lock);
@@ -1892,7 +1892,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
 	spa->mm = NULL;
 	spa->kva = 0;		/* NULL pointer */
 	spa->applier = applier;
-	spa->node_id = node_id;
+	spa->preferred_node_id = node_id;
 	spa->device_id = device_id;
 
 	spa_inc_usage(spa);
@@ -2191,7 +2191,9 @@ static int sp_free_get_spa(struct sp_free_context *fc)
 }
 /**
- * mg_sp_free() - Free the memory allocated by mg_sp_alloc().
+ * mg_sp_free() - Free the memory allocated by mg_sp_alloc() or
+ *	mg_sp_alloc_nodemask().
+ *
  * @addr: the starting VA of the memory.
  * @id: Address space identifier, which is used to distinguish the addr.
  *
@@ -2448,18 +2450,15 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa,
 }
 
 static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len,
-		unsigned long node)
+		nodemask_t *nodemask)
 {
-	nodemask_t nmask;
-
-	nodes_clear(nmask);
-	node_set(node, nmask);
 	return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES,
-			&nmask, MPOL_MF_STRICT, mm);
+			nodemask, MPOL_MF_STRICT, mm);
 }
 
 static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
-		struct sp_group_node *spg_node, struct sp_alloc_context *ac)
+		struct sp_group_node *spg_node, struct sp_alloc_context *ac,
+		nodemask_t *nodemask)
 {
 	int ret;
@@ -2468,10 +2467,10 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 		return ret;
 
 	if (!ac->have_mbind) {
-		ret = sp_mbind(mm, spa->va_start, spa->real_size, spa->node_id);
+		ret = sp_mbind(mm, spa->va_start, spa->real_size, nodemask);
 		if (ret < 0) {
-			pr_err("cannot bind the memory range to specified node:%d, err:%d\n",
-			       spa->node_id, ret);
+			pr_err("cannot bind the memory range to node[%*pbl], err:%d\n",
+			       nodemask_pr_args(nodemask), ret);
 			return ret;
 		}
 		ac->have_mbind = true;
@@ -2490,17 +2489,25 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 }
 
 static int sp_alloc_mmap_populate(struct sp_area *spa,
-				  struct sp_alloc_context *ac)
+				  struct sp_alloc_context *ac,
+				  nodemask_t *nodemask)
 {
 	int ret = -EINVAL;
 	int mmap_ret = 0;
 	struct mm_struct *mm, *end_mm = NULL;
 	struct sp_group_node *spg_node;
+	nodemask_t __nodemask;
+
+	if (!nodemask) { /* mg_sp_alloc */
+		nodes_clear(__nodemask);
+		node_set(spa->preferred_node_id, __nodemask);
+	} else /* mg_sp_alloc_nodemask */
+		__nodemask = *nodemask;
 
 	/* create mapping for each process in the group */
 	list_for_each_entry(spg_node, &spa->spg->procs, proc_node) {
 		mm = spg_node->master->mm;
-		mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac);
+		mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac, &__nodemask);
 		if (mmap_ret) {
 			/*
@@ -2563,19 +2570,8 @@ static void sp_alloc_finish(int result, struct sp_area *spa,
 	sp_group_put(spg);
 }
 
-/**
- * mg_sp_alloc() - Allocate shared memory for all the processes in a sp_group.
- * @size: the size of memory to allocate.
- * @sp_flags: how to allocate the memory.
- * @spg_id: the share group that the memory is allocated to.
- *
- * Use pass through allocation if spg_id == SPG_ID_DEFAULT in multi-group mode.
- *
- * Return:
- * * if succeed, return the starting address of the shared memory.
- * * if fail, return the pointer of -errno.
- */
-void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
+void *__mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id,
+			     nodemask_t *nodemask)
 {
 	struct sp_area *spa = NULL;
 	int ret = 0;
@@ -2598,7 +2594,7 @@ void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		goto out;
 	}
 
-	ret = sp_alloc_mmap_populate(spa, &ac);
+	ret = sp_alloc_mmap_populate(spa, &ac, nodemask);
 	if (ret && ac.state == ALLOC_RETRY) {
 		/*
 		 * The mempolicy for shared memory is located at backend file, which varies
@@ -2616,6 +2612,30 @@ void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 	else
 		return (void *)(spa->va_start);
 }
+
+void *mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id,
+			   nodemask_t nodemask)
+{
+	return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, &nodemask);
+}
+EXPORT_SYMBOL_GPL(mg_sp_alloc_nodemask);
+
+/**
+ * mg_sp_alloc() - Allocate shared memory for all the processes in a sp_group.
+ * @size: the size of memory to allocate.
+ * @sp_flags: how to allocate the memory.
+ * @spg_id: the share group that the memory is allocated to.
+ *
+ * Use pass through allocation if spg_id == SPG_ID_DEFAULT in multi-group mode.
+ *
+ * Return:
+ * * if succeed, return the starting address of the shared memory.
+ * * if fail, return the pointer of -errno.
+ */
+void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
+{
+	return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, NULL);
+}
 EXPORT_SYMBOL_GPL(mg_sp_alloc);
 
 /**
@@ -3599,7 +3619,7 @@ int sp_node_id(struct vm_area_struct *vma)
 	if (vma && (vma->vm_flags & VM_SHARE_POOL) && vma->vm_private_data) {
 		spa = vma->vm_private_data;
-		node_id = spa->node_id;
+		node_id = spa->preferred_node_id;
 	}
 
 	return node_id;
@@ -4028,7 +4048,6 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 	unsigned long haddr = address & huge_page_mask(h);
 	bool new_page = false;
 	int err;
-	int node_id;
 	struct sp_area *spa;
 	bool charge_hpage;
@@ -4037,7 +4056,6 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 		pr_err("share pool: vma is invalid, not from sp mmap\n");
 		return ret;
 	}
-	node_id = spa->node_id;
 
 retry:
 	page = find_lock_page(mapping, idx);
@@ -4049,7 +4067,7 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 		charge_hpage = false;
 		page = alloc_huge_page(vma, haddr, 0);
 		if (IS_ERR(page)) {
-			page = hugetlb_alloc_hugepage(node_id,
+			page = hugetlb_alloc_hugepage_vma(vma, haddr,
 					HUGETLB_ALLOC_BUDDY | HUGETLB_ALLOC_NORECLAIM);
 			if (!page)
 				page = ERR_PTR(-ENOMEM);