ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7YF5R

---------------------------------------------
Allocate shared memory for the tasks in a share-pool group. All tasks in the
same group can access the memory at the same virtual address.
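A minimal usage sketch (illustrative only, not part of this patch): the
prototype and return convention of mg_sp_group_add_task() and the existence
of mg_sp_free() are assumed from the existing share_pool interface, while
mg_sp_alloc() is the interface wired up below.

	int spg_id;
	void *addr;

	/* Assumed helper: join an auto-created group; group id assumed on success. */
	spg_id = mg_sp_group_add_task(current->tgid, PROT_READ | PROT_WRITE,
				      SPG_ID_AUTO);
	if (spg_id < 0)
		return spg_id;

	/* Every task in the group sees the buffer at the same virtual address. */
	addr = mg_sp_alloc(SZ_2M, SP_HUGEPAGE, spg_id);
	if (IS_ERR(addr))
		return PTR_ERR(addr);

	/* ... use the buffer, then release it (mg_sp_free() assumed) ... */
	mg_sp_free((unsigned long)addr, spg_id);
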
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
---
 mm/share_pool.c | 343 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 341 insertions(+), 2 deletions(-)

diff --git a/mm/share_pool.c b/mm/share_pool.c
index a31aef4fa2da..06e0e0bc2d56 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -484,6 +484,11 @@ static inline bool is_local_group(int spg_id)
 	return spg_id >= SPG_ID_LOCAL_MIN && spg_id <= SPG_ID_LOCAL_MAX;
 }
 
+static int sp_init_group_master(struct task_struct *tsk, struct mm_struct *mm)
+{
+	return -EOPNOTSUPP;
+}
+
 static void update_mem_usage_alloc(unsigned long size, bool inc,
 		bool is_hugepage, struct sp_group_node *spg_node)
 {
@@ -694,6 +699,16 @@ static inline void check_interrupt_context(void)
 		panic("function can't be used in interrupt context\n");
 }
 
+struct sp_alloc_context {
+	unsigned long size;
+	unsigned long size_aligned;
+	unsigned long sp_flags;
+	nodemask_t *nodemask;
+	int preferred_node_id;
+	bool have_mbind;
+	enum spa_type type;
+};
+
 /*
  * Get the sp_group from the mm and return the associated sp_group_node.
  * The caller should promise the @mm would not be deleted from the @spg.
@@ -753,6 +768,11 @@ int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num)
 }
 EXPORT_SYMBOL_GPL(mg_sp_group_id_by_pid);
 
+static bool is_online_node_id(int node_id)
+{
+	return node_id >= 0 && node_id < MAX_NUMNODES && node_online(node_id);
+}
+
 /**
  * mg_sp_group_add_task() - Add a process to an share group (sp_group).
  * @tgid: the tgid of the task to be added.
@@ -1260,10 +1280,329 @@ static void __init proc_sharepool_init(void)
 	return;
 }
 
+/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_lock). */
+static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
+		struct sp_area *spa, unsigned long *populate,
+		unsigned long prot)
+{
+	unsigned long addr = spa->va_start;
+	unsigned long size = spa_size(spa);
+	unsigned long flags = MAP_FIXED_NOREPLACE | MAP_SHARED | MAP_POPULATE |
+			MAP_SHARE_POOL;
+	unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY;
+	unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT;
+	struct vm_area_struct *vma;
+
+	if (spa->flags & SP_PROT_RO)
+		prot &= ~PROT_WRITE;
+
+	atomic_inc(&spa->use_count);
+	addr = __do_mmap_mm(mm, file, addr, size, prot, flags, vm_flags, pgoff,
+			populate, NULL);
+	if (IS_ERR_VALUE(addr)) {
+		atomic_dec(&spa->use_count);
+		pr_err("do_mmap fails %ld\n", addr);
+		return addr;
+	}
+
+	vma = find_vma(mm, addr);
+	vma->spa = spa;
+
+	if (prot & PROT_WRITE)
+		/* clean PTE_RDONLY flags or trigger SMMU event */
+		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) |
+				PTE_DIRTY);
+	else
+		vm_flags_clear(vma, VM_MAYWRITE);
+
+	return addr;
+}
+
+static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
+		int spg_id, struct sp_alloc_context *ac)
+{
+	int device_id, node_id;
+
+	if (!sp_is_enabled())
+		return -EOPNOTSUPP;
+
+	check_interrupt_context();
+
+	device_id = sp_flags_device_id(sp_flags);
+	node_id = sp_flags & SP_SPEC_NODE_ID ? sp_flags_node_id(sp_flags) : device_id;
+	if (!is_online_node_id(node_id)) {
+		pr_err_ratelimited("invalid numa node id %d\n", node_id);
+		return -EINVAL;
+	}
+	ac->preferred_node_id = node_id;
+
+	if (current->flags & PF_KTHREAD) {
+		pr_err_ratelimited("allocation failed, task is kthread\n");
+		return -EINVAL;
+	}
+
+	if (unlikely(!size || (size >> PAGE_SHIFT) > totalram_pages())) {
+		pr_err_ratelimited("allocation failed, invalid size %lu\n", size);
+		return -EINVAL;
+	}
+
+	if (spg_id != SPG_ID_DEFAULT && (spg_id < SPG_ID_MIN || spg_id >= SPG_ID_AUTO)) {
+		pr_err_ratelimited("allocation failed, invalid group id %d\n", spg_id);
+		return -EINVAL;
+	}
+
+	if (sp_flags & (~SP_FLAG_MASK)) {
+		pr_err_ratelimited("allocation failed, invalid flag %lx\n", sp_flags);
+		return -EINVAL;
+	}
+
+	if (sp_flags & SP_HUGEPAGE_ONLY)
+		sp_flags |= SP_HUGEPAGE;
+
+	if (spg_id == SPG_ID_DEFAULT) {
+		/*
+		 * We should first init the group_master in the pass-through
+		 * scenario and not free it until the mm_struct is released.
+		 */
+		int ret = sp_init_group_master(current, current->mm);
+
+		if (ret) {
+			pr_err("sp_alloc init local mapping failed %d\n", ret);
+			return ret;
+		}
+	}
+
+	ac->type = SPA_TYPE_ALLOC;
+	ac->size = size;
+	ac->sp_flags = sp_flags;
+	ac->have_mbind = false;
+	ac->size_aligned = (sp_flags & SP_HUGEPAGE) ? ALIGN(size, PMD_SIZE) :
+			ALIGN(size, PAGE_SIZE);
+
+	return 0;
+}
+
+static bool sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac)
+{
+	/*
+	 * If hugepage allocation fails, fall back to normal pages and try
+	 * again (only if SP_HUGEPAGE_ONLY is not specified).
+	 */
+	if (!(ac->sp_flags & SP_HUGEPAGE) || (ac->sp_flags & SP_HUGEPAGE_ONLY))
+		return false;
+
+	ac->size_aligned = ALIGN(ac->size, PAGE_SIZE);
+	ac->sp_flags &= ~SP_HUGEPAGE;
+	/*
+	 * The mempolicy for shared memory lives in the backing file, which
+	 * differs between normal pages and huge pages, so we must set the
+	 * mbind policy again when we retry with normal pages.
+	 */
+	ac->have_mbind = false;
+	sp_area_put_locked(spa);
+	return true;
+}
+
+static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len,
+		nodemask_t *nodemask)
+{
+	return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES,
+			nodemask, MPOL_MF_STRICT, mm);
+}
+
+static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa,
+		unsigned long populate, struct sp_alloc_context *ac)
+{
+	int ret;
+
+	if (ac && !ac->have_mbind) {
+		ret = sp_mbind(mm, spa->va_start, spa->real_size, ac->nodemask);
+		if (ret < 0) {
+			pr_err("cannot bind the memory range to node[%*pbl], err:%d\n",
+					nodemask_pr_args(ac->nodemask), ret);
+			return ret;
+		}
+		ac->have_mbind = true;
+	}
+
+	/*
+	 * We are not ignoring errors, so if we fail to allocate
+	 * physical memory we just return failure, so we won't encounter
+	 * page fault later on, and more importantly sp_make_share_u2k()
+	 * depends on this feature (and MAP_LOCKED) to work correctly.
+	 */
+	ret = do_mm_populate(mm, spa->va_start, populate, 0);
+	if (ret) {
+		if (unlikely(fatal_signal_pending(current)))
+			pr_warn("allocation failed, current thread is killed\n");
+		else
+			pr_warn("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n",
+					ret);
+	}
+
+	return ret;
+}
+
+static int sp_k2u_populate(struct mm_struct *mm, struct sp_area *spa)
+{
+	return -EOPNOTSUPP;
+}
+
+#define SP_SKIP_ERR 1
+/*
+ * The caller should hold a reference on the spa to prevent us from mapping
+ * a dead spa into an mm_struct.
+ */
+static int sp_map_spa_to_mm(struct mm_struct *mm, struct sp_area *spa,
+		unsigned long prot, struct sp_alloc_context *ac,
+		const char *str)
+{
+	int ret;
+	unsigned long mmap_addr;
+	unsigned long populate = 0;
+
+	down_write(&mm->mmap_lock);
+	if (unlikely(!mmget_not_zero(mm))) {
+		up_write(&mm->mmap_lock);
+		pr_warn("sp_map: target mm is exiting\n");
+		return SP_SKIP_ERR;
+	}
+
+	/* when success, mmap_addr == spa->va_start */
+	mmap_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
+	if (IS_ERR_VALUE(mmap_addr)) {
+		up_write(&mm->mmap_lock);
+		mmput_async(mm);
+		pr_err("%s, sp mmap failed %ld\n", str, mmap_addr);
+		return (int)mmap_addr;
+	}
+
+	if (spa->type == SPA_TYPE_ALLOC) {
+		up_write(&mm->mmap_lock);
+		ret = sp_alloc_populate(mm, spa, populate, ac);
+		if (ret) {
+			down_write(&mm->mmap_lock);
+			do_munmap(mm, mmap_addr, spa_size(spa), NULL);
+			up_write(&mm->mmap_lock);
+		}
+	} else {
+		ret = sp_k2u_populate(mm, spa);
+		if (ret) {
+			do_munmap(mm, mmap_addr, spa_size(spa), NULL);
+			pr_info("k2u populate failed, %d\n", ret);
+		}
+		up_write(&mm->mmap_lock);
+	}
+	mmput_async(mm);
+
+	return ret;
+}
+
+static int sp_alloc_mmap_populate(struct sp_area *spa, struct sp_alloc_context *ac)
+{
+	int ret = -EINVAL;
+	int mmap_ret = 0;
+	struct mm_struct *mm;
+	struct sp_group_node *spg_node;
+
+	/* create mapping for each process in the group */
+	list_for_each_entry(spg_node, &spa->spg->proc_head, proc_node) {
+		mm = spg_node->master->mm;
+		mmap_ret = sp_map_spa_to_mm(mm, spa, spg_node->prot, ac, "sp_alloc");
+		if (mmap_ret) {
+			/*
+			 * Go to the fallback procedure on a real error, but
+			 * skip the case where the target mm is exiting, because
+			 * we don't want one misbehaving process to affect others.
+			 */
+			if (mmap_ret != SP_SKIP_ERR)
+				goto unmap;
+
+			continue;
+		}
+		ret = mmap_ret;
+	}
+
+	return ret;
+
+unmap:
+	__sp_free(spa, mm);
+
+	/*
+	 * Sometimes do_mm_populate() allocates some memory and then fails to
+	 * allocate more (e.g. when memory use reaches the cgroup limit).
+	 * In this case, it will return -ENOMEM, but will not free the
+	 * memory which has already been allocated.
+	 *
+	 * So if sp_map_spa_to_mm fails, always call sp_fallocate()
+	 * to make sure the backing physical memory of the shared file is freed.
+	 */
+	sp_fallocate(spa);
+
+	return mmap_ret;
+}
+
+void *__mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id,
+		nodemask_t *nodemask)
+{
+	int ret = 0;
+	struct sp_area *spa;
+	struct sp_group *spg;
+	nodemask_t __nodemask;
+	struct sp_alloc_context ac;
+	struct sp_group_node *spg_node;
+
+	ret = sp_alloc_prepare(size, sp_flags, spg_id, &ac);
+	if (ret)
+		return ERR_PTR(ret);
+
+	if (!nodemask) {	/* mg_sp_alloc */
+		nodes_clear(__nodemask);
+		node_set(ac.preferred_node_id, __nodemask);
+		ac.nodemask = &__nodemask;
+	} else			/* mg_sp_alloc_nodemask */
+		ac.nodemask = nodemask;
+
+	spg = sp_group_get_from_mm(current->mm, spg_id, &spg_node);
+	if (!spg) {
+		pr_err("allocation failed, can't find group(%d)\n", spg_id);
+		return ERR_PTR(-ENODEV);
+	}
+
+	down_write(&spg->rw_lock);
+try_again:
+	spa = sp_area_alloc(ac.size_aligned, ac.sp_flags, spg,
+			ac.type, current->tgid, ac.preferred_node_id);
+	if (IS_ERR(spa)) {
+		up_write(&spg->rw_lock);
+		pr_err("alloc spa failed in allocation(potential no enough virtual memory when -75): %ld\n",
+				PTR_ERR(spa));
+		ret = PTR_ERR(spa);
+		goto out;
+	}
+
+	ret = sp_alloc_mmap_populate(spa, &ac);
+	if (ret == -ENOMEM && sp_alloc_fallback(spa, &ac))
+		goto try_again;
+
+	if (!ret)
+		update_mem_usage(spa_size(spa), true, spa->is_hugepage, spg_node, spa->type);
+
+	sp_area_put_locked(spa);
+	up_write(&spg->rw_lock);
+
+out:
+	sp_group_put(spg);
+	if (ret)
+		return ERR_PTR(ret);
+	else
+		return (void *)(spa->va_start);
+}
+
 void *mg_sp_alloc_nodemask(unsigned long size, unsigned long sp_flags, int spg_id,
 		nodemask_t nodemask)
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, &nodemask);
 }
 EXPORT_SYMBOL_GPL(mg_sp_alloc_nodemask);
 
@@ -1281,7 +1620,7 @@ EXPORT_SYMBOL_GPL(mg_sp_alloc_nodemask);
  */
 void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 {
-	return ERR_PTR(-EOPNOTSUPP);
+	return __mg_sp_alloc_nodemask(size, sp_flags, spg_id, NULL);
 }
 EXPORT_SYMBOL_GPL(mg_sp_alloc);
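
Not part of the patch: a minimal illustrative sketch of the nodemask variant,
assuming the caller already holds a valid spg_id. The mask is applied to the
backing file through sp_mbind() before the pages are populated.

	nodemask_t mask;
	void *addr;

	/* Illustrative only: bind the backing memory to NUMA nodes 0 and 1. */
	nodes_clear(mask);
	node_set(0, mask);
	node_set(1, mask);

	addr = mg_sp_alloc_nodemask(SZ_4M, SP_HUGEPAGE, spg_id, mask);
	if (IS_ERR(addr))
		return PTR_ERR(addr);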