From: Wang Wensheng <wangwensheng4@huawei.com>

ascend inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------
To keep mmap away from the virtual address space reserved for the share pool, we currently clamp high_limit to MMAP_SHARE_POOL_START in arch_get_unmapped_area() and arch_get_unmapped_area_topdown(). In the mmap-topdown case this makes the mmap start address always MMAP_SHARE_POOL_START, which breaks ASLR.

To fix this, set mm->mmap_base based on MMAP_SHARE_POOL_START instead of STACK_TOP in the topdown case, so the random offset is applied below the share pool region instead of being lost to the clamp.
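For illustration, a minimal userspace sketch of the two formulas (placeholder constants, not the kernel's actual values) shows that the old base lands above MMAP_SHARE_POOL_START and loses its randomization once high_limit is clamped there, whereas the new base applies the same rnd directly below the share pool:

/*
 * Minimal userspace sketch (not kernel code). STACK_TOP,
 * MMAP_SHARE_POOL_START, gap and rnd below are placeholder values; the
 * point is only to show where the ASLR offset ends up in each formula.
 */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGE_ALIGN(x)		(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define ALIGN_DOWN(x, a)	((x) & ~((a) - 1))

#define STACK_TOP		0x1000000000000UL	/* placeholder */
#define MMAP_SHARE_POOL_START	0xe80000000000UL	/* placeholder */

int main(void)
{
	unsigned long gap = 0x1000000UL;	/* stack gap, placeholder */
	unsigned long rnd = 0x3f7000UL;		/* ASLR offset, placeholder */

	/* old: randomized base sits above the share pool, so the later
	 * clamp to MMAP_SHARE_POOL_START throws the randomization away */
	unsigned long old_base = PAGE_ALIGN(STACK_TOP - gap - rnd);

	/* new: base starts below the share pool and keeps the offset */
	unsigned long new_base = ALIGN_DOWN(MMAP_SHARE_POOL_START - rnd, PAGE_SIZE);

	printf("old mmap_base (before clamp): %#lx\n", old_base);
	printf("new mmap_base:                %#lx\n", new_base);
	return 0;
}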
Fixes: 4bdd5c21793e ("ascend: memory: introduce do_mm_populate and hugetlb_insert_hugepage")
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/mm/mmap.c       | 6 +++++-
 include/linux/share_pool.h | 4 ++--
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c index ac89686c4af89..87f29df8126ba 100644 --- a/arch/arm64/mm/mmap.c +++ b/arch/arm64/mm/mmap.c @@ -28,6 +28,7 @@ #include <linux/io.h> #include <linux/personality.h> #include <linux/random.h> +#include <linux/share_pool.h>
#include <asm/cputype.h>
@@ -80,7 +81,10 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) else if (gap > MAX_GAP) gap = MAX_GAP;
- return PAGE_ALIGN(STACK_TOP - gap - rnd); + if (sp_is_enabled()) + return ALIGN_DOWN(MMAP_SHARE_POOL_START - rnd, PAGE_SIZE); + else + return PAGE_ALIGN(STACK_TOP - gap - rnd); }
/* diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 9650f257b3ad7..9557a8be46677 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -130,8 +130,6 @@ struct sp_proc_stat { atomic64_t k2u_size; };
-#ifdef CONFIG_ASCEND_SHARE_POOL - #define MAP_SHARE_POOL 0x100000
#define MMAP_TOP_4G_SIZE 0x100000000UL @@ -148,6 +146,8 @@ struct sp_proc_stat { #define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_SIZE) #define MMAP_SHARE_POOL_16G_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_DVPP_SIZE)
+#ifdef CONFIG_ASCEND_SHARE_POOL + static inline void sp_init_mm(struct mm_struct *mm) { mm->sp_group = NULL;
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
The share pool needs to support a multi-group mode for the automotive platform; this enhances system reliability and security.

The new multi-group mode can be enabled from the boot command line. When it is disabled, the share pool only supports the default single-group mode. When it is enabled, a task can be added to several groups (at most 3000), and at most 50000 groups can be created in the whole system.

This patch also fixes the kABI problem for struct mm_struct.
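Usage note (the parameter name matches the __setup() hook added in mm/share_pool.c below, and it takes no value): booting with enable_sp_multi_group_mode on the kernel command line switches the share pool to MULTI_GROUP_MODE; omitting it leaves the default SINGLE_GROUP_MODE.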
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mm_types.h   |   9 +-
 include/linux/share_pool.h |  36 +++-
 mm/share_pool.c            | 363 ++++++++++++++++++++++++++-----------
 3 files changed, 292 insertions(+), 116 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 51a85ba5ac915..4e45bfd088bff 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -470,11 +470,6 @@ struct mm_struct { #endif struct user_namespace *user_ns;
-#ifdef CONFIG_ASCEND_SHARE_POOL - struct sp_group *sp_group; - struct list_head sp_node; /* link to sp_group->procs */ - int sp_stat_id; -#endif /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER @@ -525,6 +520,10 @@ struct mm_struct { KABI_RESERVE(1) #endif
+#ifdef CONFIG_ASCEND_SHARE_POOL + struct sp_group_master *sp_group_master; +#endif + KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 9557a8be46677..597b96129bf86 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -80,9 +80,9 @@ struct sp_group { int hugepage_failures; struct file *file; struct file *file_hugetlb; - /* list head of processes */ + /* list head of processes (sp_group_node, each represents a process) */ struct list_head procs; - /* list of sp_area. it is protected by spin_lock sp_area_lock */ + /* list head of sp_area. it is protected by spin_lock sp_area_lock */ struct list_head spa_list; /* number of sp_area */ atomic_t spa_num; @@ -107,6 +107,34 @@ struct sp_group { struct rw_semaphore rw_lock; };
+/* a per-process(per mm) struct which manages a sp_group_node list */ +struct sp_group_master { + /* + * number of sp groups the process belongs to, + * a.k.a the number of sp_node in node_list + */ + unsigned int count; + int sp_stat_id; + /* list head of sp_node */ + struct list_head node_list; + struct mm_struct *mm; +}; + +/* + * each instance represents an sp group the process belongs to + * sp_group_master : sp_group_node = 1 : N + * sp_group_node->spg : sp_group = 1 : 1 + * sp_group_node : sp_group->procs = N : 1 + */ +struct sp_group_node { + /* list node in sp_group->procs */ + struct list_head proc_node; + /* list node in sp_group_maseter->node_list */ + struct list_head group_node; + struct sp_group_master *master; + struct sp_group *spg; +}; + struct sp_walk_data { struct page **pages; unsigned int page_count; @@ -150,9 +178,7 @@ struct sp_proc_stat {
static inline void sp_init_mm(struct mm_struct *mm) { - mm->sp_group = NULL; - INIT_LIST_HEAD(&mm->sp_node); - mm->sp_stat_id = 0; + mm->sp_group_master = NULL; }
extern int sp_group_add_task(int pid, int spg_id); diff --git a/mm/share_pool.c b/mm/share_pool.c index eb5eaa3e0d05a..1a01e90e6b102 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -39,11 +39,9 @@ #include <linux/kernel.h> #include <linux/falloc.h> #include <linux/types.h> -#include <linux/idr.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/rmap.h> -#include <linux/hugetlb.h> #include <linux/compaction.h> #include <linux/preempt.h> #include <linux/swapops.h> @@ -58,6 +56,12 @@ #define byte2mb(size) ((size) >> 20) #define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10))
+#define SINGLE_GROUP_MODE 1 +#define MULTI_GROUP_MODE 2 + +#define MAX_GROUP_FOR_SYSTEM 50000 +#define MAX_GROUP_FOR_TASK 3000 + #define PF_DOMAIN_CORE 0x10000000 /* AOS CORE processes in sched.h */
/* mdc scene hack */ @@ -74,9 +78,11 @@ int sysctl_sp_debug_mode;
int sysctl_share_pool_map_lock_enable;
+static int share_pool_group_mode = SINGLE_GROUP_MODE; + /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); -/* rw semaphore for sp_group_idr */ +/* rw semaphore for sp_group_idr and mm->sp_group_master */ static DECLARE_RWSEM(sp_group_sem);
static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain); @@ -129,7 +135,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, int ret;
down_write(&sp_stat_sem); - id = mm->sp_stat_id; + id = mm->sp_group_master->sp_stat_id; if (id) { /* other threads in the same process may have initialized it */ stat = sp_get_proc_stat_locked(tgid); @@ -166,7 +172,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return ERR_PTR(ret); }
- mm->sp_stat_id = ret; + mm->sp_group_master->sp_stat_id = ret; up_write(&sp_stat_sem); return stat; } @@ -350,7 +356,7 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long si static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, struct mm_struct *mm);
-static void free_sp_group_id(unsigned int spg_id) +static void free_sp_group_id(int spg_id) { /* ida operation is protected by an internal spin_lock */ if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) || @@ -382,14 +388,34 @@ static void sp_group_drop(struct sp_group *spg) free_sp_group(spg); }
+static struct sp_group *get_first_group(struct mm_struct *mm) +{ + struct sp_group *spg = NULL; + struct sp_group_master *master = mm->sp_group_master; + + if (master && master->count >= 1) { + struct sp_group_node *spg_node = NULL; + + spg_node = list_first_entry(&master->node_list, + struct sp_group_node, group_node); + spg = spg_node->spg; + + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; + } + + return spg; +} + /* user must call sp_group_drop() after use */ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) { - struct sp_group *spg; + struct sp_group *spg = NULL; + struct task_struct *tsk = NULL; int ret = 0;
if (spg_id == SPG_ID_DEFAULT) { - struct task_struct *tsk; rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk || (tsk->flags & PF_EXITING)) @@ -408,12 +434,9 @@ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) task_lock(tsk); if (tsk->mm == NULL) spg = NULL; - else { - spg = tsk->mm->sp_group; - /* don't revive a dead group */ - if (!spg || !atomic_inc_not_zero(&spg->use_count)) - spg = NULL; - } + else + spg = get_first_group(tsk->mm); + task_unlock(tsk);
put_task_struct(tsk); @@ -561,14 +584,14 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) static void __sp_area_drop_locked(struct sp_area *spa);
/* The caller must down_write(&mm->mmap_sem) */ -static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) +static void sp_munmap_task_areas(struct mm_struct *mm, struct sp_group *spg, struct list_head *stop) { struct sp_area *spa, *prev = NULL; int err;
- spin_lock(&sp_area_lock);
- list_for_each_entry(spa, &mm->sp_group->spa_list, link) { + spin_lock(&sp_area_lock); + list_for_each_entry(spa, &spg->spa_list, link) { if (&spa->link == stop) break;
@@ -592,6 +615,75 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) spin_unlock(&sp_area_lock); }
+static int mm_add_group_init(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group_node *spg_node; + + if (share_pool_group_mode == SINGLE_GROUP_MODE && master && + master->count == 1) { + pr_err("share pool: at most one sp group for a task is allowed in single mode\n"); + return -EEXIST; + } + + if (!master) { + master = kzalloc(sizeof(struct sp_group_master), GFP_KERNEL); + if (master == NULL) { + pr_err_ratelimited("share pool: no memory for spg master\n"); + return -ENOMEM; + } + } else { + list_for_each_entry(spg_node, &master->node_list, group_node) { + if (spg_node->spg == spg) { + pr_err("share pool: task is already in target group\n"); + return -EEXIST; + } + } + } + + if (!mm->sp_group_master) { + INIT_LIST_HEAD(&master->node_list); + master->count = 0; + master->mm = mm; + master->sp_stat_id = 0; + mm->sp_group_master = master; + } else { + if (master->count + 1 == MAX_GROUP_FOR_TASK) { + pr_err("share pool: task reaches max group num\n"); + return -ENOSPC; + } + } + + return 0; +} + +static int mm_add_group_finish(struct mm_struct *mm, struct sp_group *spg) +{ + struct sp_group_master *master; + struct sp_group_node *spg_node; + + spg_node = kzalloc(sizeof(struct sp_group_node), GFP_KERNEL); + if (spg_node == NULL) { + pr_err_ratelimited("share pool: no memory for spg node\n"); + return -ENOMEM; + } + + master = mm->sp_group_master; + INIT_LIST_HEAD(&spg_node->group_node); + INIT_LIST_HEAD(&spg_node->proc_node); + spg_node->spg = spg; + spg_node->master = master; + + down_write(&spg->rw_lock); + list_add_tail(&spg_node->proc_node, &spg->procs); + up_write(&spg->rw_lock); + + list_add_tail(&spg_node->group_node, &master->node_list); + master->count++; + + return 0; +} + /** * sp_group_add_task() - Add a process to an share group (sp_group). * @pid: the pid of the task to be added. @@ -711,11 +803,6 @@ int sp_group_add_task(int pid, int spg_id) ret = -ESRCH; free_new_spg_id(id_newly_generated, spg_id); goto out_put_task; - } else if (mm->sp_group) { - up_write(&sp_group_sem); - ret = -EEXIST; - free_new_spg_id(id_newly_generated, spg_id); - goto out_put_mm; }
spg = find_or_alloc_sp_group(spg_id); @@ -726,18 +813,18 @@ int sp_group_add_task(int pid, int spg_id) goto out_put_mm; }
+ ret = mm_add_group_init(mm, spg); + if (ret) + goto out_drop_group; + /* access control permission check */ if (sysctl_ac_mode == AC_SINGLE_OWNER) { if (spg->owner != current->group_leader) { ret = -EPERM; - up_write(&sp_group_sem); goto out_drop_group; } }
- mm->sp_group = spg; - up_write(&sp_group_sem); - /* per process statistics initialization */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { @@ -746,7 +833,7 @@ int sp_group_add_task(int pid, int spg_id) goto out_drop_group; }
- down_read(&spg->rw_lock); + down_write(&spg->rw_lock); /* * create mappings of existing shared memory segments into this * new process' page table. @@ -779,7 +866,7 @@ int sp_group_add_task(int pid, int spg_id)
down_write(&mm->mmap_sem); if (unlikely(mm->core_state)) { - sp_munmap_task_areas(mm, &spa->link); + sp_munmap_task_areas(mm, spg, &spa->link); up_write(&mm->mmap_sem); ret = -EBUSY; pr_err("share pool: task add group: encountered coredump, abort\n"); @@ -789,7 +876,7 @@ int sp_group_add_task(int pid, int spg_id)
addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(addr)) { - sp_munmap_task_areas(mm, &spa->link); + sp_munmap_task_areas(mm, spg, &spa->link); up_write(&mm->mmap_sem); ret = addr; pr_err("share pool: task add group sp mmap failed, ret %d\n", ret); @@ -808,7 +895,7 @@ int sp_group_add_task(int pid, int spg_id) "(potential no enough memory when -12): %d, spa type is %d\n", ret, spa->type); down_write(&mm->mmap_sem); - sp_munmap_task_areas(mm, spa->link.next); + sp_munmap_task_areas(mm, spg, spa->link.next); up_write(&mm->mmap_sem); spin_lock(&sp_area_lock); break; @@ -819,21 +906,22 @@ int sp_group_add_task(int pid, int spg_id) } __sp_area_drop_locked(prev); spin_unlock(&sp_area_lock); - up_read(&spg->rw_lock); + up_write(&spg->rw_lock);
if (unlikely(ret)) sp_proc_stat_drop(stat);
out_drop_group: if (unlikely(ret)) { - down_write(&sp_group_sem); - mm->sp_group = NULL; + if (mm->sp_group_master->count == 0) { + kfree(mm->sp_group_master); + mm->sp_group_master = NULL; + } up_write(&sp_group_sem); sp_group_drop(spg); } else { - down_write(&spg->rw_lock); - list_add_tail(&mm->sp_node, &spg->procs); - up_write(&spg->rw_lock); + mm_add_group_finish(mm, spg); + up_write(&sp_group_sem); } out_put_mm: /* No need to put the mm if the sp group adds this mm successfully */ @@ -1250,9 +1338,10 @@ static void __sp_free(struct sp_group *spg, unsigned long addr, unsigned long size, struct mm_struct *stop) { struct mm_struct *mm; - struct mm_struct *tmp; + struct sp_group_node *spg_node = NULL;
- list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + list_for_each_entry(spg_node, &spg->procs, proc_node) { + mm = spg_node->master->mm; if (mm == stop) break; sp_munmap(mm, addr, size); @@ -1285,11 +1374,32 @@ int sp_free(unsigned long addr) spa = __find_sp_area(addr); if (spa) { if (current->mm != NULL) { - if (current->mm->sp_group != spa->spg) { + struct sp_group_node *spg_node; + bool found = false; + + down_read(&spa->spg->rw_lock); + list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { + if (spg_node->master->mm == current->mm) { + found = true; + break; + } + } + up_read(&spa->spg->rw_lock); + if (!found) { ret = -EPERM; goto drop_spa; } } + + down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + up_write(&spa->spg->rw_lock); + goto drop_spa; + } + /* the life cycle of spa has a direct relation with sp group */ + spa->is_dead = true; + up_write(&spa->spg->rw_lock); + } else { /* spa == NULL */ ret = -EINVAL; pr_debug("share pool: sp free invalid input addr %lx\n", (unsigned long)addr); @@ -1302,14 +1412,7 @@ int sp_free(unsigned long addr) goto drop_spa; }
- down_write(&spa->spg->rw_lock); - if (!spg_valid(spa->spg)) { - up_write(&spa->spg->rw_lock); - goto drop_spa; - } - /* the life cycle of spa has a direct relation with sp group */ - spa->is_dead = true; - up_write(&spa->spg->rw_lock); + sp_dump_stack();
down_read(&spa->spg->rw_lock);
@@ -1328,7 +1431,7 @@ int sp_free(unsigned long addr) if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { - stat = sp_get_proc_stat(current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_group_master->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->alloc_size); else @@ -1399,9 +1502,9 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) struct file *file; unsigned long size_aligned; int ret = 0; - struct mm_struct *tmp; unsigned long mode, offset; unsigned int noreclaim_flag; + struct sp_group_node *spg_node;
check_interrupt_context();
@@ -1443,12 +1546,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) pr_err_ratelimited("share pool: allocation failed, add group error %d in DVPP pass through\n", ret); return ERR_PTR(ret); } - spg = current->mm->sp_group; - /* - * increase use_count deliberately, due to __sp_find_spg is - * matched with sp_group_drop - */ - atomic_inc(&spg->use_count); + spg = get_first_group(current->mm); } else { /* other scenes */ if (spg_id != SPG_ID_DEFAULT) { spg_tmp = __sp_find_spg(current->pid, spg_id); @@ -1488,9 +1586,10 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) sp_addr = spa->va_start;
/* create mapping for each process in the group */ - list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + list_for_each_entry(spg_node, &spg->procs, proc_node) { unsigned long populate = 0; struct vm_area_struct *vma; + mm = spg_node->master->mm;
down_write(&mm->mmap_sem); if (unlikely(mm->core_state)) { @@ -1552,7 +1651,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) sp_add_work_compact(); } if (ret) { - __sp_free(spg, sp_addr, size_aligned, list_next_entry(mm, sp_node)); + __sp_free(spg, sp_addr, size_aligned, + (list_next_entry(spg_node, proc_node))->master->mm); if (unlikely(fatal_signal_pending(current))) pr_warn_ratelimited("share pool: allocation failed, current thread is killed\n"); else @@ -1587,7 +1687,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) up_read(&spg->rw_lock);
if (!IS_ERR(p)) { - stat = sp_get_proc_stat(current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_group_master->sp_stat_id); if (stat) atomic64_add(size_aligned, &stat->alloc_size); else @@ -1760,12 +1860,13 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, struct sp_group *spg) { struct mm_struct *mm; - struct mm_struct *tmp; unsigned long ret_addr = -ENODEV; unsigned long uva = -ENODEV; void *p = ERR_PTR(-ENODEV); + struct sp_group_node *spg_node;
- list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + list_for_each_entry(spg_node, &spg->procs, proc_node) { + mm = spg_node->master->mm; ret_addr = sp_remap_kva_to_vma(kva, spa, mm); if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); @@ -1906,7 +2007,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { - up_read(&spg->rw_lock); pr_debug("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); goto out_drop_spa; } @@ -2293,12 +2393,14 @@ EXPORT_SYMBOL_GPL(sp_make_share_u2k); static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) { int ret = 0; + bool found = false; struct mm_struct *mm; struct sp_area *spa; unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; struct sp_proc_stat *stat; + struct sp_group_node *spg_node;
/* * at first we guess it's a hugepage addr @@ -2392,19 +2494,25 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
- if (unlikely(!spa->spg)) { - WARN(1, "share pool: unshare uva NULL spg pointer\n"); - ret = -EINVAL; - goto out_drop_area; + down_read(&spa->spg->rw_lock); + /* always allow kthread and dvpp channel destroy procedure */ + if (current->mm) { + list_for_each_entry(spg_node, &spa->spg->procs, proc_node) { + if (spg_node->master->mm == current->mm) { + found = true; + break; + } + } }
- /* alway allow kthread and dvpp channel destroy procedure */ - if (current->mm && current->mm->sp_group != spa->spg) { + if (!found) { + up_read(&spa->spg->rw_lock); pr_err_ratelimited("share pool: unshare uva(to group) failed, " "caller process doesn't belong to target group\n"); ret = -EINVAL; goto out_drop_area; } + up_read(&spa->spg->rw_lock);
down_write(&spa->spg->rw_lock); if (!spg_valid(spa->spg)) { @@ -2428,7 +2536,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { - stat = sp_get_proc_stat(current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_group_master->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); else @@ -2711,12 +2819,19 @@ static int __init enable_share_k2u_to_group(char *s) } __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
+static int __init enable_sp_multi_group_mode(char *s) +{ + share_pool_group_mode = MULTI_GROUP_MODE; + return 1; +} +__setup("enable_sp_multi_group_mode", enable_sp_multi_group_mode); + /*** Statistical and maintenance functions ***/
static void free_sp_proc_stat(struct sp_proc_stat *stat) { down_write(&sp_stat_sem); - stat->mm->sp_stat_id = 0; + stat->mm->sp_group_master->sp_stat_id = 0; idr_remove(&sp_stat_idr, stat->tgid); up_write(&sp_stat_sem); kfree(stat); @@ -2747,7 +2862,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, up_read(&spg->rw_lock);
/* eliminate potential ABBA deadlock */ - stat = sp_get_proc_stat_ref(task->mm->sp_stat_id); + stat = sp_get_proc_stat_ref(task->mm->sp_group_master->sp_stat_id); if (unlikely(!stat)) { sp_group_drop(spg); return 0; @@ -3166,65 +3281,93 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, } EXPORT_SYMBOL(sharepool_no_page);
-#define MM_WOULD_FREE 2 +#define MM_WOULD_FREE 1 + +/* + * Recall we add mm->users by 1 deliberately in sp_group_add_task(). + * If the mm_users == sp_group_master->count + 1, it means that the mm is ready + * to be freed because the last owner of this mm is in exiting procedure: + * do_exit() -> exit_mm() -> mmput() -> sp_group_exit -> THIS function. + */ +static bool need_free_sp_group(struct mm_struct *mm, + struct sp_group_master *master) +{ + /* thread exits but process is still alive */ + if ((unsigned int)atomic_read(&mm->mm_users) != master->count + MM_WOULD_FREE) { + if (atomic_dec_and_test(&mm->mm_users)) + WARN(1, "Invalid user counting\n"); + return false; + } + + return true; +}
+/* + * Return: + * 1 - let mmput() return immediately + * 0 - let mmput() decrease mm_users and try __mmput() + */ int sp_group_exit(struct mm_struct *mm) { - struct sp_group *spg = mm->sp_group; + struct sp_group *spg; + struct sp_group_master *master; + struct sp_group_node *spg_node, *tmp; bool is_alive = true;
- if (!spg || !enable_ascend_share_pool) + if (!enable_ascend_share_pool) return 0;
- /* - * The judgment of mm->mm_users == MM_WOULD_FREE and atomic_dec_and_test - * must be atomic. Otherwise, mm->mm_users == MM_WOULD_FREE may never be - * true due to the gap in the middle. - */ - down_write(&spg->rw_lock); - /* - * Recall we add mm->users by 1 deliberately in sp_group_add_task(). - * If the mm_users is 2, it means that the mm is ready to be freed - * because the last owner of this mm is in exiting procedure: - * do_exit() -> exit_mm() -> mmput() -> THIS function. - */ - if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + down_write(&sp_group_sem); + + master = mm->sp_group_master; + if (!master) { + up_write(&sp_group_sem); + return 0; + } + + if (!need_free_sp_group(mm, master)) { + up_write(&sp_group_sem); + return 1; + } + + list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { + spg = spg_node->spg; + down_write(&spg->rw_lock); /* a dead group should NOT be reactive again */ if (spg_valid(spg) && list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; - if (mm->sp_group) /* concurrency handle of sp_group_add_task */ - list_del(&mm->sp_node); /* affect spg->procs */ - /* match with get_task_mm() in sp_group_add_task() */ - atomic_dec(&mm->mm_users); + list_del(&spg_node->proc_node); up_write(&spg->rw_lock);
if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, - mm->sp_group); - - return 0; + spg); }
- if (atomic_dec_and_test(&mm->mm_users)) { - up_write(&spg->rw_lock); + /* match with get_task_mm() in sp_group_add_task() */ + if (atomic_sub_and_test(master->count, &mm->mm_users)) { + up_write(&sp_group_sem); WARN(1, "Invalid user counting\n"); - return 0; + return 1; }
- up_write(&spg->rw_lock); - return 1; + up_write(&sp_group_sem); + return 0; }
void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; - struct sp_group *spg = mm->sp_group; long alloc_size, k2u_size; + /* lockless visit */ + struct sp_group_master *master = mm->sp_group_master; + struct sp_group_node *spg_node, *tmp; + struct sp_group *spg;
- if (!enable_ascend_share_pool || !mm->sp_stat_id) + if (!enable_ascend_share_pool || !master) return;
- stat = sp_get_proc_stat(mm->sp_stat_id); + stat = sp_get_proc_stat(master->sp_stat_id); if (stat) { alloc_size = atomic64_read(&stat->alloc_size); k2u_size = atomic64_read(&stat->k2u_size); @@ -3246,19 +3389,27 @@ void sp_group_post_exit(struct mm_struct *mm) * A process not in an sp group doesn't need to print because there * wont't be any memory which is not freed. */ - if (spg) { + if (master) { if (alloc_size != 0 || k2u_size != 0) - pr_info("share pool: process %s(%d) of sp group %d exits. " + pr_info("share pool: process %s(%d) exits. " "It applied %ld aligned KB, k2u shared %ld aligned KB\n", - stat->comm, mm->sp_stat_id, mm->sp_group->id, + stat->comm, master->sp_stat_id, byte2kb(alloc_size), byte2kb(k2u_size));
- /* match with sp_group_add_task -> find_or_alloc_sp_group */ - sp_group_drop(spg); }
/* match with sp_init_proc_stat, we expect stat is released after this call */ sp_proc_stat_drop(stat); + + /* lockless traverse */ + list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { + spg = spg_node->spg; + /* match with refcount inc in sp_group_add_task */ + sp_group_drop(spg); + kfree(spg_node); + } + + kfree(master); }
struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Zhou Guanghui <zhouguanghui1@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
Fix the kABI breakage in struct mm_struct introduced by b82e34a7de33 ("ascend: mm_struct: introduce new parameter...").
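As a rough illustration of why this preserves kABI, here is a minimal userspace sketch (illustrative struct and field names, not the real struct mm_struct or the KABI_RESERVE()/__GENKSYMS__ machinery): placing the new pointer in a union with the reserved slot keeps the size and member offsets identical to the old layout.

/* Illustrative sketch only; struct and field names are made up here. */
#include <assert.h>
#include <stddef.h>

struct sp_group_master;

struct mm_layout_old {		/* before: just reserved slots */
	unsigned long kabi_reserve1;
	unsigned long kabi_reserve2;
};

struct mm_layout_new {		/* after: the pointer shares a slot */
	unsigned long kabi_reserve1;
	union {
		struct sp_group_master *sp_group_master;
		unsigned long kabi_reserve2;
	};
};

int main(void)
{
	static_assert(sizeof(struct mm_layout_old) ==
		      sizeof(struct mm_layout_new), "size unchanged");
	static_assert(offsetof(struct mm_layout_old, kabi_reserve2) ==
		      offsetof(struct mm_layout_new, sp_group_master),
		      "offset unchanged");
	return 0;
}

In the patch itself the union is additionally hidden from genksyms via the !defined(__GENKSYMS__) branch, so symbol CRCs are still computed from the reserved-slot layout.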
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mm_types.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4e45bfd088bff..ae4237a59d21c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -520,11 +520,15 @@ struct mm_struct { KABI_RESERVE(1) #endif
-#ifdef CONFIG_ASCEND_SHARE_POOL - struct sp_group_master *sp_group_master; +#if IS_ENABLED(CONFIG_ASCEND_SHARE_POOL) && !defined(__GENKSYMS__) + union { + struct sp_group_master *sp_group_master; + unsigned long kabi_reserve1; + }; +#else + KABI_RESERVE(2) #endif
- KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5)
From: Zhou Guanghui <zhouguanghui1@huawei.com>

ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/configs/hulk_defconfig      |  4 ++++
 arch/arm64/configs/syzkaller_defconfig | 11 +++++++++++
 2 files changed, 15 insertions(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 01e02aa2cb092..605d17461383b 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -485,6 +485,7 @@ CONFIG_ASCEND_IOPF_HIPRI=y CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE=y CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y +CONFIG_ASCEND_SHARE_POOL=y
# # Boot options @@ -753,6 +754,7 @@ CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_HUGE_VMALLOC=y CONFIG_HAVE_MOD_ARCH_SPECIFIC=y CONFIG_MODULES_USE_ELF_RELA=y CONFIG_ARCH_HAS_ELF_RANDOMIZE=y @@ -960,6 +962,7 @@ CONFIG_BALLOON_COMPACTION=y CONFIG_COMPACTION=y CONFIG_MIGRATION=y CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_MM_OWNER=y CONFIG_MMU_NOTIFIER=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 @@ -989,6 +992,7 @@ CONFIG_ZSMALLOC_STAT=y CONFIG_GENERIC_EARLY_IOREMAP=y # CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y # CONFIG_PERCPU_STATS is not set # CONFIG_GUP_BENCHMARK is not set CONFIG_ARCH_HAS_PTE_SPECIAL=y diff --git a/arch/arm64/configs/syzkaller_defconfig b/arch/arm64/configs/syzkaller_defconfig index 390ef78934790..eb045bdc5ef16 100644 --- a/arch/arm64/configs/syzkaller_defconfig +++ b/arch/arm64/configs/syzkaller_defconfig @@ -469,6 +469,14 @@ CONFIG_ARM64_PSEUDO_NMI=y CONFIG_RELOCATABLE=y CONFIG_RANDOMIZE_BASE=y CONFIG_RANDOMIZE_MODULE_REGION_FULL=y +CONFIG_ASCEND_FEATURES=y +CONFIG_ASCEND_DVPP_MMAP=y +CONFIG_ASCEND_OOM=y +CONFIG_ASCEND_IOPF_HIPRI=y +CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y +# CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE is not set +CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y +CONFIG_ASCEND_SHARE_POOL=y
# # Boot options @@ -730,6 +738,7 @@ CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_HUGE_VMALLOC=y CONFIG_HAVE_MOD_ARCH_SPECIFIC=y CONFIG_MODULES_USE_ELF_RELA=y CONFIG_ARCH_HAS_ELF_RANDOMIZE=y @@ -913,6 +922,7 @@ CONFIG_BALLOON_COMPACTION=y CONFIG_COMPACTION=y CONFIG_MIGRATION=y CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_MM_OWNER=y CONFIG_MMU_NOTIFIER=y CONFIG_KSM=y CONFIG_DEFAULT_MMAP_MIN_ADDR=4096 @@ -940,6 +950,7 @@ CONFIG_ZSMALLOC_STAT=y CONFIG_GENERIC_EARLY_IOREMAP=y # CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set CONFIG_IDLE_PAGE_TRACKING=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y # CONFIG_PERCPU_STATS is not set # CONFIG_GUP_BENCHMARK is not set CONFIG_ARCH_HAS_PTE_SPECIAL=y
From: Tang Yizhou <tangyizhou@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
The maximum number of sp_groups in the system is MAX_GROUP_FOR_SYSTEM.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 11 +++++++++++
 1 file changed, 11 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1a01e90e6b102..48f8af0e254cf 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -80,6 +80,8 @@ int sysctl_share_pool_map_lock_enable;
static int share_pool_group_mode = SINGLE_GROUP_MODE;
+static int system_group_count; + /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); /* rw semaphore for sp_group_idr and mm->sp_group_master */ @@ -380,6 +382,8 @@ static void free_sp_group(struct sp_group *spg) up_write(&sp_group_sem); free_sp_group_id((unsigned int)spg->id); kfree(spg); + system_group_count--; + WARN(system_group_count < 0, "unexpected group count\n"); }
static void sp_group_drop(struct sp_group *spg) @@ -514,6 +518,11 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) struct user_struct *user = NULL; int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT;
+ if (unlikely(system_group_count + 1 == MAX_GROUP_FOR_SYSTEM)) { + pr_err_ratelimited("share pool: reach system max group num\n"); + return ERR_PTR(-ENOSPC); + } + spg = kzalloc(sizeof(*spg), GFP_KERNEL); if (spg == NULL) { pr_err_ratelimited("share pool: alloc spg failed due to lack of memory\n"); @@ -559,6 +568,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) ret = PTR_ERR(spg->file_hugetlb); goto out_fput; } + + system_group_count++; } else { down_read(&spg->rw_lock); if (!spg_valid(spg)) {
From: Tang Yizhou <tangyizhou@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
The maximum number of processes in a group is MAX_PROC_PER_GROUP.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/share_pool.h |  2 ++
 mm/share_pool.c            | 10 ++++++++++
 2 files changed, 12 insertions(+)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 597b96129bf86..8ab4cfb2b2509 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -80,6 +80,8 @@ struct sp_group { int hugepage_failures; struct file *file; struct file *file_hugetlb; + /* number of process in this group */ + int proc_num; /* list head of processes (sp_group_node, each represents a process) */ struct list_head procs; /* list head of sp_area. it is protected by spin_lock sp_area_lock */ diff --git a/mm/share_pool.c b/mm/share_pool.c index 48f8af0e254cf..436084741ed0c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -61,6 +61,7 @@
#define MAX_GROUP_FOR_SYSTEM 50000 #define MAX_GROUP_FOR_TASK 3000 +#define MAX_PROC_PER_GROUP 1024
#define PF_DOMAIN_CORE 0x10000000 /* AOS CORE processes in sched.h */
@@ -536,6 +537,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) }
spg->id = spg_id; + spg->proc_num = 0; atomic_set(&spg->spa_num, 0); atomic64_set(&spg->size, 0); atomic64_set(&spg->alloc_nsize, 0); @@ -686,6 +688,13 @@ static int mm_add_group_finish(struct mm_struct *mm, struct sp_group *spg) spg_node->master = master;
down_write(&spg->rw_lock); + if (spg->proc_num + 1 == MAX_PROC_PER_GROUP) { + up_write(&spg->rw_lock); + pr_err_ratelimited("add group: group reaches max process num\n"); + kfree(spg_node); + return -ENOSPC; + } + spg->proc_num++; list_add_tail(&spg_node->proc_node, &spg->procs); up_write(&spg->rw_lock);
@@ -3347,6 +3356,7 @@ int sp_group_exit(struct mm_struct *mm) /* a dead group should NOT be reactive again */ if (spg_valid(spg) && list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; + spg->proc_num--; list_del(&spg_node->proc_node); up_write(&spg->rw_lock);
From: Tang Yizhou <tangyizhou@huawei.com>

ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
We are going to redesign the accounting subsystem of the share pool. First, disambiguate the name sp_spg_stat: the statistics it holds are system-level, not spg-level, so rename it to sp_overall_stat.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/share_pool.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 436084741ed0c..cd6e137fe6698 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -218,12 +218,12 @@ struct sp_spa_stat { static struct sp_spa_stat spa_stat;
/* statistics of all sp group born from sp_alloc and k2u(spg) */ -struct sp_spg_stat { +struct sp_overall_stat { atomic_t spa_total_num; atomic64_t spa_total_size; };
-static struct sp_spg_stat spg_stat; +static struct sp_overall_stat sp_overall_stat;
/*** Global share pool VA allocator ***/
@@ -1134,8 +1134,8 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, atomic64_add(size, &spg->alloc_nsize); atomic64_add(size, &spg->alloc_size); } - atomic_inc(&spg_stat.spa_total_num); - atomic64_add(size, &spg_stat.spa_total_size); + atomic_inc(&sp_overall_stat.spa_total_num); + atomic64_add(size, &sp_overall_stat.spa_total_size); list_add_tail(&spa->link, &spg->spa_list); } spin_unlock(&sp_area_lock); @@ -1219,8 +1219,8 @@ static void sp_free_area(struct sp_area *spa) atomic64_sub(spa->real_size, &spa->spg->alloc_nsize); atomic64_sub(spa->real_size, &spa->spg->alloc_size); } - atomic_dec(&spg_stat.spa_total_num); - atomic64_sub(spa->real_size, &spg_stat.spa_total_size); + atomic_dec(&sp_overall_stat.spa_total_num); + atomic64_sub(spa->real_size, &sp_overall_stat.spa_total_size); list_del(&spa->link); } rb_erase(&spa->rb_node, &sp_area_root); @@ -3045,12 +3045,12 @@ void spg_overview_show(struct seq_file *seq)
if (seq != NULL) { seq_printf(seq, "Share pool total size: %ld KB, spa total num: %d.\n", - byte2kb(atomic64_read(&spg_stat.spa_total_size)), - atomic_read(&spg_stat.spa_total_num)); + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); } else { pr_info("Share pool total size: %ld KB, spa total num: %d.\n", - byte2kb(atomic64_read(&spg_stat.spa_total_size)), - atomic_read(&spg_stat.spa_total_num)); + byte2kb(atomic64_read(&sp_overall_stat.spa_total_size)), + atomic_read(&sp_overall_stat.spa_total_num)); }
down_read(&sp_group_sem);