From: Ding Tianhong <dingtianhong@huawei.com>
ascend inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI
CVE: NA
-------------------------------------------------
The sp_mutex is used to protect all the critical paths of the share pool. It seriously hurts the performance of the memory allocation and release interfaces when there are many processes in the same memory group, and it seriously limits the scalability of the system. Add a new per-group read-write semaphore and take its read side on the allocation and release critical paths instead of the big lock.

The scalability has been greatly improved by this modification.
Show the test result (alloc 4M, average time):

number of processes:    1       3       10
before the patch:       32us    96us    330us
after the patch:        32us    40us    60us
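For illustration only (not part of the patch): a minimal userspace sketch of the same locking split, using a pthreads rwlock. The names fake_spg, alloc_path() and add_task_path() are made up for this example. Allocation/release paths only need a stable view of the group, so they take the lock shared and can run concurrently, while membership changes take it exclusive.

  /* build: cc -pthread sketch.c -o sketch */
  #include <pthread.h>
  #include <stdio.h>

  /* Illustrative stand-in for a share pool group: the member list is
   * guarded by a per-group read-write lock instead of one global mutex.
   */
  struct fake_spg {
          pthread_rwlock_t rw_lock;
          int nr_procs;
  };

  static struct fake_spg grp = {
          .rw_lock = PTHREAD_RWLOCK_INITIALIZER,
          .nr_procs = 0,
  };

  /* alloc/free style path: many of these may run in parallel,
   * they only read the group state.
   */
  static void *alloc_path(void *arg)
  {
          (void)arg;
          pthread_rwlock_rdlock(&grp.rw_lock);
          /* ... map or unmap the area for every member ... */
          pthread_rwlock_unlock(&grp.rw_lock);
          return NULL;
  }

  /* membership path: exclusive, analogous to adding or removing a task */
  static void *add_task_path(void *arg)
  {
          (void)arg;
          pthread_rwlock_wrlock(&grp.rw_lock);
          grp.nr_procs++;         /* mutate the member list */
          pthread_rwlock_unlock(&grp.rw_lock);
          return NULL;
  }

  int main(void)
  {
          pthread_t readers[4], writer;
          int i;

          pthread_create(&writer, NULL, add_task_path, NULL);
          for (i = 0; i < 4; i++)
                  pthread_create(&readers[i], NULL, alloc_path, NULL);

          pthread_join(writer, NULL);
          for (i = 0; i < 4; i++)
                  pthread_join(readers[i], NULL);

          printf("members: %d\n", grp.nr_procs);
          return 0;
  }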
v2: fix some conflicts and clean up some code.
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Tang Yizhou <tangyizhou@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/share_pool.h |   5 ++
 kernel/fork.c              |   4 +-
 mm/share_pool.c            | 170 ++++++++++++++++++++-----------------
 3 files changed, 100 insertions(+), 79 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 70b841d0eb8e5..f2d17cb85fa52 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -93,6 +93,8 @@ struct sp_group {
 	unsigned long dvpp_va_start;
 	unsigned long dvpp_size;
 	atomic_t use_count;
+	/* protect the group internal elements */
+	struct rw_semaphore rw_lock;
 };
 
 struct sp_walk_data {
@@ -238,6 +240,8 @@ extern void *vmalloc_hugepage_user(unsigned long size);
 extern void *buff_vzalloc_user(unsigned long size);
 extern void *buff_vzalloc_hugepage_user(unsigned long size);
 
+void sp_exit_mm(struct mm_struct *mm);
+
 #else
 
 static inline int sp_group_add_task(int pid, int spg_id)
@@ -400,6 +404,7 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size)
 {
 	return NULL;
 }
+
 #endif
 
 #endif /* LINUX_SHARE_POOL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index c410887b502b2..22ed43ed527de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1059,8 +1059,6 @@ static inline void __mmput(struct mm_struct *mm)
 {
 	VM_BUG_ON(atomic_read(&mm->mm_users));
 
-	sp_group_exit(mm);
-
 	uprobe_clear_state(mm);
 	exit_aio(mm);
 	ksm_exit(mm);
@@ -1088,6 +1086,8 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
+	sp_group_exit(mm);
+
 	if (atomic_dec_and_test(&mm->mm_users))
 		__mmput(mm);
 }
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 5149864c94c09..d9f70526bae17 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -197,6 +197,16 @@ static bool host_svm_sp_enable = false;
 
 int sysctl_share_pool_hugepage_enable = 1;
 
+static void free_sp_group(struct sp_group *spg);
+
+static bool sp_group_get(struct sp_group *spg)
+{
+	if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count))
+		return true;
+
+	return false;
+}
+
 static unsigned long spa_size(struct sp_area *spa)
 {
 	return spa->real_size;
@@ -337,7 +347,9 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id)
 
 		put_task_struct(tsk);
 	} else {
+		mutex_lock(&sp_mutex);
 		spg = idr_find(&sp_group_idr, spg_id);
+		mutex_unlock(&sp_mutex);
 	}
 
 	return spg;
@@ -392,6 +404,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
 		INIT_LIST_HEAD(&spg->procs);
 		INIT_LIST_HEAD(&spg->spa_list);
 
+		init_rwsem(&spg->rw_lock);
+
 		ret = idr_alloc(&sp_group_idr, spg, spg_id,
 				spg_id+1, GFP_KERNEL);
 		if (ret < 0) {
@@ -422,9 +436,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
 			goto out_fput;
 		}
 	} else {
-		if (!spg_valid(spg))
+		if (!sp_group_get(spg))
 			return ERR_PTR(-ENODEV);
-		atomic_inc(&spg->use_count);
 	}
 
 	return spg;
@@ -607,6 +620,8 @@ int sp_group_add_task(int pid, int spg_id)
 	}
 
 	mm->sp_group = spg;
+
+	down_write(&spg->rw_lock);
 	/* We reactive the spg even the spg exists already. */
 	spg->is_alive = true;
 	list_add_tail(&mm->sp_node, &spg->procs);
@@ -675,11 +690,14 @@ int sp_group_add_task(int pid, int spg_id)
 		mm->sp_group = NULL;
 	}
 
+	up_write(&spg->rw_lock);
 out_drop_group:
 	if (unlikely(ret))
 		__sp_group_drop_locked(spg);
 out_put_mm:
-	mmput(mm);
+	/* No need to put the mm if the sp group add this mm success.*/
+	if (unlikely(ret))
+		mmput(mm);
 out_put_task:
 	put_task_struct(tsk);
 out_unlock:
@@ -712,44 +730,12 @@ static void spg_exit_unlock(bool unlock)
 		mutex_unlock(&sp_mutex);
 }
 
-/*
- * Do cleanup when a process exits.
- */
-void sp_group_exit(struct mm_struct *mm)
-{
-	bool is_alive = true;
-	bool unlock;
-
-	/*
-	 * Nothing to do if this thread group doesn't belong to any sp_group.
-	 * No need to protect this check with lock because we can add a task
-	 * to a group if !PF_EXITING.
-	 */
-	if (!mm->sp_group)
-		return;
-
-	spg_exit_lock(&unlock);
-	if (list_is_singular(&mm->sp_group->procs))
-		is_alive = mm->sp_group->is_alive = false;
-	list_del(&mm->sp_node);
-	spg_exit_unlock(unlock);
-
-	/*
-	 * To avoid calling this with sp_mutex held, we first mark the
-	 * sp_group as dead and then send the notification and then do
-	 * the real cleanup in sp_group_post_exit().
-	 */
-	if (!is_alive)
-		blocking_notifier_call_chain(&sp_notifier_chain, 0,
-					     mm->sp_group);
-}
-
 void sp_group_post_exit(struct mm_struct *mm)
 {
 	struct sp_proc_stat *stat;
 	bool unlock;
 
-	if (!mm->sp_group)
+	if (!enable_ascend_share_pool || !mm->sp_group)
 		return;
 
 	spg_exit_lock(&unlock);
@@ -1139,8 +1125,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr,
 {
 	int err;
 
-	if (!mmget_not_zero(mm))
-		return;
 	down_write(&mm->mmap_sem);
 
 	err = do_munmap(mm, addr, size, NULL);
@@ -1150,7 +1134,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr,
 	}
 
 	up_write(&mm->mmap_sem);
-	mmput(mm);
 }
 
 /* The caller must hold sp_mutex. */
@@ -1183,8 +1166,6 @@ int sp_free(unsigned long addr)
 
 	check_interrupt_context();
 
-	mutex_lock(&sp_mutex);
-
 	/*
 	 * Access control: a share pool addr can only be freed by another task
 	 * in the same spg or a kthread (such as buff_module_guard_work)
@@ -1217,6 +1198,8 @@ int sp_free(unsigned long addr)
 
 	sp_dump_stack();
 
+	down_read(&spa->spg->rw_lock);
+
 	__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
 
 	/* Free the memory of the backing shmem or hugetlbfs */
@@ -1226,6 +1209,9 @@ int sp_free(unsigned long addr)
 	if (ret)
 		pr_err("share pool: sp free fallocate failed: %d\n", ret);
 
+	up_read(&spa->spg->rw_lock);
+
+	mutex_lock(&sp_mutex);
 	/* pointer stat may be invalid because of kthread buff_module_guard_work */
 	if (current->mm == NULL) {
 		kthread_stat.alloc_size -= spa->real_size;
@@ -1236,12 +1222,11 @@ int sp_free(unsigned long addr)
 		else
 			BUG();
 	}
+	mutex_unlock(&sp_mutex);
 
 drop_spa:
 	__sp_area_drop(spa);
 out:
-	mutex_unlock(&sp_mutex);
-
 	sp_try_to_compact();
 	return ret;
 }
@@ -1317,9 +1302,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 	if (sp_flags & SP_HUGEPAGE_ONLY)
 		sp_flags |= SP_HUGEPAGE;
 
-	mutex_lock(&sp_mutex);
 	spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT);
-	mutex_unlock(&sp_mutex);
 	if (!spg) { /* DVPP pass through scene: first call sp_alloc() */
 		/* mdc scene hack */
 		if (enable_mdc_default_group)
@@ -1336,14 +1319,16 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 				ret);
 			return ERR_PTR(ret);
 		}
-		mutex_lock(&sp_mutex);
 		spg = current->mm->sp_group;
 	} else { /* other scenes */
-		mutex_lock(&sp_mutex);
 		if (spg_id != SPG_ID_DEFAULT) {
+			mutex_lock(&sp_mutex);
 			/* the caller should be a member of the sp group */
-			if (spg != idr_find(&sp_group_idr, spg_id))
+			if (spg != idr_find(&sp_group_idr, spg_id)) {
+				mutex_unlock(&sp_mutex);
 				goto out;
+			}
+			mutex_unlock(&sp_mutex);
 		}
 	}
 
@@ -1352,6 +1337,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		goto out;
 	}
 
+	down_read(&spg->rw_lock);
 	if (sp_flags & SP_HUGEPAGE) {
 		file = spg->file_hugetlb;
 		size_aligned = ALIGN(size, PMD_SIZE);
@@ -1376,31 +1362,25 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 		unsigned long populate = 0;
 		struct vm_area_struct *vma;
 
-		if (!mmget_not_zero(mm))
-			continue;
-
 		down_write(&mm->mmap_sem);
 		mmap_addr = sp_mmap(mm, file, spa, &populate);
 		if (IS_ERR_VALUE(mmap_addr)) {
 			up_write(&mm->mmap_sem);
 			p = (void *)mmap_addr;
 			__sp_free(spg, sp_addr, size_aligned, mm);
-			mmput(mm);
 			pr_err("share pool: allocation sp mmap failed, ret %ld\n", mmap_addr);
 			goto out;
 		}
 
-		p =(void *)mmap_addr; /* success */
+		p = (void *)mmap_addr; /* success */
 		if (populate == 0) {
 			up_write(&mm->mmap_sem);
-			mmput(mm);
 			continue;
 		}
 
 		vma = find_vma(mm, sp_addr);
 		if (unlikely(!vma)) {
 			up_write(&mm->mmap_sem);
-			mmput(mm);
 			pr_err("share pool: allocation failed due to find %pK vma failure\n",
 			       (void *)sp_addr);
 			p = ERR_PTR(-EINVAL);
@@ -1461,24 +1441,22 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
 					size_aligned = ALIGN(size, PAGE_SIZE);
 					sp_flags &= ~SP_HUGEPAGE;
 					__sp_area_drop(spa);
-					mmput(mm);
 					goto try_again;
 				}
 			}
-
-			mmput(mm);
 			break;
 		}
-		mmput(mm);
 	}
 
+out:
+	up_read(&spg->rw_lock);
+
+	mutex_lock(&sp_mutex);
 	if (!IS_ERR(p)) {
 		stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id);
 		if (stat)
 			stat->alloc_size += size_aligned;
 	}
-
-out:
 	mutex_unlock(&sp_mutex);
 
 	/* this will free spa if mmap failed */
@@ -1556,10 +1534,6 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 		}
 	}
 
-	if (!mmget_not_zero(mm)) {
-		ret_addr = -ESPGMMEXIT;
-		goto put_file;
-	}
 	down_write(&mm->mmap_sem);
 
 	ret_addr = sp_mmap(mm, file, spa, &populate);
@@ -1604,8 +1578,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
 
 put_mm:
 	up_write(&mm->mmap_sem);
-	mmput(mm);
-put_file:
+
 	if (!spa->spg && file)
 		fput(file);
 
@@ -1769,10 +1742,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	 */
 	stat = sp_init_proc_stat(tsk, mm);
 	if (IS_ERR(stat)) {
+		mutex_unlock(&sp_mutex);
 		uva = stat;
 		pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat));
 		goto out_unlock;
 	}
+	mutex_unlock(&sp_mutex);
 
 	spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
 	if (spg == NULL) {
@@ -1794,6 +1769,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 	}
 
 	if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) {
+		up_read(&spg->rw_lock);
 		pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned);
 		goto out_drop_spa;
 	}
@@ -1808,12 +1784,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 			goto out_unlock;
 		}
 
+		down_read(&spg->rw_lock);
 		if (enable_share_k2u_spg)
 			spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG);
 		else
 			spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK);
 
 		if (IS_ERR(spa)) {
+			up_read(&spg->rw_lock);
 			if (printk_ratelimit())
 				pr_err("share pool: k2u(spg) failed due to alloc spa failure "
 				       "(potential no enough virtual memory when -75): %ld\n",
@@ -1831,14 +1809,18 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 			uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg);
 		else
 			uva = sp_make_share_kva_to_task(kva_aligned, spa, mm);
+
+		up_read(&spg->rw_lock);
 	} else {
 		/* group is dead, return -ENODEV */
 		pr_err("share pool: failed to make k2u, sp group is dead\n");
 	}
 
 	if (!IS_ERR(uva)) {
+		mutex_lock(&sp_mutex);
 		uva = uva + (kva - kva_aligned);
 		stat->k2u_size += size_aligned;
+		mutex_unlock(&sp_mutex);
 	} else {
 		/* associate vma and spa */
 		if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL))
@@ -1849,7 +1831,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 out_drop_spa:
 	__sp_area_drop(spa);
 out_unlock:
-	mutex_unlock(&sp_mutex);
 	mmput(mm);
 out_put_task:
 	put_task_struct(tsk);
@@ -2144,7 +2125,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 	unsigned int page_size;
 	struct sp_proc_stat *stat;
 
-	mutex_lock(&sp_mutex);
 	/*
 	 * at first we guess it's a hugepage addr
 	 * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u
@@ -2157,7 +2137,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 			if (printk_ratelimit())
 				pr_err("share pool: invalid input uva %pK in unshare uva\n",
 				       (void *)uva);
-			goto out_unlock;
+			goto out;
 		}
 	}
 
@@ -2259,10 +2239,14 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 			goto out_drop_area;
 		}
 
+		down_read(&spa->spg->rw_lock);
 		__sp_free(spa->spg, uva_aligned, size_aligned, NULL);
+		up_read(&spa->spg->rw_lock);
 	}
 
 	sp_dump_stack();
+
+	mutex_lock(&sp_mutex);
 	/* pointer stat may be invalid because of kthread buff_module_guard_work */
 	if (current->mm == NULL) {
 		kthread_stat.k2u_size -= spa->real_size;
@@ -2273,6 +2257,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 		else
 			WARN(1, "share_pool: %s: null process stat\n", __func__);
 	}
+	mutex_unlock(&sp_mutex);
 
 out_clr_flag:
 	/* deassociate vma and spa */
@@ -2281,8 +2266,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
 
 out_drop_area:
 	__sp_area_drop(spa);
-out_unlock:
-	mutex_unlock(&sp_mutex);
+out:
 	return ret;
 }
 
@@ -2446,7 +2430,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
 	check_interrupt_context();
 
 	if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 ||
-	    size> MMAP_SHARE_POOL_16G_SIZE)
+	    size > MMAP_SHARE_POOL_16G_SIZE)
 		return false;
 
 	mutex_lock(&sp_mutex);
@@ -2468,9 +2452,10 @@ EXPORT_SYMBOL_GPL(sp_config_dvpp_range);
 
 /* Check whether the address belongs to the share pool. */
 bool is_sharepool_addr(unsigned long addr)
 {
-    if (host_svm_sp_enable == false)
-        return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE);
-    return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END;
+	if (host_svm_sp_enable == false)
+		return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE);
+
+	return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END;
 }
 EXPORT_SYMBOL_GPL(is_sharepool_addr);
@@ -2515,7 +2500,8 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
 	return 0;
 }
 
-static void rb_spa_stat_show(struct seq_file *seq) {
+static void rb_spa_stat_show(struct seq_file *seq)
+{
 	struct rb_node *node;
 	struct sp_area *spa;
 
@@ -2814,6 +2800,36 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 }
 EXPORT_SYMBOL(sharepool_no_page);
 
+#define MM_WOULD_FREE	2
+
+void sp_group_exit(struct mm_struct *mm)
+{
+	struct sp_group *spg = NULL;
+	bool is_alive = true, unlock;
+
+	if (!enable_ascend_share_pool)
+		return;
+
+	spg = mm->sp_group;
+
+	/* If the mm_users is 2, it means that the mm is ready to be freed
+	 * because the last owner of this mm is in exiting process.
+	 */
+	if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) {
+		spg_exit_lock(&unlock);
+		down_write(&spg->rw_lock);
+		if (list_is_singular(&spg->procs))
+			is_alive = spg->is_alive = false;
+		list_del(&mm->sp_node);
+		up_write(&spg->rw_lock);
+		if (!is_alive)
+			blocking_notifier_call_chain(&sp_notifier_chain, 0,
+						     mm->sp_group);
+		atomic_dec(&mm->mm_users);
+		spg_exit_unlock(unlock);
+	}
+}
+
 struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
 			    unsigned int page_order, int node)
 {