From: Nicholas Piggin npiggin@gmail.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
https://lwn.net/ml/linux-kernel/20200825145753.529284-12-npiggin@gmail.com/
Don't distinguish between vmalloc and hugepage vmalloc, because there is no size print in alloc_large_system_hash() in v4.19.
This patch also adds page_order to vm_struct, which will break KABI. --------------
Support huge page vmalloc mappings. The config option HAVE_ARCH_HUGE_VMALLOC enables support on architectures that define HAVE_ARCH_HUGE_VMAP and support PMD-sized vmap mappings.
vmalloc will attempt to allocate PMD-sized pages if allocating PMD size or larger, and fall back to small pages if that was unsuccessful.
Allocations that do not use PAGE_KERNEL prot are not permitted to use huge pages, because not all callers expect this (e.g., module allocations vs strict module rwx).
This reduces TLB misses by nearly 30x on a `git diff` workload on a 2-node POWER9 (59,800 -> 2,100) and reduces CPU cycles by 0.54%.
This can result in more internal fragmentation and memory overhead for a given allocation; a boot option, nohugevmalloc, is added to disable it.
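As an illustrative sketch (not part of the patch): with this change, an ordinary PAGE_KERNEL allocation of at least PMD_SIZE can be transparently backed by PMD-sized pages, with a silent fallback to 4K pages and the nohugevmalloc boot option to disable the huge path.

#include <linux/vmalloc.h>

/*
 * Illustrative only: callers do not change. A large enough vmalloc()
 * request may now be mapped with 2MB PMD entries; if the huge allocation
 * or mapping fails, vmalloc falls back to small pages automatically.
 */
static void *demo_alloc_table(void)
{
	return vmalloc(16UL << 20);	/* 16MB: large enough for PMD mappings */
}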
Signed-off-by: Nicholas Piggin npiggin@gmail.com Signed-off-by: Rui Xiang rui.xiang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Zefan Li lizefan@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/Kconfig | 4 + include/linux/vmalloc.h | 1 + mm/vmalloc.c | 160 +++++++++++++++++++++++++++++++--------- 3 files changed, 130 insertions(+), 35 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig index e906cbb213444..00f55932ba781 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -559,6 +559,10 @@ config HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD config HAVE_ARCH_HUGE_VMAP bool
+config HAVE_ARCH_HUGE_VMALLOC + depends on HAVE_ARCH_HUGE_VMAP + bool + config HAVE_ARCH_SOFT_DIRTY bool
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 496ac80046c01..07b4b1141ed8a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -39,6 +39,7 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; + unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fc6394184a1ba..e76b806a6c003 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -41,6 +41,19 @@
#include "internal.h"
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +static bool __ro_after_init vmap_allow_huge = true; + +static int __init set_nohugevmalloc(char *str) +{ + vmap_allow_huge = false; + return 0; +} +early_param("nohugevmalloc", set_nohugevmalloc); +#else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ +static const bool vmap_allow_huge = false; +#endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ + struct vfree_deferred { struct llist_head list; struct work_struct wq; @@ -410,6 +423,61 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; }
+static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) +{ + pgd_t *pgd; + unsigned long next; + int err = 0; + int nr = 0; + + BUG_ON(addr >= end); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr); + if (err) + return err; + } while (pgd++, addr = next, addr != end); + + return 0; +} + +static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> PAGE_SHIFT; + + WARN_ON(page_shift < PAGE_SHIFT); + + if (page_shift == PAGE_SHIFT) + return vmap_small_pages_range_noflush(addr, end, prot, pages); + + for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_pages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + int err; + + err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -431,22 +499,7 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, int map_kernel_range_noflush(unsigned long addr, unsigned long size, pgprot_t prot, struct page **pages) { - unsigned long end = addr + size; - unsigned long next; - pgd_t *pgd; - int err = 0; - int nr = 0; - - BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); - do { - next = pgd_addr_end(addr, end); - err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr); - if (err) - return err; - } while (pgd++, addr = next, addr != end); - - return 0; + return vmap_pages_range_noflush(addr, addr + size, prot, pages, PAGE_SHIFT); }
int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, @@ -2270,11 +2323,11 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i;
- for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { struct page *page = area->pages[i];
BUG_ON(!page); - __free_pages(page, 0); + __free_pages(page, area->page_order); }
kvfree(area->pages); @@ -2403,9 +2456,12 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, unsigned int page_shift, int node) { struct page **pages; + unsigned long addr = (unsigned long)area->addr; + unsigned long size = get_vm_area_size(area); + unsigned int page_order = page_shift - PAGE_SHIFT; unsigned int nr_pages; unsigned long array_size; unsigned int i; @@ -2415,7 +2471,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 0 : __GFP_HIGHMEM;
- nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; + nr_pages = size >> PAGE_SHIFT; array_size = (unsigned long)nr_pages * sizeof(struct page *);
/* Please note that the recursion is strictly bounded. */ @@ -2434,27 +2490,27 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->pages = pages; area->nr_pages = nr_pages; + area->page_order = page_order;
- for (i = 0; i < area->nr_pages; i++) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; + int p;
- if (node == NUMA_NO_NODE) - page = alloc_page(alloc_mask|highmem_mask); - else - page = alloc_pages_node(node, alloc_mask|highmem_mask, 0); - + page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; goto fail; } - area->pages[i] = page; + + for (p = 0; p < (1U << page_order); p++) + area->pages[i + p] = page + p; + if (gfpflags_allow_blocking(gfp_mask|highmem_mask)) cond_resched(); }
- if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), - prot, pages) < 0) + if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) goto fail;
return area->addr; @@ -2462,7 +2518,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, fail: warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure, allocated %ld of %ld bytes", - (area->nr_pages*PAGE_SIZE), area->size); + (area->nr_pages*PAGE_SIZE), size); vfree(area->addr); return NULL; } @@ -2491,19 +2547,42 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, struct vm_struct *area; void *addr; unsigned long real_size = size; + unsigned long real_align = align; + unsigned int shift = PAGE_SHIFT;
- size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail;
+ if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { + unsigned long size_per_node; + + /* + * Try huge pages. Only try for PAGE_KERNEL allocations, + * others like modules don't yet expect huge pages in + * their allocations due to apply_to_page_range not + * supporting them. + */ + + size_per_node = size; + if (node == NUMA_NO_NODE) + size_per_node /= num_online_nodes(); + if (size_per_node >= PMD_SIZE) { + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); + } + } + +again: + size = PAGE_ALIGN(size); area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); if (!addr) - return NULL; + goto fail;
/* * First make sure the mappings are removed from all page-tables @@ -2523,8 +2602,19 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr;
fail: - warn_alloc(gfp_mask, NULL, + if (shift > PAGE_SHIFT) { + free_vm_area(area); + shift = PAGE_SHIFT; + align = real_align; + size = real_size; + goto again; + } + + if (!area) { + /* Warn for area allocation, page allocations already warn */ + warn_alloc(gfp_mask, NULL, "vmalloc: allocation failure: %lu bytes", real_size); + } return NULL; }
@@ -3503,7 +3593,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " %pS", v->caller);
if (v->nr_pages) - seq_printf(m, " pages=%d", v->nr_pages); + seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order);
if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr);
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The mm->owner field is currently only used for MEMCG, but the ascend share pool feature will use it later, so make it a general feature (CONFIG_MM_OWNER) and have CONFIG_MEMCG select it.
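A minimal sketch of how a consumer would use the generalized field (the demo_mm_owner() helper is hypothetical; it mirrors what the later share pool patches do via sp_get_task()):

#include <linux/mm_types.h>
#include <linux/sched.h>

/*
 * Hypothetical helper: resolve the task that owns an mm. mm->owner only
 * exists when CONFIG_MM_OWNER is set, which CONFIG_MEMCG now selects.
 */
static struct task_struct *demo_mm_owner(struct mm_struct *mm)
{
#ifdef CONFIG_MM_OWNER
	if (mm->owner)
		return mm->owner;
#endif
	return current;
}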
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mm_types.h | 2 +- init/Kconfig | 1 + kernel/exit.c | 4 ++-- kernel/fork.c | 4 ++-- mm/Kconfig | 4 ++++ mm/debug.c | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index eee88ebbcf35b..34ef2ad0ad1e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -455,7 +455,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in diff --git a/init/Kconfig b/init/Kconfig index 7334599eef536..c05347a29ca4d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -704,6 +704,7 @@ config MEMCG bool "Memory controller" select PAGE_COUNTER select EVENTFD + select MM_OWNER help Provides control over the memory footprint of tasks in a cgroup.
diff --git a/kernel/exit.c b/kernel/exit.c index c739d83cba988..2a32d32bdc03d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -330,7 +330,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } }
-#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -416,7 +416,7 @@ void mm_update_next_owner(struct mm_struct *mm) task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MM_OWNER */
/* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index e17aaa526c593..be67a4aa10631 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -962,7 +962,7 @@ static void mm_init_aio(struct mm_struct *mm) static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif @@ -970,7 +970,7 @@ static __always_inline void mm_clear_owner(struct mm_struct *mm,
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER mm->owner = p; #endif } diff --git a/mm/Kconfig b/mm/Kconfig index 0434aef47b44c..253fb184c8f8d 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -299,6 +299,10 @@ config VIRT_TO_BUS deprecated interface virt_to_bus(). All new architectures should probably not select this.
+config MM_OWNER + bool "Enable the ownership the mm owner" + help + This option enables mm_struct's to have an owner.
config MMU_NOTIFIER bool diff --git a/mm/debug.c b/mm/debug.c index 362ce581671e7..2da184b16bce0 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -129,7 +129,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_AIO "ioctx_table %px\n" #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER "owner %px " #endif "exe_file %px\n"
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
This is a preparation patch for share pool. It exports new functions to vmalloc huge pages and to vmap huge pages into virtually contiguous space.
The new header file share_pool.h is mainly used for the share pool feature; it exports the sp_xxx functions when the ascend_share_pool config is enabled and does nothing by default.
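An illustrative driver-side sketch of the new exports (the demo_* names are hypothetical, error handling is trimmed): vmalloc_hugepage_user() returns PMD-aligned, zeroed, hugepage-backed memory with VM_USERMAP set, and remap_vmalloc_hugepage_range() maps it into a user vma from an mmap() handler.

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_buf;

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* PMD-aligned, zeroed, hugepage-backed where possible */
	demo_buf = vmalloc_hugepage_user(size);
	if (!demo_buf)
		return -ENOMEM;

	/* pgoff counts huge pages into demo_buf; 0 maps from the start */
	return remap_vmalloc_hugepage_range(vma, demo_buf, 0);
}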
Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 354 +++++++++++++++++++++++++++++++++++++ include/linux/vmalloc.h | 9 + mm/vmalloc.c | 237 ++++++++++++++++++++++++- 3 files changed, 596 insertions(+), 4 deletions(-) create mode 100644 include/linux/share_pool.h
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h new file mode 100644 index 0000000000000..09afbae33d418 --- /dev/null +++ b/include/linux/share_pool.h @@ -0,0 +1,354 @@ +#ifndef LINUX_SHARE_POOL_H +#define LINUX_SHARE_POOL_H + +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> + +#define SP_HUGEPAGE (1 << 0) +#define SP_HUGEPAGE_ONLY (1 << 1) +#define SP_DVPP (1 << 2) + +#define SPG_ID_NONE -1 /* not associated with sp_group, only for specified thread */ +#define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ +#define SPG_ID_MIN 1 /* valid id should be >= 1 */ +#define SPG_ID_MAX 99999 +#define SPG_ID_AUTO_MIN 100000 +#define SPG_ID_AUTO_MAX 199999 +#define SPG_ID_AUTO 200000 /* generate group id automatically */ +#define SPG_ID_DVPP_PASS_THROUGH_MIN 800000 +#define SPG_ID_DVPP_PASS_THROUGH_MAX 899999 +#define SPG_ID_DVPP_PASS_THROUGH 900000 + +#define MAX_DEVID 1 /* the max num of Da-vinci devices */ + +#define VM_HUGE_PAGES 0x00001000 /* use for huge pages */ + +/* to align the pointer to the (next) PMD boundary */ +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) + +/* test whether an address (unsigned long or pointer) is aligned to PMD_SIZE */ +#define PMD_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PMD_SIZE) + +extern int sysctl_share_pool_hugepage_enable; + +extern int sysctl_ac_mode; + +extern int enable_ascend_share_pool; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |- 16G -|- 16G -|- 16G -|- 16G -| | | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | svm | sp | svm | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default memory setting for DVPP. 
+ */ +struct sp_group { + int id; + struct file *file; + struct file *file_hugetlb; + /* list head of processes */ + struct list_head procs; + /* list of sp_area */ + struct list_head spa_list; + /* number of sp_area */ + atomic_t spa_num; + /* total size of all sp_area from sp_alloc and k2u(spg) */ + atomic_t size; + /* record the number of hugepage allocation failures */ + int hugepage_failures; + /* is_alive == false means it's being destroyed */ + bool is_alive; + /* we define the creator process of a sp_group as owner */ + struct task_struct *owner; + /* dvpp_multi_spaces == true means multiple dvpp 16G spaces are set */ + bool dvpp_multi_spaces; + unsigned long dvpp_va_start; + unsigned long dvpp_size; + atomic_t use_count; +}; + +struct sp_walk_data { + struct page **pages; + unsigned int page_count; + unsigned long uva_aligned; + unsigned long page_size; + bool is_hugepage; +}; + +#ifdef CONFIG_ASCEND_SHARE_POOL + +#define MAP_SHARE_POOL 0x100000 + +#define MMAP_TOP_4G_SIZE 0x100000000UL + +/* 8T size */ +#define MMAP_SHARE_POOL_NORMAL_SIZE 0x80000000000UL +/* 8T size*/ +#define MMAP_SHARE_POOL_DVPP_SIZE 0x80000000000UL +/* 16G size */ +#define MMAP_SHARE_POOL_16G_SIZE 0x400000000UL +#define MMAP_SHARE_POOL_SIZE (MMAP_SHARE_POOL_NORMAL_SIZE + MMAP_SHARE_POOL_DVPP_SIZE) +/* align to 2M hugepage size, and MMAP_SHARE_POOL_TOP_16G_START should be align to 16G */ +#define MMAP_SHARE_POOL_END ((TASK_SIZE - MMAP_SHARE_POOL_DVPP_SIZE) & ~((1 << 21) - 1)) +#define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_SIZE) +#define MMAP_SHARE_POOL_16G_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_DVPP_SIZE) + +static inline void sp_init_mm(struct mm_struct *mm) +{ + mm->sp_group = NULL; + INIT_LIST_HEAD(&mm->sp_node); + mm->sp_stat_id = 0; +} + +extern int sp_group_add_task(int pid, int spg_id); +extern void sp_group_exit(struct mm_struct *mm); +extern void sp_group_post_exit(struct mm_struct *mm); +extern int sp_group_id_by_pid(int pid); +extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); +extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); + +extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id); +extern int sp_free(unsigned long addr); +extern void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id); +extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); +extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id); + +extern void sp_area_drop(struct vm_area_struct *vma); + +extern int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); +extern void sp_walk_page_free(struct sp_walk_data *sp_walk_data); + +extern int sp_register_notifier(struct notifier_block *nb); +extern int sp_unregister_notifier(struct notifier_block *nb); +extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); +extern bool is_sharepool_addr(unsigned long addr); +extern void proc_sharepool_init(void); + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + if (enable_ascend_share_pool) + return mm->owner; + else + return current; +} + +static inline bool sp_check_hugepage(struct page *p) +{ + if (enable_ascend_share_pool && PageHuge(p)) + return true; + + return false; +} + +static inline bool sp_is_enabled(void) +{ + return enable_ascend_share_pool ? 
true : false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & VM_HUGE_PAGES)) + return true; + + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ + if (enable_ascend_share_pool) + info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); +} + +extern struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node); + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ + if (PageHuge(page)) + put_page(page); + else + __free_pages(page, area->page_order); +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_SHARE_POOL)) + return true; + + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return !!(enable_ascend_share_pool && (vma->vm_flags & VM_HUGE_SPECIAL)); +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & MAP_SHARE_POOL)) + return true; + + return false; +} + +#else + +static inline int sp_group_add_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_group_exit(struct mm_struct *mm) +{ +} + +static inline void sp_group_post_exit(struct mm_struct *mm) +{ +} + +static inline int sp_group_id_by_pid(int pid) +{ + return -EPERM; +} + +static inline int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return -EPERM; +} + +static inline void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id) +{ + return NULL; +} + +static inline int sp_free(unsigned long addr) +{ + return -EPERM; +} + +static inline void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} + +static inline void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} +static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_init_mm(struct mm_struct *mm) +{ +} + +static inline void sp_area_drop(struct vm_area_struct *vma) +{ +} + +static inline int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} +static inline int sp_register_notifier(struct notifier_block *nb) +{ + return -EPERM; +} + +static inline int sp_unregister_notifier(struct notifier_block *nb) +{ + return -EPERM; +} +static inline bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return false; +} + +static inline bool is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline void proc_sharepool_init(void) +{ +} + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + return current; +} +static inline bool sp_check_hugepage(struct page *p) +{ + return false; +} + +static inline bool sp_is_enabled(void) +{ + return false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ +} + +static inline struct page *sp_alloc_pages(void *area, gfp_t mask, + unsigned int page_order, int node) +{ + return NULL; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ +} + 
+static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + return false; +} +#endif + +#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 07b4b1141ed8a..244eedb7591a7 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -95,6 +95,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, @@ -123,6 +125,13 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, void vmalloc_sync_mappings(void); void vmalloc_sync_unmappings(void);
+extern void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); +extern int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long size); +extern int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, + void *addr, unsigned long pgoff); /* * Lowlevel-APIs (not for driver use!) */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e76b806a6c003..511578705e0d8 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -33,6 +33,7 @@ #include <linux/bitops.h> #include <linux/overflow.h> #include <linux/rbtree_augmented.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -478,6 +479,37 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; }
+static int vmap_hugepages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> page_shift; + + for (i = 0; i < nr; i++) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_hugepages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + int err; + + err = vmap_hugepages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -589,6 +621,22 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_page);
+/* + * Walk a hugepage vmap address to the struct page it maps. + * return the head page that corresponds to the base page address. + */ +struct page *vmalloc_to_hugepage(const void *vmalloc_addr) +{ + struct page *huge; + + huge = vmalloc_to_page(vmalloc_addr); + if (huge && PageHuge(huge)) + return huge; + else + return NULL; +} +EXPORT_SYMBOL(vmalloc_to_hugepage); + /* * Map a vmalloc()-space virtual address to the physical page frame number. */ @@ -2243,7 +2291,12 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + unsigned long align = 1; + + if (sp_check_vm_huge_page(flags)) + align = PMD_SIZE; + + return __get_vm_area_node(size, align, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); }
@@ -2327,7 +2380,10 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i];
BUG_ON(!page); - __free_pages(page, area->page_order); + if (sp_is_enabled()) + sp_free_pages(page, area); + else + __free_pages(page, area->page_order); }
kvfree(area->pages); @@ -2452,6 +2508,43 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap);
+/** + * vmap_hugepag - map an array of huge pages into virtually contiguous space + * @pages: array of huge page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + unsigned long size; /* In bytes */ + + might_sleep(); + + if (count > totalram_pages) + return NULL; + + size = (unsigned long)count << PMD_SHIFT; + area = get_vm_area_caller(size, flags, __builtin_return_address(0)); + if (!area) + return NULL; + + if (vmap_hugepages_range((unsigned long)area->addr, + (unsigned long)area->addr + size, prot, + pages, PMD_SHIFT) < 0) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap_hugepage); + static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); @@ -2496,7 +2589,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; int p;
- page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order); + if (sp_is_enabled()) + page = sp_alloc_pages(area, alloc_mask|highmem_mask, + page_order, node); + else + page = alloc_pages_node(node, alloc_mask|highmem_mask, + page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; @@ -2564,7 +2662,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */
size_per_node = size; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE && !sp_is_enabled()) size_per_node /= num_online_nodes(); if (size_per_node >= PMD_SIZE) { shift = PMD_SHIFT; @@ -2827,6 +2925,55 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + /* 2M hugepa aligned */ + size = PMD_ALIGN(size); + + ret = __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + + /* * small helper routine , copy contents to buf from addr. * If the page is not present, fill zero. @@ -3152,6 +3299,85 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range);
+/** + * remap_vmalloc_hugepage_range_partial - map vmalloc hugepages + * to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc hugepage kernel memory + * @size: size of map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, unsigned long size) +{ + struct vm_struct *area; + + size = PMD_ALIGN(size); + + if (!PMD_ALIGNED(uaddr) || !PMD_ALIGNED(kaddr)) + return -EINVAL; + + area = find_vm_area(kaddr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (kaddr + size > area->addr + get_vm_area_size(area)) + return -EINVAL; + + do { + struct page *page = vmalloc_to_hugepage(kaddr); + int ret; + + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PMD_SIZE; + kaddr += PMD_SIZE; + size -= PMD_SIZE; + } while (size > 0); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range_partial); + +/** + * remap_vmalloc_hugepage_range - map vmalloc hugepages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of hugepages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_hugepage_range_partial(vma, vma->vm_start, + addr + (pgoff << PMD_SHIFT), + vma->vm_end - vma->vm_start); +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range); + /* * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose * not to have one. @@ -3613,6 +3839,9 @@ static int s_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages");
+ if (sp_is_enabled()) + seq_printf(m, " order=%d", v->page_order); + show_numa_info(m, v); seq_putc(m, '\n'); return 0;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The mm->sp_group field is mainly used to find the group which owns the mm, the group can use mm->sp_node to link and look up the mm, and mm->sp_stat_id is used for collecting memory statistics.
These changes affect and break KABI only when the ascend_share_pool config is enabled.
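A hypothetical sketch of how the new fields tie an mm to its group: mm->sp_node links into sp_group->procs, so the group membership can be walked (serialization is omitted here and left to the real share pool code).

#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/share_pool.h>

static int demo_count_members(struct sp_group *spg)
{
	struct mm_struct *mm;
	int n = 0;

	/* each member mm is linked into spg->procs via mm->sp_node */
	list_for_each_entry(mm, &spg->procs, sp_node)
		n++;
	return n;
}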
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mm_types.h | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 34ef2ad0ad1e8..51a85ba5ac915 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -470,6 +470,11 @@ struct mm_struct { #endif struct user_namespace *user_ns;
+#ifdef CONFIG_ASCEND_SHARE_POOL + struct sp_group *sp_group; + struct list_head sp_node; /* link to sp_group->procs */ + int sp_stat_id; +#endif /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
do_mmap/mmap_region/__mm_populate could only operate on the current process. The share pool now needs to operate on other processes and create memory mappings for them, so export new functions that take an explicit mm and can handle a different process. This does not change the current logic and is only used by share pool.
The share pool needs to remap vmalloc pages to user space, so introduce hugetlb_insert_hugepage() to support hugepage remapping.
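A hypothetical sketch of how the new exports can be combined to create and populate a mapping in a target mm instead of current->mm; do_vm_munmap(mm, addr, len) would tear it down later. Error handling is trimmed and the flags are placeholders.

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/sched/mm.h>

static unsigned long demo_map_into(struct mm_struct *mm, struct file *file,
				   unsigned long len)
{
	unsigned long addr, populate = 0;
	LIST_HEAD(uf);

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	addr = __do_mmap(mm, file, 0, len, PROT_READ | PROT_WRITE,
			 MAP_SHARED | MAP_POPULATE, 0, 0, &populate, &uf);
	up_write(&mm->mmap_sem);

	/* fault in the pages of the target mm, not current->mm */
	if (!IS_ERR_VALUE(addr) && populate)
		do_mm_populate(mm, addr, populate, 0);

	return addr;
}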
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/hugetlb.h | 4 +- include/linux/mm.h | 27 +++++++++++++- mm/gup.c | 28 ++++++++++---- mm/hugetlb.c | 42 +++++++++++++++++++++ mm/memory.c | 7 +++- mm/mmap.c | 83 +++++++++++++++++++++++++++++++---------- 6 files changed, 160 insertions(+), 31 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index de6cdfa51694c..2383d81ca2d6d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -384,7 +384,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, const struct hstate *hugetlb_get_hstate(void); struct page *hugetlb_alloc_hugepage(int nid, int flag); int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, - pgprot_t prot, struct page *hpage); + pgprot_t prot, struct page *hpage); #else static inline const struct hstate *hugetlb_get_hstate(void) { @@ -402,6 +402,8 @@ static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, return -EPERM; } #endif +int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, + struct page *hpage, pgprot_t prot);
/* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8aa492fb7d538..e4a20206c3f39 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -230,6 +230,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #define VM_PA32BIT 0x400000000 /* Physical address is within 4G */
+#ifdef CONFIG_ASCEND_SHARE_POOL +#define VM_HUGE_SPECIAL 0x800000000 /* Special hugepage flag used by share pool */ +#endif + #ifdef CONFIG_COHERENT_DEVICE #define VM_CDM 0x100000000 /* Contains coherent device memory */ #endif @@ -247,11 +251,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS @@ -267,6 +273,12 @@ extern unsigned int kobjsize(const void *objp); #endif #endif /* CONFIG_ARCH_HAS_PKEYS */
+#if defined(CONFIG_ASCEND_SHARE_POOL) +# define VM_SHARE_POOL VM_HIGH_ARCH_5 +#else +# define VM_SHARE_POOL VM_NONE +#endif + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) @@ -620,7 +632,7 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags, /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); - +struct page *vmalloc_to_hugepage(const void *addr); /* * Determine if an address is within the vmalloc range * @@ -2407,10 +2419,14 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); + extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(unsigned long start, size_t len_in, int behavior); - +extern unsigned long __do_mmap(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, unsigned long prot, + unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf); static inline unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, @@ -2428,14 +2444,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len) /* Ignore errors */ (void) __mm_populate(addr, len, 1); } +extern int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors); #else static inline void mm_populate(unsigned long addr, unsigned long len) {} +int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors) +{ +} #endif
/* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); +extern int do_vm_munmap(struct mm_struct *mm, unsigned long start, size_t len); extern unsigned long do_vm_mmap(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flag, unsigned long pgoff); diff --git a/mm/gup.c b/mm/gup.c index 5801d4bd523a6..6372fb45e2dca 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -13,6 +13,7 @@ #include <linux/sched/signal.h> #include <linux/rwsem.h> #include <linux/hugetlb.h> +#include <linux/share_pool.h>
#include <asm/mmu_context.h> #include <asm/pgtable.h> @@ -1228,6 +1229,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; + struct task_struct *tsk;
VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -1253,24 +1255,22 @@ long populate_vma_page_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE;
+ tsk = sp_get_task(mm); /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, + return __get_user_pages(tsk, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking); }
/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * do_mm_populate - populate and/or mlock pages within a range of + * address space for the specified mm_struct. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors) { - struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; @@ -1321,6 +1321,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) return ret; /* 0 or negative error code */ }
+/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + return do_mm_populate(current->mm, start, len, ignore_errors); +} + /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 87f0f2bd6410b..7d57d6a943c25 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -26,6 +26,7 @@ #include <linux/swapops.h> #include <linux/jhash.h> #include <linux/mman.h> +#include <linux/share_pool.h>
#include <asm/page.h> #include <asm/pgtable.h> @@ -4010,6 +4011,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, }
page = alloc_huge_page(vma, haddr, 0); + if (IS_ERR(page) && sp_check_vm_share_pool(vma->vm_flags)) { + page = alloc_huge_page_node(hstate_file(vma->vm_file), + numa_mem_id()); + if (!page) + page = ERR_PTR(-ENOMEM); + } if (IS_ERR(page)) { /* * Returning error will result in faulting task being @@ -5359,6 +5366,41 @@ int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte_by_pa);
+int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, + struct page *hpage, pgprot_t prot) +{ + struct hstate *h = hstate_vma(vma); + int anon_rmap = 0; + spinlock_t *ptl; + pte_t *ptep; + pte_t pte; + struct mm_struct *mm = vma->vm_mm; + + ptep = hugetlb_huge_pte_alloc(mm, addr, huge_page_size(h)); + if (!ptep) + return -ENXIO; + + get_page(hpage); + + ptl = huge_pte_lock(h, mm, ptep); + if (anon_rmap) { + ClearPagePrivate(hpage); + hugepage_add_new_anon_rmap(hpage, vma, addr); + } else { + page_dup_rmap(hpage, true); + } + + pte = make_huge_pte(vma, hpage, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, addr, ptep, pte); + + hugetlb_count_add(pages_per_huge_page(h), mm); + + spin_unlock(ptl); + + return 0; +} + #ifdef CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES
static int __init ascend_enable_charge_migrate_hugepages(char *s) diff --git a/mm/memory.c b/mm/memory.c index 56e57897d565f..6530d76a40af8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -70,6 +70,7 @@ #include <linux/dax.h> #include <linux/oom.h> #include <linux/ktask.h> +#include <linux/share_pool.h>
#include <asm/io.h> #include <asm/mmu_context.h> @@ -1540,7 +1541,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } - return insert_page(vma, addr, page, vma->vm_page_prot); + + if (sp_check_hugepage(page)) + return hugetlb_insert_hugepage(vma, addr, page, vma->vm_page_prot); + else + return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page);
diff --git a/mm/mmap.c b/mm/mmap.c index f7f1fd3b5fa39..9c9a4a98abb21 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -47,6 +47,7 @@ #include <linux/oom.h> #include <linux/sched/mm.h> #include <linux/swapops.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -178,6 +179,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); + sp_area_drop(vma); vm_area_free(vma); return next; } @@ -1119,6 +1121,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL;
+ /* don't merge this kind of vma as sp_area couldn't be merged */ + if (sp_check_vm_share_pool(vm_flags)) + return NULL; + if (prev) next = prev->vm_next; else @@ -1373,12 +1379,17 @@ int unregister_mmap_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_mmap_notifier); #endif
-static inline unsigned long -__do_mmap(struct file *file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, + struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf); + +inline unsigned long +__do_mmap(struct mm_struct *mm, struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, + struct list_head *uf) { - struct mm_struct *mm = current->mm; int pkey = 0;
*populate = 0; @@ -1403,6 +1414,10 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr);
+ /* the MAP_DVPP couldn't work with MAP_SHARE_POOL */ + if ((flags & MAP_DVPP) && sp_mmap_check(flags)) + return -EINVAL; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) @@ -1567,7 +1582,7 @@ __do_mmap(struct file *file, unsigned long addr, unsigned long len, if (flags & MAP_CHECKNODE) set_vm_checknode(&vm_flags, flags);
- addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = __mmap_region(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1737,12 +1752,11 @@ do_user_swap(struct mm_struct *mm, unsigned long addr_start, unsigned long len, }
static inline unsigned long -do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) +do_uswap_mmap(struct mm_struct *mm, struct file *file, unsigned long addr, + unsigned long len, unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) { - struct mm_struct *mm = current->mm; unsigned long old_addr = addr; struct page **pages = NULL; unsigned long ret; @@ -1758,7 +1772,7 @@ do_uswap_mmap(struct file *file, unsigned long addr, unsigned long len, /* mark the vma as special to avoid merging with other vmas */ vm_flags |= VM_SPECIAL;
- addr = __do_mmap(file, addr, len, prot, flags, vm_flags, pgoff, + addr = __do_mmap(mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); if (IS_ERR_VALUE(addr)) { ret = addr; @@ -1788,10 +1802,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, { #ifdef CONFIG_USERSWAP if (enable_userswap && (flags & MAP_REPLACE)) - return do_uswap_mmap(file, addr, len, prot, flags, vm_flags, - pgoff, populate, uf); + return do_uswap_mmap(current->mm, file, addr, len, prot, flags, + vm_flags, pgoff, populate, uf); #endif - return __do_mmap(file, addr, len, prot, flags, vm_flags, + return __do_mmap(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); }
@@ -1939,11 +1953,11 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; }
-unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; int error; struct rb_node **rb_link, *rb_parent; @@ -2105,6 +2119,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return error; }
+unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) +{ + return __mmap_region(current->mm, file, addr, len, vm_flags, pgoff, uf); +} + unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* @@ -2356,6 +2377,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + return vm_unmapped_area(&info); } #endif @@ -2406,6 +2429,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + addr = vm_unmapped_area(&info);
/* @@ -2423,6 +2448,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + addr = vm_unmapped_area(&info); }
@@ -3094,6 +3121,24 @@ int vm_munmap(unsigned long start, size_t len) } EXPORT_SYMBOL(vm_munmap);
+int do_vm_munmap(struct mm_struct *mm, unsigned long start, size_t len) +{ + int ret; + LIST_HEAD(uf); + + if (mm == NULL) + return -EINVAL; + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + ret = do_munmap(mm, start, len, &uf); + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); + return ret; +} +EXPORT_SYMBOL(do_vm_munmap); + /* * Must acquire an additional reference to the mm struct to prevent the * mm struct of other process from being released.
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Expose the per-task sp_group state so users can determine the status of the sp_group.
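A userspace sketch of reading the new node; the exact output format is whatever proc_sp_group_state() prints and is not specified here.

#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/sp_group", "r");

	if (!f)
		return 1;	/* kernel not built with CONFIG_ASCEND_SHARE_POOL */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}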
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/base.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 24fb694357338..349c01c68e576 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include <linux/sched/stat.h> #include <linux/flex_array.h> #include <linux/posix-timers.h> +#include <linux/share_pool.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3055,6 +3056,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", S_IRUGO, proc_sp_group_state), +#endif };
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3435,6 +3439,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", S_IRUGO, proc_sp_group_state), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Add new nodes for /proc/sys/kernel/share_pool_hugepage_enable and /proc/sys/vm/sharepool_ac_mode.
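A userspace sketch of toggling the hugepage sysctl; per the table entries in this patch, both nodes accept 0 or 1.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/share_pool_hugepage_enable", "w");

	if (!f)
		return 1;
	fputs("1\n", f);	/* 1: allow hugepage allocations in share pool */
	return fclose(f) ? 1 : 0;
}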
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sysctl.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f8854e7f6fdff..3417b45058483 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,7 @@ #include <linux/bpf.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> +#include <linux/share_pool.h>
#include "../lib/kstrtox.h"
@@ -1267,6 +1268,18 @@ static struct ctl_table kern_table[] = { .extra2 = &three, },
+#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + { + /* 0: disable, 1: enable */ + .procname = "share_pool_hugepage_enable", + .data = &sysctl_share_pool_hugepage_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; @@ -1758,6 +1771,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + { + .procname = "sharepool_ac_mode", + .data = &sysctl_ac_mode, + .maxlen = sizeof(sysctl_ac_mode), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } };
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
proc_sharepool_init() should be called in proc_root_init() when the share pool feature is enabled.
is_vm_huge_special() is used to recognize VMAs with VM_HUGE_SPECIAL and handle them in the page walker; it is not used by default.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/root.c | 2 ++ mm/pagewalk.c | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca60..33f9e1a627cc7 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,6 +23,7 @@ #include <linux/pid_namespace.h> #include <linux/parser.h> #include <linux/cred.h> +#include <linux/share_pool.h>
#include "internal.h"
@@ -140,6 +141,7 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + proc_sharepool_init();
register_filesystem(&proc_fs_type); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 286f00cc1c065..0c0aeb878d426 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,6 +3,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/share_pool.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -178,7 +179,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct hstate *h = hstate_vma(vma); + struct hstate *h = is_vm_huge_special(vma) ? &default_hstate : hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); @@ -247,7 +248,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, int err = 0; struct vm_area_struct *vma = walk->vma;
- if (vma && is_vm_hugetlb_page(vma)) { + if (vma && ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma))) { if (walk->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
fork() creates a new mm for the child process; the new mm must not inherit any share pool state from the parent, so it needs to be cleared.
exit() drops the mm with mmput() and frees its memory; if the mm already belongs to an sp_group, the group must be cleaned up first: sp_group_exit() runs before exit_mmap() and sp_group_post_exit() runs afterwards.
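sp_init_mm() is provided by include/linux/share_pool.h and does not appear in this diff; conceptually it only has to reset the per-mm share pool fields that the child must not inherit. A rough sketch under that assumption (not the actual implementation):

/* Hypothetical sketch of the per-mm reset done for a freshly created mm. */
static inline void sp_init_mm(struct mm_struct *mm)
{
	mm->sp_group = NULL;		/* not a member of any sp_group yet */
	INIT_LIST_HEAD(&mm->sp_node);	/* not linked into any group's proc list */
	mm->sp_stat_id = 0;		/* no per-process statistics entry */
}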
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/fork.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/kernel/fork.c b/kernel/fork.c index be67a4aa10631..c410887b502b2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> +#include <linux/share_pool.h>
#include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1027,6 +1028,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns); + + sp_init_mm(mm); + return mm;
fail_nocontext: @@ -1055,11 +1059,16 @@ static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users));
+ sp_group_exit(mm); + uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + + sp_group_post_exit(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) {
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The share pool is a large feature, mainly used to share user virtual memory between processes in the same group. It is used in these steps:
1. Process A creates a new group, which is owned by process A.
2. Process A adds process B to the group.
3. Process A adds process C to the same group.
4. Process B allocates new memory at a VA and writes data into it.
5. The VA is sent to process C over IPC, so process C receives it.
6. Process C accesses the VA and reads the data directly.
7. Process A can add more processes to the group to share the memory.
8. The memory is freed with the free function, or released when the group exits.
The new feature is enabled only when both CONFIG_ASCEND_SHARE_POOL and the enable_ascend_share_pool boot flag are set; it has no effect when disabled. A call-flow sketch of the steps above follows.
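For in-kernel callers, the steps map onto the interfaces exported with EXPORT_SYMBOL_GPL in mm/share_pool.c. The sketch below shows the rough call flow with error handling trimmed; it is an illustration, not code from the patch, and assumes the usual headers for current, ERR_PTR and SZ_2M.

#include <linux/share_pool.h>
#include <linux/sched.h>
#include <linux/err.h>
#include <linux/sizes.h>

/* Sketch: process A (current) builds a group and shares 2MB with B and C. */
static int sp_example_flow(int pid_b, int pid_c)
{
	void *va;
	int spg_id;

	/* steps 1-3: create a group with an auto-assigned ID, then add B and C */
	spg_id = sp_group_add_task(current->tgid, SPG_ID_AUTO);
	if (spg_id < 0)
		return spg_id;
	sp_group_add_task(pid_b, spg_id);
	sp_group_add_task(pid_c, spg_id);

	/* step 4: allocate memory that is mapped into every group member */
	va = sp_alloc(SZ_2M, SP_HUGEPAGE, spg_id);
	if (IS_ERR(va))
		return PTR_ERR(va);

	/*
	 * steps 5-7 happen in userspace: the VA is passed to the other
	 * processes over IPC and dereferenced there directly.
	 */

	/* step 8: free the region for the whole group */
	return sp_free((unsigned long)va);
}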
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Li Ming limingming.li@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wu Peng wupeng58@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/Kconfig | 9 + mm/Makefile | 1 + mm/share_pool.c | 2278 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 2288 insertions(+) create mode 100644 mm/share_pool.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4c10d7be55881..b0b19554359fa 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1460,6 +1460,15 @@ config ASCEND_AUTO_TUNING_HUGEPAGE help The hugepage auto-tuning means the kernel dynamically manages the number of huage pages. To achieve this purpose, custom interfaces are required. + +config ASCEND_SHARE_POOL + bool "Enable support for the Share Pool Memory" + default n + select ARCH_USES_HIGH_VMA_FLAGS + select MM_OWNER + help + This feature allows multiple processes to share virtual memory both + in kernel and user level, which is only enabled for ascend platform. endif
endmenu diff --git a/mm/Makefile b/mm/Makefile index 876359db8b05f..deee05d22a853 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -107,3 +107,4 @@ obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o +obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o diff --git a/mm/share_pool.c b/mm/share_pool.c new file mode 100644 index 0000000000000..0d29c85beb4d4 --- /dev/null +++ b/mm/share_pool.c @@ -0,0 +1,2278 @@ +/* + * Huawei Ascend Share Pool Memory + * + * Copyright (C) 2020 Huawei Limited + * Author: Tang Yizhou tangyizhou@huawei.com + * Zefan Li lizefan@huawei.com + * Wu Peng wupeng58@huawei.com + * Ding Tianhong dingtgianhong@huawei.com + * Zhou Guanghui zhouguanghui1@huawei.com + * Li Ming limingming.li@huawei.com + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/share_pool.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> +#include <linux/mm_types.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/shmem_fs.h> +#include <linux/file.h> +#include <linux/printk.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/atomic.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/idr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* access control mode macros */ +#define AC_NONE 0 +#define AC_SINGLE_OWNER 1 + +#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) +#define ESPGMMEXIT 4000 + +#define byte2kb(size) ((size) / 1024) + +/* mdc scene hack */ +int enable_mdc_default_group; +static const int mdc_default_group_id = 1; + +/* access control mode */ +int sysctl_ac_mode = AC_NONE; + +/* idr of all sp_groups */ +static DEFINE_IDR(sp_group_idr); + +static DEFINE_MUTEX(sp_mutex); + +static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain); + +static DEFINE_IDA(sp_group_id_ida); + +/*** Statistical and maintenance tools ***/ + +/* idr of all sp_proc_stats */ +static DEFINE_IDR(sp_stat_idr); + +/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + long amount; +}; + +/* for kthread buff_module_guard_work */ +static struct sp_proc_stat kthread_stat = {0}; + +/* The caller must hold sp_mutex. 
*/ +static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) +{ + struct sp_proc_stat *stat; + int id = tsk->mm->sp_stat_id; + int tgid = tsk->tgid; + int ret; + + if (id) { + stat = idr_find(&sp_stat_idr, id); + /* other threads in the same process may have initialized it */ + if (stat) + return stat; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (stat == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc proc stat failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + + stat->amount = 0; + get_task_comm(stat->comm, tsk); + ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: proc stat idr alloc failed %d\n", ret); + kfree(stat); + return ERR_PTR(ret); + } + + tsk->mm->sp_stat_id = ret; + return stat; +} + +/* statistics of all sp area, protected by sp_area_lock */ +struct sp_spa_stat { + unsigned int total_num; + unsigned int alloc_num; + unsigned int k2u_task_num; + unsigned int k2u_spg_num; + unsigned long total_size; + unsigned long alloc_size; + unsigned long k2u_task_size; + unsigned long k2u_spg_size; +}; + +static struct sp_spa_stat spa_stat = {0}; + +/* statistics of all sp group born from sp_alloc and k2u(spg) */ +struct sp_spg_stat { + atomic_t spa_total_num; + atomic_t spa_total_size; +}; + +static struct sp_spg_stat spg_stat = {0}; + +/*** Global share pool VA allocator ***/ + +enum spa_type { + SPA_TYPE_ALLOC = 1, + SPA_TYPE_K2TASK, + SPA_TYPE_K2SPG, +}; + +/* + * We bump the reference when each mmap succeeds, and it will be dropped + * when vma is about to release, so sp_area object will be automatically + * freed when all tasks in the sp group has exited. + */ +struct sp_area { + unsigned long va_start; + unsigned long va_end; /* va_end always align to hugepage */ + unsigned long real_size; /* real size with alignment */ + bool is_hugepage; + atomic_t use_count; /* How many vmas use this VA region */ + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head link; /* link to the spg->head */ + struct sp_group *spg; + enum spa_type type; /* where spa born from */ +}; +static DEFINE_SPINLOCK(sp_area_lock); +static struct rb_root sp_area_root = RB_ROOT; +bool host_svm_sp_enable = false; + +int sysctl_share_pool_hugepage_enable = 1; + +static unsigned long spa_size(struct sp_area *spa) +{ + return spa->real_size; +} + +static struct file *spa_file(struct sp_area *spa) +{ + if (spa->is_hugepage) + return spa->spg->file_hugetlb; + else + return spa->spg->file; +} + +/* the caller should hold sp_area_lock */ +static int spa_inc_usage(enum spa_type type, unsigned long size) +{ + /* + * all the calculations won't overflow due to system limitation and + * parameter checking in sp_alloc_area() + */ + spa_stat.total_num += 1; + spa_stat.total_size += size; + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num += 1; + spa_stat.alloc_size += size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num += 1; + spa_stat.k2u_task_size += size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num += 1; + spa_stat.k2u_spg_size += size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + return -EINVAL; + } + return 0; +} + +/* the caller should hold sp_area_lock */ +static int spa_dec_usage(enum spa_type type, unsigned long size) +{ + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num -= 1; + spa_stat.alloc_size -= size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num -= 1; + spa_stat.k2u_task_size 
-= size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num -= 1; + spa_stat.k2u_spg_size -= size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + spin_unlock(&sp_area_lock); + return -EINVAL; + } + spa_stat.total_num -= 1; + spa_stat.total_size -= size; + return 0; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate); + +static void free_sp_group(struct sp_group *spg) +{ + fput(spg->file); + fput(spg->file_hugetlb); + idr_remove(&sp_group_idr, spg->id); + if ((spg->id >= SPG_ID_AUTO_MIN && spg->id <= SPG_ID_AUTO_MAX) || + (spg->id >= SPG_ID_DVPP_PASS_THROUGH_MIN && + spg->id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) + ida_free(&sp_group_id_ida, (unsigned int)spg->id); + kfree(spg); +} + +/* The caller must hold sp_mutex. */ +static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + int ret = 0; + + if (spg_id == SPG_ID_DEFAULT) { + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) + return NULL; + + spg = tsk->mm->sp_group; + put_task_struct(tsk); + } else { + spg = idr_find(&sp_group_idr, spg_id); + } + + return spg; +} + +int sp_group_id_by_pid(int pid) +{ + struct sp_group *spg; + int spg_id = -ENODEV; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) + spg_id = spg->id; + + mutex_unlock(&sp_mutex); + return spg_id; +} +EXPORT_SYMBOL_GPL(sp_group_id_by_pid); + +/* The caller must hold sp_mutex. */ +static struct sp_group *find_or_alloc_sp_group(int spg_id) +{ + struct sp_group *spg; + int ret; + char name[20]; + + spg = idr_find(&sp_group_idr, spg_id); + if (!spg) { + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + spg = kzalloc(sizeof(*spg), GFP_KERNEL); + if (spg == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc spg failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + spg->id = spg_id; + atomic_set(&spg->spa_num, 0); + atomic_set(&spg->size, 0); + spg->is_alive = true; + spg->hugepage_failures = 0; + spg->dvpp_multi_spaces = false; + spg->owner = current->group_leader; + atomic_set(&spg->use_count, 0); + INIT_LIST_HEAD(&spg->procs); + INIT_LIST_HEAD(&spg->spa_list); + + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, + GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: create group idr alloc failed\n"); + goto out_kfree; + } + + sprintf(name, "sp_group_%d", spg_id); + spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE); + if (IS_ERR(spg->file)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for small page failed %ld\n", + PTR_ERR(spg->file)); + ret = PTR_ERR(spg->file); + goto out_idr; + } + + spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(spg->file_hugetlb)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for hugepage failed %ld\n", + PTR_ERR(spg->file_hugetlb)); + ret = PTR_ERR(spg->file_hugetlb); + goto out_fput; + } + } + + return spg; + +out_fput: + fput(spg->file); +out_idr: + idr_remove(&sp_group_idr, spg_id); +out_kfree: + kfree(spg); + return ERR_PTR(ret); +} + +static void __sp_area_drop_locked(struct sp_area *spa); + +/* The caller must hold sp_mutex. 
*/ +static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) +{ + struct sp_area *spa, *prev = NULL; + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &mm->sp_group->spa_list, link) { + if (&spa->link == stop) + break; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + err = do_munmap(mm, spa->va_start, spa_size(spa), NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap task areas\n", + (void *)spa->va_start); + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + + spin_unlock(&sp_area_lock); + up_write(&mm->mmap_sem); + mmput(mm); +} + +/** + * sp_group_add_task - add a process to an sp_group + * @pid: the pid of the task to be added + * @spg_id: the ID of the sp_group + * + * A thread group can't be added to more than one sp_group. + * + * Return: The manually allocated ID is between [SPG_ID_MIN, SPG_ID_MAX] + * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX] + * When negative, the return value is -errno. + */ +int sp_group_add_task(int pid, int spg_id) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + int ret = 0; + struct sp_area *spa, *prev = NULL; + struct sp_proc_stat *stat; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if ((spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) + && spg_id != SPG_ID_DVPP_PASS_THROUGH) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed due to invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (!spg_valid(spg)) { + mutex_unlock(&sp_mutex); + pr_err("share pool: task add group failed because group id %d hasn't been create or dead\n", + spg_id); + return -EINVAL; + } + mutex_unlock(&sp_mutex); + } + + if (spg_id == SPG_ID_AUTO) { + spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, + SPG_ID_AUTO_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed\n"); + return spg_id; + } + } + + if (spg_id == SPG_ID_DVPP_PASS_THROUGH) { + spg_id = ida_alloc_range(&sp_group_id_ida, + SPG_ID_DVPP_PASS_THROUGH_MIN, + SPG_ID_DVPP_PASS_THROUGH_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed" + "in DVPP pass through\n"); + return spg_id; + } + } + + mutex_lock(&sp_mutex); + + rcu_read_lock(); + + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else if (tsk->mm->sp_group) /* if it's already in a sp_group */ + ret = -EEXIST; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_unlock; + + spg = find_or_alloc_sp_group(spg_id); + if (IS_ERR(spg) || !spg_valid(spg)) { + ret = PTR_ERR(spg); + goto out_put_task; + } + /* access control permission check */ + if (sysctl_ac_mode == AC_SINGLE_OWNER) { + if (spg->owner != current->group_leader) { + ret = -EPERM; + goto out_put_task; + } + } + + /* per process statistics initialization */ + stat = sp_init_proc_stat(tsk); + if (IS_ERR(stat)) { + ret = PTR_ERR(stat); + pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + goto out_put_task; + } + + mm = tsk->mm; + mm->sp_group = 
spg; + atomic_inc(&spg->use_count); + list_add_tail(&tsk->mm->sp_node, &spg->procs); + /* + * create mappings of existing shared memory segments into this + * new process' page table. + */ + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &spg->spa_list, link) { + unsigned long populate = 0; + struct file *file = spa_file(spa); + void *p; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + p = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p) && (PTR_ERR(p) != -ESPGMMEXIT)) { + sp_munmap_task_areas(mm, &spa->link); + ret = PTR_ERR(p); + pr_err("share pool: task add group sp mmap failed, ret %d\n", ret); + spin_lock(&sp_area_lock); + break; + } + + if (PTR_ERR(p) == -ESPGMMEXIT) { + pr_err("share pool: task add group sp mmap failed, ret -ESPGMEXIT\n"); + spin_lock(&sp_area_lock); + ret = -ESPGMMEXIT; + break; + } + + if (populate) { + ret = do_mm_populate(mm, spa->va_start, populate, 0); + if (ret) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed when mm populate failed: %d\n", + ret); + sp_munmap_task_areas(mm, spa->link.next); + } + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + spin_unlock(&sp_area_lock); + + if (unlikely(ret)) { + idr_remove(&sp_stat_idr, mm->sp_stat_id); + kfree(stat); + } + +out_put_task: + put_task_struct(tsk); +out_unlock: + mutex_unlock(&sp_mutex); + return ret == 0 ? spg_id : ret; +} +EXPORT_SYMBOL_GPL(sp_group_add_task); + +static void spg_exit_lock(bool *unlock) +{ + switch (mutex_trylock_recursive(&sp_mutex)) { + case MUTEX_TRYLOCK_RECURSIVE: + *unlock = false; + break; + case MUTEX_TRYLOCK_FAILED: + mutex_lock(&sp_mutex); + *unlock = true; + break; + case MUTEX_TRYLOCK_SUCCESS: + *unlock = true; + break; + default: + BUG(); + } +} + +static void spg_exit_unlock(bool unlock) +{ + if (unlock) + mutex_unlock(&sp_mutex); +} + +/* + * Do cleanup when a process exits. + */ +void sp_group_exit(struct mm_struct *mm) +{ + bool is_alive = true; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + /* + * Nothing to do if this thread group doesn't belong to any sp_group. + * No need to protect this check with lock because we can add a task + * to a group if !PF_EXITING. + */ + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + if (list_is_singular(&mm->sp_group->procs)) + is_alive = mm->sp_group->is_alive = false; + list_del(&mm->sp_node); + spg_exit_unlock(unlock); + + /* + * To avoid calling this with sp_mutex held, we first mark the + * sp_group as dead and then send the notification and then do + * the real cleanup in sp_group_post_exit(). + */ + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, + mm->sp_group); +} + +void sp_group_post_exit(struct mm_struct *mm) +{ + bool is_alive; + struct sp_proc_stat *stat; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + is_alive = mm->sp_group->is_alive; + + /* pointer stat must be valid, we don't need to check sanity */ + stat = idr_find(&sp_stat_idr, mm->sp_stat_id); + /* + * There are two basic scenarios when a process in the share pool is + * exiting but its share pool memory usage is not 0. + * 1. Process A called sp_alloc(), but it terminates without calling + * sp_free(). Then its share pool memory usage is a positive number. + * 2. Process A never called sp_alloc(), and process B in the same spg + * called sp_alloc() to get an addr u. 
Then A gets u somehow and + * called sp_free(u). Now A's share pool memory usage is a negative + * number. Notice B's memory usage will be a positive number. + * + * We decide to print a info when seeing both of the scenarios. + */ + if (stat && stat->amount != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB\n", + stat->comm, mm->sp_stat_id, + mm->sp_group->id, byte2kb(stat->amount)); + + idr_remove(&sp_stat_idr, mm->sp_stat_id); + + if (atomic_dec_and_test(&mm->sp_group->use_count)) { + BUG_ON(is_alive); + free_sp_group(mm->sp_group); + } + spg_exit_unlock(unlock); + + kfree(stat); +} + +/* the caller must hold sp_area_lock */ +static void __insert_sp_area(struct sp_area *spa) +{ + struct rb_node **p = &sp_area_root.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct sp_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct sp_area, rb_node); + if (spa->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (spa->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&spa->rb_node, parent, p); + rb_insert_color(&spa->rb_node, &sp_area_root); +} + +/* + * Allocate a region of VA from the share pool. + * @size - the size of VA to allocate + * + * The caller must hold must sp_mutex when input parameter spg is not NULL + * + * Return NULL if fail. + */ +static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, + struct sp_group *spg, enum spa_type type) +{ + struct sp_area *spa; + struct rb_node *n; + unsigned long vstart = MMAP_SHARE_POOL_START; + unsigned long vend = MMAP_SHARE_POOL_16G_START; + unsigned long addr; + unsigned long size_align = ALIGN(size, 1 << 21); /* align to 2M */ + + if ((flags & SP_DVPP)) { + if (host_svm_sp_enable == false) { + vstart = MMAP_SHARE_POOL_16G_START; + vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; + } else { + vstart = spg->dvpp_va_start; + vend = spg->dvpp_va_start + spg->dvpp_size; + } + } + + addr = vstart; + + if (!sysctl_share_pool_hugepage_enable) + flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); + if (unlikely(!spa)) { + if (printk_ratelimit()) + pr_err("share pool: alloc spa failed due to lack of memory\n"); + return NULL; + } + + spin_lock(&sp_area_lock); + + n = sp_area_root.rb_node; + if (n) { + struct sp_area *first = NULL; + + do { + struct sp_area *tmp; + tmp = rb_entry(n, struct sp_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size_align) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + + while (addr + size_align >= first->va_start && + addr + size_align <= vend) { + addr = first->va_end; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + } +found: + if (addr + size_align > vend) { + goto error; + } + + spa->va_start = addr; + spa->va_end = addr + size_align; + spa->real_size = size; + spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->spg = spg; + atomic_set(&spa->use_count, 1); + spa->type = type; + + if (spa_inc_usage(type, size)) + goto error; + + __insert_sp_area(spa); + if (spa->spg) { + atomic_inc(&spg->spa_num); + atomic_add(size, &spg->size); + atomic_inc(&spg_stat.spa_total_num); + atomic_add(size, &spg_stat.spa_total_size); + 
list_add_tail(&spa->link, &spg->spa_list); + } + spin_unlock(&sp_area_lock); + + return spa; + +error: + spin_unlock(&sp_area_lock); + kfree(spa); + return NULL; +} + +/* the caller should hold sp_area_lock */ +static struct sp_area *__find_sp_area_locked(unsigned long addr) +{ + struct rb_node *n = sp_area_root.rb_node; + + while (n) { + struct sp_area *spa; + + spa = rb_entry(n, struct sp_area, rb_node); + if (addr < spa->va_start) { + n = n->rb_left; + } else if (addr > spa->va_start) { + n = n->rb_right; + } else { + return spa; + } + } + + return NULL; +} + +static struct sp_area *__find_sp_area(unsigned long addr) +{ + struct sp_area *n; + spin_lock(&sp_area_lock); + n = __find_sp_area_locked(addr); + if (n) + atomic_inc(&n->use_count); + spin_unlock(&sp_area_lock); + return n; +} + +/* + * Free the VA region starting from addr to the share pool + */ +static void sp_free_area(struct sp_area *spa) +{ + lockdep_assert_held(&sp_area_lock); + + spa_dec_usage(spa->type, spa->real_size); /* won't fail */ + if (spa->spg) { + atomic_dec(&spa->spg->spa_num); + atomic_sub(spa->real_size, &spa->spg->size); + atomic_dec(&spg_stat.spa_total_num); + atomic_sub(spa->real_size, &spg_stat.spa_total_size); + list_del(&spa->link); + } + rb_erase(&spa->rb_node, &sp_area_root); + RB_CLEAR_NODE(&spa->rb_node); + kfree(spa); +} + +static void __sp_area_drop_locked(struct sp_area *spa) +{ + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma(). Before A calls this func, + * B calls sp_free() to free the same spa. So spa maybe NULL when A + * calls this func later. + */ + if (!spa) + return; + + if (atomic_dec_and_test(&spa->use_count)) + sp_free_area(spa); +} + +static void __sp_area_drop(struct sp_area *spa) +{ + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +void sp_area_drop(struct vm_area_struct *vma) +{ + struct sp_area *spa; + + if (!sp_check_vm_share_pool(vma->vm_flags)) + return; + + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma() -> ... -> sp_area_drop(). + * Concurrently, B is calling sp_free() to free the same spa. + * __find_sp_area_locked() and __sp_area_drop_locked() should be + * an atomic operation. + */ + spin_lock(&sp_area_lock); + spa = __find_sp_area_locked(vma->vm_start); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +/* The caller must hold sp_mutex. */ +static void sp_munmap(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + + err = do_munmap(mm, addr, size, NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when sp munmap\n", (void *)addr); + } + + up_write(&mm->mmap_sem); + mmput(mm); +} + +/* The caller must hold sp_mutex. */ +static void __sp_free(struct sp_group *spg, unsigned long addr, + unsigned long size, struct mm_struct *stop) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (mm == stop) + break; + sp_munmap(mm, addr, size); + } +} + +/* + * Free the memory allocated by sp_alloc() + * @addr - the starting VA of the memory + * + * Return fail if the memory can't be found or was not allocted by share pool. 
+ */ +int sp_free(unsigned long addr) +{ + struct sp_area *spa; + struct sp_proc_stat *stat; + int mode; + loff_t offset; + int ret = 0; + + mutex_lock(&sp_mutex); + + /* + * Access control: a share pool addr can only be freed by another task + * in the same spg or a kthread (such as buff_module_guard_work) + */ + spa = __find_sp_area(addr); + if (spa) { + if (current->mm != NULL) { + if (current->mm->sp_group != spa->spg) { + ret = -EPERM; + goto drop_spa; + } + } + } else { /* spa == NULL */ + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: sp_free invalid input addr %pK\n", (void *)addr); + goto out; + } + + if (!spg_valid(spa->spg)) + goto drop_spa; + + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); + + /* Free the memory of the backing shmem or hugetlbfs */ + mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + offset = addr - MMAP_SHARE_POOL_START; + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + pr_err("share pool: fallocate failed: %d\n", ret); + + /* pointer stat may be invalid because of kthread buff_module_guard_work */ + if (current->mm == NULL) { + kthread_stat.amount -= spa->real_size; + } else { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount -= spa->real_size; + else + BUG(); + } + +drop_spa: + __sp_area_drop(spa); +out: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_free); + +/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_sem). */ +static unsigned long __sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr = spa->va_start; + unsigned long size = spa_size(spa); + unsigned long prot = PROT_READ | PROT_WRITE; + unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_LOCKED | + MAP_POPULATE | MAP_SHARE_POOL; + unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; + unsigned long pgoff = (addr - MMAP_SHARE_POOL_START) >> PAGE_SHIFT; + + atomic_inc(&spa->use_count); + addr = __do_mmap(mm, file, addr, size, prot, flags, vm_flags, pgoff, + populate, NULL); + if (IS_ERR_VALUE(addr)) { + atomic_dec(&spa->use_count); + pr_err("share pool: do_mmap fails %ld\n", addr); + } + + return addr; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr; + + if (!mmget_not_zero(mm)) + return ERR_PTR(-ESPGMMEXIT); + down_write(&mm->mmap_sem); + addr = __sp_mmap(mm, file, spa, populate); + up_write(&mm->mmap_sem); + mmput(mm); + + if (IS_ERR_VALUE(addr)) + return ERR_PTR(addr); + + BUG_ON(addr != spa->va_start); + return (void *)addr; +} + +/** + * Allocate shared memory for all the processes in the same sp_group + * size - the size of memory to allocate + * sp_flags - how to allocate the memory + * spg_id - the share group that the memory is allocated to. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + struct sp_group *spg = NULL; + struct sp_area *spa = NULL; + struct sp_proc_stat *stat; + unsigned long sp_addr; + void *p_mmap, *p = ERR_PTR(-ENODEV); + struct mm_struct *mm; + struct file *file; + unsigned long size_aligned; + int ret = 0; + struct mm_struct *tmp; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid group id %d\n", spg_id); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE | SP_DVPP)) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid flag %lu\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & SP_HUGEPAGE_ONLY) + sp_flags |= SP_HUGEPAGE; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); + mutex_unlock(&sp_mutex); + if (!spg) { /* DVPP pass through scene: first call sp_alloc() */ + /* mdc scene hack */ + if (enable_mdc_default_group) + ret = sp_group_add_task(current->tgid, spg_id); + else + ret = sp_group_add_task(current->tgid, + SPG_ID_DVPP_PASS_THROUGH); + /* + * The multi-thread contention may cause repeated joins to the group. + * The judgment is added to prevent exit in this case. + */ + if (ret < 0 && (ret != -EEXIST)) { + pr_err("share pool: allocation failed due to add group error %d in DVPP pass through scenario", + ret); + p = ERR_PTR(ret); + goto out; + } + mutex_lock(&sp_mutex); + spg = current->mm->sp_group; + } else { /* other scenes */ + mutex_lock(&sp_mutex); + if (spg_id != SPG_ID_DEFAULT) { + /* the caller should be a member of the sp group */ + if (spg != idr_find(&sp_group_idr, spg_id)) + goto out; + } + } + + if (!spg_valid(spg)) { + pr_err("share pool: sp alloc failed, spg is invalid\n"); + goto out; + } + + if (!sysctl_share_pool_hugepage_enable) + sp_flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + if (sp_flags & SP_HUGEPAGE) { + file = spg->file_hugetlb; + size_aligned = ALIGN(size, PMD_SIZE); + } else { + file = spg->file; + size_aligned = ALIGN(size, PAGE_SIZE); + } +try_again: + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); + if (!spa) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to alloc spa failure\n"); + p = ERR_PTR(-ENOMEM); + goto out; + } + sp_addr = spa->va_start; + + /* create mapping for each process in the group */ + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + unsigned long populate = 0; + struct vm_area_struct *vma; + + p_mmap = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p_mmap) && (PTR_ERR(p_mmap) != -ESPGMMEXIT)) { + p = p_mmap; + __sp_free(spg, sp_addr, size_aligned, mm); + pr_err("share pool: allocation sp mmap failed, ret %ld\n", PTR_ERR(p_mmap)); + break; + } + + if (PTR_ERR(p_mmap) == -ESPGMMEXIT) { + pr_info("share pool: allocation sp mmap failed, ret -ESPGMMEXIT\n"); + continue; + } + + p = p_mmap; /* success */ + if (populate == 0) + continue; + + if (!mmget_not_zero(mm)) + continue; + down_write(&mm->mmap_sem); + vma = find_vma(mm, sp_addr); + if (unlikely(!vma)) { + pr_err("share pool: allocation failed due to find %pK vma failure\n", + (void *)sp_addr); + p = ERR_PTR(-EINVAL); + up_write(&mm->mmap_sem); + mmput(mm); + goto out; + } + /* clean PTE_RDONLY flags or trigger SMMU event */ + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + 
up_write(&mm->mmap_sem); + /* + * We are not ignoring errors, so if we fail to allocate + * physical memory we just return failure, so we won't encounter + * page fault later on, and more importantly sp_make_share_u2k() + * depends on this feature (and MAP_LOCKED) to work correctly. + */ + ret = do_mm_populate(mm, sp_addr, populate, 0); + if (ret) { + __sp_free(spg, sp_addr, size_aligned, + list_next_entry(mm, sp_node)); + + if (file == spg->file_hugetlb) { + spg->hugepage_failures++; + + /* fallback to small pages */ + if (!(sp_flags & SP_HUGEPAGE_ONLY)) { + file = spg->file; + spa->is_hugepage = false; + size_aligned = ALIGN(size, PAGE_SIZE); + __sp_area_drop(spa); + mmput(mm); + goto try_again; + } + } + + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to mm populate failed: %d\n", + ret); + p = ERR_PTR(ret); + mmput(mm); + break; + } + mmput(mm); + } + + if (!IS_ERR(p)) { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount += size_aligned; + } + +out: + mutex_unlock(&sp_mutex); + + /* this will free spa if mmap failed */ + if (spa) + __sp_area_drop(spa); + + return p; +} +EXPORT_SYMBOL_GPL(sp_alloc); + +static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn; + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + else + pfn = virt_to_pfn(kva); + + return pfn; +} + +/* + * return value: >0 means this is a hugepage addr + * =0 means a normal addr. <0 means an errno. + */ +static int is_vmap_hugepage(unsigned long addr) +{ + struct vm_struct *area; + + if (unlikely(!addr)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when judge vmap addr\n"); + return -EINVAL; + } + + area = find_vm_area((void *)addr); + if (unlikely(!area)) { + if (printk_ratelimit()) + pr_err("share pool: failed to find vm area(%lx)\n", addr); + return -EINVAL; + } + + if (area->flags & VM_HUGE_PAGES) + return 1; + else + return 0; +} + +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long ret_addr; + unsigned long populate = 0; + unsigned long addr, buf, offset; + struct file *file = NULL; + int ret = 0; + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + if (spa->is_hugepage) { + file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(file)) { + pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + } + + if (!mmget_not_zero(mm)) { + ret_addr = -ESPGMMEXIT; + goto put_file; + } + down_write(&mm->mmap_sem); + + ret_addr = __sp_mmap(mm, file, spa, &populate); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: k2u mmap failed %lx\n", ret_addr); + goto out; + } + BUG_ON(ret_addr != spa->va_start); + + vma = find_vma(mm, ret_addr); + BUG_ON(vma == NULL); + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + + if (is_vm_hugetlb_page(vma)) { + ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); + if (ret) { + pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); + ret_addr = ret; + goto out; + } + } else { + buf = ret_addr; + addr = kva; + offset = 0; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + ret_addr = ret; + goto out; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while 
(offset < spa_size(spa)); + } + +out: + up_write(&mm->mmap_sem); + mmput(mm); +put_file: + if (file) + fput(file); + + return ret_addr; +} + +static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, + int pid) +{ + struct task_struct *tsk; + unsigned long ret_addr; + void *p = ERR_PTR(-ENODEV); + int ret = 0; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + return ERR_PTR(ret); + + ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr); + sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); + p = ERR_PTR(ret_addr); + goto out; + } + + p = (void *)ret_addr; +out: + put_task_struct(tsk); + return p; +} + +static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, + struct sp_group *spg) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + unsigned long ret_addr = -ENODEV; + unsigned long uva = -ENODEV; + void *p = ERR_PTR(-ENODEV); + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + ret_addr = sp_remap_kva_to_vma(kva, spa, mm); + if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { + pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); + __sp_free(spg, spa->va_start, spa_size(spa), + list_next_entry(mm, sp_node)); + p = ERR_PTR(ret_addr); + goto out; + } + + if (ret_addr == -ESPGMMEXIT) { + pr_info("share pool: remap k2u, ret is -ESPGMMEXIT\n"); + continue; + } + + uva = ret_addr; + } + p = (void *)uva; +out: + return p; +} + +/** + * Share kernel memory to a specified process or sp_group + * @kva: the VA of shared kernel memory + * @size: the size of shared kernel memory + * @sp_flags: how to allocate the memory. We only support SP_DVPP. + * @pid: the pid of the specified process + * @spg_id: currently, only support default value(SPG_ID_DEFAULT) and other values + * are useless. + * + * Return: the shared target user address to start at + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + void *uva = ERR_PTR(-ENODEV); + struct sp_group *spg; + struct sp_area *spa; + unsigned long kva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + int ret; + + if (sp_flags & ~SP_DVPP) { + if (printk_ratelimit()) + pr_err("share pool: k2u sp_flags %lu error\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + sp_flags |= SP_HUGEPAGE; + page_size = PMD_SIZE; + } else if (ret == 0) { + /* do nothing */ + } else { + return ERR_PTR(ret); + } + /* aligned down kva is convenient for caller to start with any valid kva */ + kva_aligned = ALIGN_DOWN(kva, page_size); + size_aligned = ALIGN(kva + size, page_size) - kva_aligned; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, spg_id); + if (spg == NULL) { + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); + mutex_unlock(&sp_mutex); + } else if (spg_valid(spg)) { + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + mutex_unlock(&sp_mutex); + } else { + mutex_unlock(&sp_mutex); + pr_err("share pool: failed to make k2u\n"); + return NULL; + } + + if (!IS_ERR(uva)) + uva = uva + (kva - kva_aligned); + + __sp_area_drop(spa); + return uva; +} +EXPORT_SYMBOL_GPL(sp_make_share_k2u); + +static int sp_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page = pte_page(*pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(*pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +static int sp_test_walk(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* + * FIXME: The devmm driver uses remap_pfn_range() but actually there + * are associated struct pages, so they should use vm_map_pages() or + * similar APIs. Before the driver has been converted to correct APIs + * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as + * normal VMAs. 
+ */ + return 0; +} + +static int sp_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + if (printk_ratelimit()) + pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", (void *)start, (void *)end); + return -EFAULT; +} + +static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = huge_ptep_get(ptep); + struct page *page = pte_page(pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +/** + * the caller must hold mm->mmap_sem + * + * Notes for parameter alignment: + * When size == 0, let it be page_size, so that at least one page is walked. + * + * When size > 0, for convenience, usually the parameters of uva and + * size are not page aligned. There are four different alignment scenarios and + * we must handler all of them correctly. + * + * The basic idea is to align down uva and align up size so all the pages + * in range [uva, uva + size) are walked. However, there are special cases. + * + * Considering a 2M-hugepage addr scenario. Assuming the caller wants to + * traverse range [1001M, 1004.5M), so uva and size is 1001M and 3.5M + * accordingly. The aligned-down uva is 1000M and the aligned-up size is 4M. + * The traverse range will be [1000M, 1004M). Obviously, the final page for + * [1004M, 1004.5M) is not covered. + * + * To fix this problem, we need to walk an additional page, size should be + * ALIGN(uva+size) - uva_aligned + */ +static int __sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + struct vm_area_struct *vma; + unsigned long page_nr; + struct page **pages = NULL; + bool is_hugepage = false; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + struct mm_walk sp_walk = {}; + + /* + * Here we also support non share pool memory in this interface + * because the caller can't distinguish whether a uva is from the + * share pool or not. It is not the best idea to do so, but currently + * it simplifies overall design. + * + * In this situation, the correctness of the parameters is mainly + * guaranteed by the caller. 
+ */ + vma = find_vma(tsk->mm, uva); + if (!vma) { + if (printk_ratelimit()) + pr_err("share pool: u2k input uva %pK is invalid\n", (void *)uva); + return -EINVAL; + } + if ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma)) + is_hugepage = true; + + sp_walk.pte_hole = sp_pte_hole; + sp_walk.test_walk = sp_test_walk; + if (is_hugepage) { + sp_walk_data->is_hugepage = true; + sp_walk.hugetlb_entry = sp_hugetlb_entry; + page_size = PMD_SIZE; + } else { + sp_walk_data->is_hugepage = false; + sp_walk.pte_entry = sp_pte_entry; + } + + sp_walk_data->page_size = page_size; + uva_aligned = ALIGN_DOWN(uva, page_size); + sp_walk_data->uva_aligned = uva_aligned; + if (size == 0) + size_aligned = page_size; + else + /* special alignment handling */ + size_aligned = ALIGN(uva + size, page_size) - uva_aligned; + + if (uva_aligned + size_aligned < uva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in walk page range\n"); + return -EINVAL; + } + + page_nr = size_aligned / page_size; + pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + if (printk_ratelimit()) + pr_err("share pool: alloc page array failed in walk page range\n"); + return -ENOMEM; + } + sp_walk_data->pages = pages; + + sp_walk.mm = tsk->mm; + sp_walk.private = sp_walk_data; + + ret = walk_page_range(uva_aligned, uva_aligned + size_aligned, + &sp_walk); + if (ret) + kvfree(pages); + + return ret; +} + +/** + * Share user memory of a specified process to kernel + * @uva: the VA of shared user memory + * @size: the size of shared user memory + * @pid: the pid of the specified process + * + * Return: if success, return the starting kernel address of the shared memory. + * if failed, return the pointer of -errno. + */ +void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + int ret = 0; + struct task_struct *tsk; + void *p = ERR_PTR(-ENODEV); + struct sp_walk_data sp_walk_data = { + .page_count = 0, + }; + struct vm_struct *area; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) { + p = ERR_PTR(ret); + goto out; + } + + if (!mmget_not_zero(tsk->mm)) + goto out_put_task; + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, &sp_walk_data); + if (ret) { + pr_err("share pool: walk page range failed, ret %d\n", ret); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + p = ERR_PTR(ret); + goto out_put_task; + } + + if (sp_walk_data.is_hugepage) + p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count, + VM_MAP | VM_HUGE_PAGES, PAGE_KERNEL); + else + p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, + PAGE_KERNEL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + + if (!p) { + if (printk_ratelimit()) + pr_err("share pool: vmap(huge) in u2k failed\n"); + p = ERR_PTR(-ENOMEM); + goto out_free_pages; + } else { + p = p + (uva - sp_walk_data.uva_aligned); + } + + /* + * kva p may be used later in k2u. Since p comes from uva originally, + * it's reasonable to add flag VM_USERMAP so that p can be remapped + * into userspace again. 
+ */ + area = find_vm_area(p); + area->flags |= VM_USERMAP; + +out_free_pages: + kvfree(sp_walk_data.pages); +out_put_task: + put_task_struct(tsk); +out: + return p; +} +EXPORT_SYMBOL_GPL(sp_make_share_u2k); + +static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + struct task_struct *tsk; + struct sp_group *spg; + struct sp_area *spa; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size; + + mutex_lock(&sp_mutex); + /* + * at first we guess it's a hugepage addr + * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u + */ + spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE)); + if (!spa) { + spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); + if (!spa) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: invalid input uva %pK in unshare uva\n", (void *)uva); + goto out_unlock; + } + } + + /* + * 1. overflow actually won't happen due to an spa must be valid. + * 2. we must unshare [spa->va_start, spa->va_start + spa->real_size) completely + * because an spa is one-to-one correspondence with an vma. + * Thus input paramter size is not necessarily needed. + */ + page_size = (spa->is_hugepage ? PMD_SIZE : PAGE_SIZE); + uva_aligned = spa->va_start; + size_aligned = spa->real_size; + + if (size_aligned < ALIGN(size, page_size)) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", size); + goto out_drop_area; + } + + if (spg_id == SPG_ID_NONE) { + if (spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, SPG_ID_NONE is invalid\n"); + goto out_drop_area; + } + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_drop_area; + + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + goto out_drop_area; + } + down_write(&tsk->mm->mmap_sem); + ret = do_munmap(tsk->mm, uva_aligned, size_aligned, NULL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + if (ret) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap in unshare uva\n", + (void *)uva_aligned); + } + put_task_struct(tsk); + } else { + /* + * k2u to task, then unshare_uva(..., spg_id) is invalid due to potential + * spa memory leak. 
+ */ + if (!spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, sp group id %d is invalid\n", spg_id); + goto out_drop_area; + } + + spg = __sp_find_spg(pid, spg_id); + if (spg_valid(spg)) { + __sp_free(spg, uva_aligned, size_aligned, NULL); + } else { + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, doesn't belong to group %d\n", + spg_id); + ret = -EINVAL; + goto out_drop_area; + } else { + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + } + } + } + +out_drop_area: + __sp_area_drop(spa); +out_unlock: + mutex_unlock(&sp_mutex); + return ret; +} + +static int sp_unshare_kva(unsigned long kva, unsigned long size) +{ + unsigned long addr, kva_aligned; + struct page *page; + unsigned long size_aligned; + unsigned long step; + bool is_hugepage = true; + int ret; + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + kva_aligned = ALIGN_DOWN(kva, PMD_SIZE); + size_aligned = ALIGN(kva + size, PMD_SIZE) - kva_aligned; + step = PMD_SIZE; + } else if (ret == 0) { + kva_aligned = ALIGN_DOWN(kva, PAGE_SIZE); + size_aligned = ALIGN(kva + size, PAGE_SIZE) - kva_aligned; + step = PAGE_SIZE; + is_hugepage = false; + } else { + pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); + return -EINVAL; + } + + if (kva_aligned + size_aligned < kva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in unshare kva\n"); + return -EINVAL; + } + + for (addr = kva_aligned; addr < (kva_aligned + size_aligned); addr += step) { + if (is_hugepage) + page = vmalloc_to_hugepage((void *)addr); + else + page = vmalloc_to_page((void *)addr); + if (page) + put_page(page); + else + pr_err("share pool: vmalloc to hugepage failed\n"); + } + + vunmap((void *)kva_aligned); + + return 0; +} + +/** + * Unshare the kernel or user memory which shared by calling sp_make_share_{k2u,u2k}(). + * @va: the specified virtual address of memory + * @size: the size of unshared memory + * @pid: the pid of the specified process if the VA is user address + * @spg_id: the ID of the specified sp_group if the VA is user address + * + * Return -errno if fail. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + */ +int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + + if (va < TASK_SIZE) { + /* user address */ + ret = sp_unshare_uva(va, size, pid, spg_id); + } else if (va >= VA_START) { + /* kernel address */ + ret = sp_unshare_kva(va, size); + } else { + /* regard user and kernel address ranges as bad address */ + if (printk_ratelimit()) + pr_err("share pool: unshare addr %pK is not a user or kernel addr", (void *)va); + ret = -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sp_unshare); + +/** + * Return 0 when success. 
+ * When return value < 0, information in sp_walk_data is useless + */ +int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + + if (unlikely(!sp_walk_data)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when walk page range\n"); + return -EINVAL; + } + if (!tsk || (tsk->flags & PF_EXITING)) + return -ESRCH; + + sp_walk_data->page_count = 0; + + get_task_struct(tsk); + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + return -EINVAL; + } + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, sp_walk_data); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + put_task_struct(tsk); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_walk_page_range); + +void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + struct page *page; + unsigned int i = 0; + + if (!sp_walk_data) + return; + + while (i < sp_walk_data->page_count) { + page = sp_walk_data->pages[i++]; + put_page(page); + } + + kvfree(sp_walk_data->pages); +} +EXPORT_SYMBOL_GPL(sp_walk_page_free); + +/** + * Walk the mm_struct of processes in the specified sp_group + * and call CALLBACK once for each mm_struct. + * @spg_id: the ID of the specified sp_group + * @data: the param for callback function + * @func: caller specific callback function + * + * Return -errno if fail. + */ +int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)) +{ + struct sp_group *spg; + int ret = -ESRCH; + + if (!func) { + if (printk_ratelimit()) + pr_err("share pool: null func pointer\n"); + return -EINVAL; + } + + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (spg_valid(spg)) { + struct mm_struct *mm; + struct mm_struct *tmp; + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (func) { + ret = func(mm, data); + if (ret) + goto out_unlock; + } + } + } +out_unlock: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_group_walk); + +int sp_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_register_notifier); + +int sp_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_unregister_notifier); + +/** + * user can config the share pool start addrese of each Da-vinci device + * @start: the value of share pool start + * @size: the value of share pool + * @device_id: the num of Da-vinci device + * @pid: the pid of device process + * + * Return false if parameter invalid of has been set up. + */ +bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + struct sp_group *spg; + + if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || + size > MMAP_SHARE_POOL_16G_SIZE) + return false; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg_valid(spg) || spg->dvpp_multi_spaces == true) { + mutex_unlock(&sp_mutex); + return false; + } + spg->dvpp_va_start = start; + spg->dvpp_size = size; + spg->dvpp_multi_spaces = true; + host_svm_sp_enable = true; + mutex_unlock(&sp_mutex); + + return true; +} +EXPORT_SYMBOL_GPL(sp_config_dvpp_range); + +/* Check whether the address belongs to the share pool. 
*/ +bool is_sharepool_addr(unsigned long addr) +{ + if (host_svm_sp_enable == false) + return (addr >= MMAP_SHARE_POOL_START) && + addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; +} +EXPORT_SYMBOL_GPL(is_sharepool_addr); + +static int __init mdc_default_group(char *s) +{ + enable_mdc_default_group = 1; + return 1; +} +__setup("enable_mdc_default_group", mdc_default_group); + +int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct sp_group *spg = NULL; + struct sp_proc_stat *stat; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) { + /* print the file header */ + stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); + if (!stat) { + mutex_unlock(&sp_mutex); + return 0; + } + seq_printf(m, "%-10s %-18s %-15s\n", + "Group ID", "Aligned Apply(KB)", "HugePage Fails"); + seq_printf(m, "%-10d %-18ld %-15d\n", + spg->id, byte2kb(stat->amount), spg->hugepage_failures); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int idr_proc_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg; + struct sp_proc_stat *stat = p; + struct seq_file *seq = data; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (spg) { + seq_printf(seq, "%-12d %-10d %-18ld\n", + id, spg->id, byte2kb(stat->amount)); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int proc_stat_show(struct seq_file *seq, void *offset) +{ + /* print the file header */ + seq_printf(seq, "%-12s %-10s %-18s\n", + "Process ID", "Group ID", "Aligned Apply(KB)"); + /* print kthread buff_module_guard_work */ + seq_printf(seq, "%-12s %-10s %-18ld\n", + "guard", "-", byte2kb(kthread_stat.amount)); + idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + return 0; +} + +static void rb_spa_stat_show(struct seq_file *seq) +{ + struct rb_node *node; + struct sp_area *spa; + + spin_lock(&sp_area_lock); + + for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + spa = rb_entry(node, struct sp_area, rb_node); + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + mutex_lock(&sp_mutex); + if (spg_valid(spa->spg)) + seq_printf(seq, "%-10d ", spa->spg->id); + else /* k2u for task or spg is dead */ + seq_printf(seq, "%-10s ", "None"); + mutex_unlock(&sp_mutex); + + seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", + "0x", spa->va_start, + "0x", spa->va_end, + byte2kb(spa->real_size)); + + switch (spa->type) { + case SPA_TYPE_ALLOC: + seq_printf(seq, "%-7s ", "ALLOC"); + break; + case SPA_TYPE_K2TASK: + seq_printf(seq, "%-7s ", "TASK"); + break; + case SPA_TYPE_K2SPG: + seq_printf(seq, "%-7s ", "SPG"); + break; + default: + /* usually impossible, perhaps a developer's mistake */ + break; + } + + if (spa->is_hugepage) + seq_printf(seq, "%-5s ", "Y"); + else + seq_printf(seq, "%-5s ", "N"); + + seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count)); + + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + } + + spin_unlock(&sp_area_lock); +} + +static void spa_overview_show(struct seq_file *seq) +{ + unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; + unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; + + spin_lock(&sp_area_lock); + total_num = spa_stat.total_num; + alloc_num = spa_stat.alloc_num; + k2u_task_num = spa_stat.k2u_task_num; + k2u_spg_num = spa_stat.k2u_spg_num; + total_size = spa_stat.total_size; + alloc_size = spa_stat.alloc_size; + k2u_task_size = 
spa_stat.k2u_task_size; + k2u_spg_size = spa_stat.k2u_spg_size; + spin_unlock(&sp_area_lock); + + seq_printf(seq, "Spa total num %u.\n", total_num); + seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); + seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "\n"); +} + +/* the caller must hold sp_mutex */ +static int idr_spg_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + + seq_printf(seq, "Group %-10d size: %13d KB, spa num: %d.\n", + id, byte2kb(atomic_read(&spg->size)), + atomic_read(&spg->spa_num)); + + return 0; +} + +static void spg_overview_show(struct seq_file *seq) +{ + mutex_lock(&sp_mutex); + idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); + mutex_unlock(&sp_mutex); + seq_printf(seq, "Share pool total size: %13d KB, spa total num: %d.\n\n", + byte2kb(atomic_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); +} + +static int spa_stat_show(struct seq_file *seq, void *offset) +{ + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-10s %-16s %-16s %-13s %-7s %-5s %-10s\n", + "Group ID", "va_start", "va_end", "Aligned KB", "Type", "Huge", "Ref"); + rb_spa_stat_show(seq); + return 0; +} + +/* + * Called by proc_root_init() to initialize the /proc/sharepool subtree + */ +void __init proc_sharepool_init(void) +{ + if (!proc_mkdir("sharepool", NULL)) + return; + + proc_create_single_data("sharepool/proc_stat", 0, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL); +} + + +struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node) +{ + if (area->flags & VM_HUGE_PAGES) + return hugetlb_alloc_hugepage(NUMA_NO_NODE, HUGETLB_ALLOC_NONE); + else + return alloc_pages_node(node, mask, page_order); +} + +int enable_ascend_share_pool; + +static int __init enable_share_pool(char *s) +{ + enable_ascend_share_pool = 1; + + pr_info("Ascend enable share pool features\n"); + + return 1; +} +__setup("enable_ascend_share_pool", enable_share_pool);
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The share pool is widely used by several accelerators, and user problems are difficult to debug, so a debug mode is added to help analyse them. This mode is enabled by the sysctl_sp_debug_mode flag.
Some functions have been refactored to protect critical sections correctly and to output messages more clearly.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wu Peng wupeng58@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 15 +- kernel/sysctl.c | 9 + mm/share_pool.c | 537 +++++++++++++++++++++---------------- 3 files changed, 326 insertions(+), 235 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 09afbae33d418..2557ef1381221 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -5,6 +5,7 @@ #include <linux/mm_types.h> #include <linux/notifier.h> #include <linux/vmalloc.h> +#include <linux/printk.h>
#define SP_HUGEPAGE (1 << 0) #define SP_HUGEPAGE_ONLY (1 << 1) @@ -35,6 +36,8 @@ extern int sysctl_share_pool_hugepage_enable;
extern int sysctl_ac_mode;
+extern int sysctl_sp_debug_mode; + extern int enable_ascend_share_pool;
/* Processes in the same sp_group can share memory. @@ -70,7 +73,7 @@ struct sp_group { /* number of sp_area */ atomic_t spa_num; /* total size of all sp_area from sp_alloc and k2u(spg) */ - atomic_t size; + atomic64_t size; /* record the number of hugepage allocation failures */ int hugepage_failures; /* is_alive == false means it's being destroyed */ @@ -211,6 +214,12 @@ static inline bool sp_mmap_check(unsigned long flags) return false; }
+static inline void sp_dump_stack(void) +{ + if (sysctl_sp_debug_mode) + dump_stack(); +} + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -349,6 +358,10 @@ static inline bool sp_mmap_check(unsigned long flags) { return false; } + +static inline void sp_dump_stack(void) +{ +} #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3417b45058483..b88e12d942166 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1782,6 +1782,15 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "sharepool_debug_mode", + .data = &sysctl_sp_debug_mode, + .maxlen = sizeof(sysctl_sp_debug_mode), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 0d29c85beb4d4..4deb05441349a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -57,6 +57,8 @@ static const int mdc_default_group_id = 1;
/* access control mode */ int sysctl_ac_mode = AC_NONE; +/* debug mode */ +int sysctl_sp_debug_mode;
/* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); @@ -85,9 +87,11 @@ struct sp_proc_stat { /* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0};
-/* The caller must hold sp_mutex. */ -static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) -{ +/* + * The caller must hold sp_mutex and ensure no concurrency problem + * for task_struct and mm_struct. + */ +static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { struct sp_proc_stat *stat; int id = tsk->mm->sp_stat_id; int tgid = tsk->tgid; @@ -138,7 +142,7 @@ static struct sp_spa_stat spa_stat = {0}; /* statistics of all sp group born from sp_alloc and k2u(spg) */ struct sp_spg_stat { atomic_t spa_total_num; - atomic_t spa_total_size; + atomic64_t spa_total_size; };
static struct sp_spg_stat spg_stat = {0}; @@ -166,10 +170,11 @@ struct sp_area { struct list_head link; /* link to the spg->head */ struct sp_group *spg; enum spa_type type; /* where spa born from */ + struct mm_struct *mm; /* owner of k2u(task) */ }; static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; -bool host_svm_sp_enable = false; +static bool host_svm_sp_enable = false;
int sysctl_share_pool_hugepage_enable = 1;
@@ -241,7 +246,7 @@ static int spa_dec_usage(enum spa_type type, unsigned long size) return 0; }
-static void *sp_mmap(struct mm_struct *mm, struct file *file, +static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate);
static void free_sp_group(struct sp_group *spg) @@ -274,7 +279,18 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) if (ret) return NULL;
- spg = tsk->mm->sp_group; + /* + * Once we encounter a concurrency problem here. + * To fix it, we believe get_task_mm() and mmput() is too + * heavy because we just get the pointer of sp_group. + */ + task_lock(tsk); + if (tsk->mm == NULL) + spg = NULL; + else + spg = tsk->mm->sp_group; + task_unlock(tsk); + put_task_struct(tsk); } else { spg = idr_find(&sp_group_idr, spg_id); @@ -318,7 +334,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) } spg->id = spg_id; atomic_set(&spg->spa_num, 0); - atomic_set(&spg->size, 0); + atomic64_set(&spg->size, 0); spg->is_alive = true; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; @@ -377,9 +393,6 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) struct sp_area *spa, *prev = NULL; int err;
- if (!mmget_not_zero(mm)) - return; - down_write(&mm->mmap_sem); spin_lock(&sp_area_lock);
list_for_each_entry(spa, &mm->sp_group->spa_list, link) { @@ -406,8 +419,17 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) __sp_area_drop_locked(prev);
spin_unlock(&sp_area_lock); - up_write(&mm->mmap_sem); - mmput(mm); +} + +/* The caller must hold sp_mutex. */ +static void __sp_group_drop_locked(struct sp_group *spg) +{ + bool is_alive = spg->is_alive; + + if (atomic_dec_and_test(&spg->use_count)) { + BUG_ON(is_alive); + free_sp_group(spg); + } }
/** @@ -446,8 +468,9 @@ int sp_group_add_task(int pid, int spg_id) spg = idr_find(&sp_group_idr, spg_id); if (!spg_valid(spg)) { mutex_unlock(&sp_mutex); - pr_err("share pool: task add group failed because group id %d hasn't been create or dead\n", - spg_id); + if (printk_ratelimit()) + pr_err("share pool: task add group failed because group id %d " + "hasn't been create or dead\n", spg_id); return -EINVAL; } mutex_unlock(&sp_mutex); @@ -457,7 +480,9 @@ int sp_group_add_task(int pid, int spg_id) spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX, GFP_ATOMIC); if (spg_id < 0) { - pr_err("share pool: task add group failed when automatically generate group id failed\n"); + if (printk_ratelimit()) + pr_err("share pool: task add group failed when automatically " + "generate group id failed\n"); return spg_id; } } @@ -467,8 +492,9 @@ int sp_group_add_task(int pid, int spg_id) SPG_ID_DVPP_PASS_THROUGH_MIN, SPG_ID_DVPP_PASS_THROUGH_MAX, GFP_ATOMIC); if (spg_id < 0) { - pr_err("share pool: task add group failed when automatically generate group id failed" - "in DVPP pass through\n"); + if (printk_ratelimit()) + pr_err("share pool: task add group failed when automatically " + "generate group id failed in DVPP pass through\n"); return spg_id; } } @@ -494,25 +520,31 @@ int sp_group_add_task(int pid, int spg_id) ret = PTR_ERR(spg); goto out_put_task; } + atomic_inc(&spg->use_count); + /* access control permission check */ if (sysctl_ac_mode == AC_SINGLE_OWNER) { if (spg->owner != current->group_leader) { ret = -EPERM; - goto out_put_task; + goto out_drop_group; } }
+ mm = get_task_mm(tsk); + if (!mm) { + ret = -ESRCH; + goto out_drop_group; + } + /* per process statistics initialization */ stat = sp_init_proc_stat(tsk); if (IS_ERR(stat)) { ret = PTR_ERR(stat); pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out_put_task; + goto out_put_mm; }
- mm = tsk->mm; mm->sp_group = spg; - atomic_inc(&spg->use_count); list_add_tail(&tsk->mm->sp_node, &spg->procs); /* * create mappings of existing shared memory segments into this @@ -523,7 +555,7 @@ int sp_group_add_task(int pid, int spg_id) list_for_each_entry(spa, &spg->spa_list, link) { unsigned long populate = 0; struct file *file = spa_file(spa); - void *p; + unsigned long addr;
if (prev) __sp_area_drop_locked(prev); @@ -532,28 +564,24 @@ int sp_group_add_task(int pid, int spg_id) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
- p = sp_mmap(mm, file, spa, &populate); - if (IS_ERR(p) && (PTR_ERR(p) != -ESPGMMEXIT)) { + down_write(&mm->mmap_sem); + addr = sp_mmap(mm, file, spa, &populate); + if (IS_ERR_VALUE(addr)) { sp_munmap_task_areas(mm, &spa->link); - ret = PTR_ERR(p); + up_write(&mm->mmap_sem); + ret = addr; pr_err("share pool: task add group sp mmap failed, ret %d\n", ret); spin_lock(&sp_area_lock); break; } - - if (PTR_ERR(p) == -ESPGMMEXIT) { - pr_err("share pool: task add group sp mmap failed, ret -ESPGMEXIT\n"); - spin_lock(&sp_area_lock); - ret = -ESPGMMEXIT; - break; - } + up_write(&mm->mmap_sem);
if (populate) { ret = do_mm_populate(mm, spa->va_start, populate, 0); if (ret) { if (printk_ratelimit()) - pr_err("share pool: task add group failed when mm populate failed: %d\n", - ret); + pr_warn("share pool: task add group failed when mm populate " + "failed (potential no enough memory): %d\n", ret); sp_munmap_task_areas(mm, spa->link.next); } } @@ -567,8 +595,16 @@ int sp_group_add_task(int pid, int spg_id) if (unlikely(ret)) { idr_remove(&sp_stat_idr, mm->sp_stat_id); kfree(stat); + mm->sp_stat_id = 0; + list_del(&mm->sp_node); + mm->sp_group = NULL; }
+out_put_mm: + mmput(mm); +out_drop_group: + if (unlikely(ret)) + __sp_group_drop_locked(spg); out_put_task: put_task_struct(tsk); out_unlock: @@ -609,9 +645,6 @@ void sp_group_exit(struct mm_struct *mm) bool is_alive = true; bool unlock;
- if (!enable_ascend_share_pool) - return; - /* * Nothing to do if this thread group doesn't belong to any sp_group. * No need to protect this check with lock because we can add a task @@ -638,18 +671,13 @@ void sp_group_exit(struct mm_struct *mm)
void sp_group_post_exit(struct mm_struct *mm) { - bool is_alive; struct sp_proc_stat *stat; bool unlock;
- if (!enable_ascend_share_pool) - return; - if (!mm->sp_group) return;
spg_exit_lock(&unlock); - is_alive = mm->sp_group->is_alive;
/* pointer stat must be valid, we don't need to check sanity */ stat = idr_find(&sp_stat_idr, mm->sp_stat_id); @@ -673,10 +701,7 @@ void sp_group_post_exit(struct mm_struct *mm)
idr_remove(&sp_stat_idr, mm->sp_stat_id);
- if (atomic_dec_and_test(&mm->sp_group->use_count)) { - BUG_ON(is_alive); - free_sp_group(mm->sp_group); - } + __sp_group_drop_locked(mm->sp_group); spg_exit_unlock(unlock);
kfree(stat); @@ -716,7 +741,7 @@ static void __insert_sp_area(struct sp_area *spa) static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, struct sp_group *spg, enum spa_type type) { - struct sp_area *spa; + struct sp_area *spa, *err; struct rb_node *n; unsigned long vstart = MMAP_SHARE_POOL_START; unsigned long vend = MMAP_SHARE_POOL_16G_START; @@ -728,6 +753,11 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, vstart = MMAP_SHARE_POOL_16G_START; vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; } else { + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: don't allow k2u(task) in host svm multiprocess scene\n"); + return ERR_PTR(-EINVAL); + } vstart = spg->dvpp_va_start; vend = spg->dvpp_va_start + spg->dvpp_size; } @@ -735,14 +765,11 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
addr = vstart;
- if (!sysctl_share_pool_hugepage_enable) - flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); - spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); if (unlikely(!spa)) { if (printk_ratelimit()) pr_err("share pool: alloc spa failed due to lack of memory\n"); - return NULL; + return ERR_PTR(-ENOMEM); }
spin_lock(&sp_area_lock); @@ -788,6 +815,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, } found: if (addr + size_align > vend) { + err = ERR_PTR(-EOVERFLOW); goto error; }
@@ -799,15 +827,17 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, atomic_set(&spa->use_count, 1); spa->type = type;
- if (spa_inc_usage(type, size)) + if (spa_inc_usage(type, size)) { + err = ERR_PTR(-EINVAL); goto error; + }
__insert_sp_area(spa); if (spa->spg) { atomic_inc(&spg->spa_num); - atomic_add(size, &spg->size); + atomic64_add(size, &spg->size); atomic_inc(&spg_stat.spa_total_num); - atomic_add(size, &spg_stat.spa_total_size); + atomic64_add(size, &spg_stat.spa_total_size); list_add_tail(&spa->link, &spg->spa_list); } spin_unlock(&sp_area_lock); @@ -817,7 +847,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, error: spin_unlock(&sp_area_lock); kfree(spa); - return NULL; + return err; }
/* the caller should hold sp_area_lock */ @@ -862,9 +892,9 @@ static void sp_free_area(struct sp_area *spa) spa_dec_usage(spa->type, spa->real_size); /* won't fail */ if (spa->spg) { atomic_dec(&spa->spg->spa_num); - atomic_sub(spa->real_size, &spa->spg->size); + atomic64_sub(spa->real_size, &spa->spg->size); atomic_dec(&spg_stat.spa_total_num); - atomic_sub(spa->real_size, &spg_stat.spa_total_size); + atomic64_sub(spa->real_size, &spg_stat.spa_total_size); list_del(&spa->link); } rb_erase(&spa->rb_node, &sp_area_root); @@ -898,7 +928,7 @@ void sp_area_drop(struct vm_area_struct *vma) { struct sp_area *spa;
- if (!sp_check_vm_share_pool(vma->vm_flags)) + if (!(vma->vm_flags & VM_SHARE_POOL)) return;
/* @@ -979,13 +1009,25 @@ int sp_free(unsigned long addr) } else { /* spa == NULL */ ret = -EINVAL; if (printk_ratelimit()) - pr_err("share pool: sp_free invalid input addr %pK\n", (void *)addr); + pr_err("share pool: sp free invalid input addr %pK\n", (void *)addr); goto out; }
+ if (spa->type != SPA_TYPE_ALLOC) { + if (printk_ratelimit()) + pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n", + (void *)addr); + } + if (!spg_valid(spa->spg)) goto drop_spa;
+ pr_notice("share pool: [sp free] caller %s(%d/%d); " + "group id %d addr 0x%pK, size %ld\n", + current->comm, current->tgid, current->pid, spa->spg->id, + (void *)spa->va_start, spa->real_size); + sp_dump_stack(); + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
/* Free the memory of the backing shmem or hugetlbfs */ @@ -993,7 +1035,7 @@ int sp_free(unsigned long addr) offset = addr - MMAP_SHARE_POOL_START; ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); if (ret) - pr_err("share pool: fallocate failed: %d\n", ret); + pr_err("share pool: sp free fallocate failed: %d\n", ret);
/* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { @@ -1016,7 +1058,7 @@ int sp_free(unsigned long addr) EXPORT_SYMBOL_GPL(sp_free);
/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_sem). */ -static unsigned long __sp_mmap(struct mm_struct *mm, struct file *file, +static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate) { unsigned long addr = spa->va_start; @@ -1033,30 +1075,13 @@ static unsigned long __sp_mmap(struct mm_struct *mm, struct file *file, if (IS_ERR_VALUE(addr)) { atomic_dec(&spa->use_count); pr_err("share pool: do_mmap fails %ld\n", addr); + } else { + BUG_ON(addr != spa->va_start); }
return addr; }
-static void *sp_mmap(struct mm_struct *mm, struct file *file, - struct sp_area *spa, unsigned long *populate) -{ - unsigned long addr; - - if (!mmget_not_zero(mm)) - return ERR_PTR(-ESPGMMEXIT); - down_write(&mm->mmap_sem); - addr = __sp_mmap(mm, file, spa, populate); - up_write(&mm->mmap_sem); - mmput(mm); - - if (IS_ERR_VALUE(addr)) - return ERR_PTR(addr); - - BUG_ON(addr != spa->va_start); - return (void *)addr; -} - /** * Allocate shared memory for all the processes in the same sp_group * size - the size of memory to allocate @@ -1071,12 +1096,14 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) struct sp_area *spa = NULL; struct sp_proc_stat *stat; unsigned long sp_addr; - void *p_mmap, *p = ERR_PTR(-ENODEV); + unsigned long mmap_addr; + void *p = ERR_PTR(-ENODEV); struct mm_struct *mm; struct file *file; unsigned long size_aligned; int ret = 0; struct mm_struct *tmp; + unsigned long mode, offset;
/* mdc scene hack */ if (enable_mdc_default_group) @@ -1133,9 +1160,6 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) goto out; }
- if (!sysctl_share_pool_hugepage_enable) - sp_flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); - if (sp_flags & SP_HUGEPAGE) { file = spg->file_hugetlb; size_aligned = ALIGN(size, PMD_SIZE); @@ -1145,10 +1169,12 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) } try_again: spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); - if (!spa) { + if (IS_ERR(spa)) { if (printk_ratelimit()) - pr_err("share pool: allocation failed due to alloc spa failure\n"); - p = ERR_PTR(-ENOMEM); + pr_err("share pool: allocation failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + p = spa; goto out; } sp_addr = spa->va_start; @@ -1158,33 +1184,34 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) unsigned long populate = 0; struct vm_area_struct *vma;
- p_mmap = sp_mmap(mm, file, spa, &populate); - if (IS_ERR(p_mmap) && (PTR_ERR(p_mmap) != -ESPGMMEXIT)) { - p = p_mmap; + if (!mmget_not_zero(mm)) + continue; + + down_write(&mm->mmap_sem); + mmap_addr = sp_mmap(mm, file, spa, &populate); + if (IS_ERR_VALUE(mmap_addr)) { + up_write(&mm->mmap_sem); + p = (void *)mmap_addr; __sp_free(spg, sp_addr, size_aligned, mm); - pr_err("share pool: allocation sp mmap failed, ret %ld\n", PTR_ERR(p_mmap)); - break; + mmput(mm); + pr_err("share pool: allocation sp mmap failed, ret %ld\n", mmap_addr); + goto out; }
- if (PTR_ERR(p_mmap) == -ESPGMMEXIT) { - pr_info("share pool: allocation sp mmap failed, ret -ESPGMMEXIT\n"); + p =(void *)mmap_addr; /* success */ + if (populate == 0) { + up_write(&mm->mmap_sem); + mmput(mm); continue; }
- p = p_mmap; /* success */ - if (populate == 0) - continue; - - if (!mmget_not_zero(mm)) - continue; - down_write(&mm->mmap_sem); vma = find_vma(mm, sp_addr); if (unlikely(!vma)) { + up_write(&mm->mmap_sem); + mmput(mm); pr_err("share pool: allocation failed due to find %pK vma failure\n", (void *)sp_addr); p = ERR_PTR(-EINVAL); - up_write(&mm->mmap_sem); - mmput(mm); goto out; } /* clean PTE_RDONLY flags or trigger SMMU event */ @@ -1216,9 +1243,17 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) }
if (printk_ratelimit()) - pr_err("share pool: allocation failed due to mm populate failed: %d\n", - ret); + pr_warn("share pool: allocation failed due to mm populate failed" + "(potential no enough memory when -12): %d\n", ret); p = ERR_PTR(ret); + __sp_area_drop(spa); + + mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + offset = sp_addr - MMAP_SHARE_POOL_START; + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + pr_err("share pool: fallocate failed %d\n", ret); + mmput(mm); break; } @@ -1235,24 +1270,20 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) mutex_unlock(&sp_mutex);
/* this will free spa if mmap failed */ - if (spa) + if (spa && !IS_ERR(spa)) __sp_area_drop(spa);
+ if (!IS_ERR(p)) { + pr_notice("share pool: [sp alloc] caller %s(%d/%d); group id %d; " + "return addr 0x%pK, size %ld\n", + current->comm, current->tgid, current->pid, spa->spg->id, + (void *)spa->va_start, spa->real_size); + sp_dump_stack(); + } return p; } EXPORT_SYMBOL_GPL(sp_alloc);
-static unsigned long __sp_remap_get_pfn(unsigned long kva) -{ - unsigned long pfn; - if (is_vmalloc_addr((void *)kva)) - pfn = vmalloc_to_pfn((void *)kva); - else - pfn = virt_to_pfn(kva); - - return pfn; -} - /* * return value: >0 means this is a hugepage addr * =0 means a normal addr. <0 means an errno. @@ -1286,7 +1317,6 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, struct vm_area_struct *vma; unsigned long ret_addr; unsigned long populate = 0; - unsigned long addr, buf, offset; struct file *file = NULL; int ret = 0; struct user_struct *user = NULL; @@ -1307,7 +1337,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, } down_write(&mm->mmap_sem);
- ret_addr = __sp_mmap(mm, file, spa, &populate); + ret_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: k2u mmap failed %lx\n", ret_addr); goto out; @@ -1326,20 +1356,12 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, goto out; } } else { - buf = ret_addr; - addr = kva; - offset = 0; - do { - ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, - __pgprot(vma->vm_page_prot.pgprot)); - if (ret) { - ret_addr = ret; - goto out; - } - offset += PAGE_SIZE; - buf += PAGE_SIZE; - addr += PAGE_SIZE; - } while (offset < spa_size(spa)); + ret = remap_vmalloc_range(vma, (void *)kva, 0); + if (ret) { + pr_err("share pool: remap vmalloc failed, ret %d\n", ret); + ret_addr = ret; + goto out; + } }
out: @@ -1380,6 +1402,13 @@ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, }
p = (void *)ret_addr; + + task_lock(tsk); + if (tsk->mm == NULL) + p = ERR_PTR(-ESRCH); + else + spa->mm = tsk->mm; + task_unlock(tsk); out: put_task_struct(tsk); return p; @@ -1438,6 +1467,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long kva_aligned; unsigned long size_aligned; unsigned int page_size = PAGE_SIZE; + enum spa_type type; int ret;
if (sp_flags & ~SP_DVPP) { @@ -1453,6 +1483,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else if (ret == 0) { /* do nothing */ } else { + pr_err("it is not vmalloc address\n"); return ERR_PTR(ret); } /* aligned down kva is convenient for caller to start with any valid kva */ @@ -1460,24 +1491,42 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, size_aligned = ALIGN(kva + size, page_size) - kva_aligned;
mutex_lock(&sp_mutex); - spg = __sp_find_spg(pid, spg_id); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { - spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); - if (!spa) { + type = SPA_TYPE_K2TASK; + if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) - pr_err("share pool: k2u failed due to alloc spa failure\n"); - return ERR_PTR(-ENOMEM); + pr_err("share pool: k2task invalid spg id %d\n", spg_id); + return ERR_PTR(-EINVAL); + } + spa = sp_alloc_area(size_aligned, sp_flags, NULL, type); + if (IS_ERR(spa)) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u(task) failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + return spa; } uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); mutex_unlock(&sp_mutex); } else if (spg_valid(spg)) { - spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); - if (!spa) { + type = SPA_TYPE_K2SPG; + if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) - pr_err("share pool: k2u failed due to alloc spa failure\n"); - return ERR_PTR(-ENOMEM); + pr_err("share pool: k2spg invalid spg id %d\n", spg_id); + return ERR_PTR(-EINVAL); + } + spa = sp_alloc_area(size_aligned, sp_flags, spg, type); + if (IS_ERR(spa)) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u(spg) failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", + PTR_ERR(spa)); + return spa; }
uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); @@ -1492,6 +1541,17 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = uva + (kva - kva_aligned);
__sp_area_drop(spa); + + if (!IS_ERR(uva)) { + if (spg_valid(spa->spg)) + spg_id = spa->spg->id; + pr_notice("share pool: [sp k2u type %d] caller %s(%d/%d); group id %d; " + "return addr 0x%pK size %ld\n", + type, current->comm, current->tgid, current->pid, spg_id, + (void *)spa->va_start, spa->real_size); + sp_dump_stack(); + } + return uva; } EXPORT_SYMBOL_GPL(sp_make_share_k2u); @@ -1531,7 +1591,8 @@ static int sp_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { if (printk_ratelimit()) - pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", (void *)start, (void *)end); + pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", + (void *)start, (void *)end); return -EFAULT; }
@@ -1545,7 +1606,8 @@ static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask,
if (unlikely(!pte_present(pte))) { if (printk_ratelimit()) - pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + pr_err("share pool: the page of addr %pK unexpectedly " + "not in RAM\n", (void *)addr); return -EFAULT; }
@@ -1758,6 +1820,11 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp } }
+ if (spa->type != SPA_TYPE_K2TASK && spa->type != SPA_TYPE_K2SPG) { + pr_err("share pool: this spa should not be unshare here\n"); + ret = -EINVAL; + goto out_drop_area; + } /* * 1. overflow actually won't happen due to an spa must be valid. * 2. we must unshare [spa->va_start, spa->va_start + spa->real_size) completely @@ -1771,32 +1838,57 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (size_aligned < ALIGN(size, page_size)) { ret = -EINVAL; if (printk_ratelimit()) - pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", size); + pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", + size); goto out_drop_area; }
- if (spg_id == SPG_ID_NONE) { - if (spa->spg) { - ret = -EINVAL; + if (spa->type == SPA_TYPE_K2TASK) { + if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { if (printk_ratelimit()) - pr_err("share pool: unshare uva failed, SPG_ID_NONE is invalid\n"); + pr_err("share pool: unshare uva(to task) failed, " + "invalid spg id %d\n", spg_id); + ret = -EINVAL; goto out_drop_area; }
rcu_read_lock(); tsk = find_task_by_vpid(pid); - if (!tsk || (tsk->flags & PF_EXITING)) - ret = -ESRCH; - else - get_task_struct(tsk); - + if (!tsk || !tsk->mm || (tsk->flags & PF_EXITING)) { + if (printk_ratelimit()) + pr_info("share pool: no need to unshare uva(to task), " + "target process not found or do_exit\n"); + ret = -EINVAL; + rcu_read_unlock(); + sp_dump_stack(); + goto out_drop_area; + } + get_task_struct(tsk); rcu_read_unlock(); - if (ret) + + if (!spa->mm || + (current->mm && (current->mm != tsk->mm || tsk->mm != spa->mm))) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva(to task) failed, " + "wrong pid or invalid spa\n"); + ret = -EINVAL; goto out_drop_area; + } + + if (spa->mm != tsk->mm) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva(to task) failed, " + "spa not belong to the task\n"); + ret = -EINVAL; + goto out_drop_area; + }
if (!mmget_not_zero(tsk->mm)) { put_task_struct(tsk); - pr_info("share pool: no need to unshare uva, target process is exiting\n"); + if (printk_ratelimit()) + pr_info("share pool: no need to unshare uva(to task), " + "target process mm is not existing\n"); + sp_dump_stack(); goto out_drop_area; } down_write(&tsk->mm->mmap_sem); @@ -1809,32 +1901,51 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp (void *)uva_aligned); } put_task_struct(tsk); - } else { - /* - * k2u to task, then unshare_uva(..., spg_id) is invalid due to potential - * spa memory leak. - */ - if (!spa->spg) { + } else if (spa->type == SPA_TYPE_K2SPG) { + if (!spa->spg || spg_id == SPG_ID_NONE) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva(to group) failed, " + "invalid spg id %d\n", spg_id); ret = -EINVAL; + goto out_drop_area; + } + + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg_valid(spg)) { if (printk_ratelimit()) - pr_err("share pool: unshare uva failed, sp group id %d is invalid\n", spg_id); + pr_err("share pool: unshare uva(to group) invalid pid, " + "process not in sp group or group is dead\n"); + ret = -EINVAL; goto out_drop_area; }
- spg = __sp_find_spg(pid, spg_id); - if (spg_valid(spg)) { - __sp_free(spg, uva_aligned, size_aligned, NULL); - } else { - if (!spg) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva failed, doesn't belong to group %d\n", - spg_id); - ret = -EINVAL; - goto out_drop_area; - } else { - pr_info("share pool: no need to unshare uva, target process is exiting\n"); - } + if (spa->spg != spg) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva(to group) failed, " + "spa not belong to the group\n"); + ret = -EINVAL; + goto out_drop_area; } + + if (current->mm && current->mm->sp_group != spg) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva(to group) failed, " + "caller process doesn't belong to target group\n"); + ret = -EINVAL; + goto out_drop_area; + } + + __sp_free(spg, uva_aligned, size_aligned, NULL); + } + + if (!ret) { + if (spg_valid(spa->spg)) + spg_id = spa->spg->id; + pr_notice("share pool: [sp unshare uva type %d] caller %s(%d/%d); " + "group id %d addr 0x%pK size %ld\n", + spa->type, current->comm, current->tgid, current->pid, + spg_id, (void *)spa->va_start, spa->real_size); + sp_dump_stack(); }
out_drop_area: @@ -1864,7 +1975,8 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) step = PAGE_SIZE; is_hugepage = false; } else { - pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); + if (printk_ratelimit()) + pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); return -EINVAL; }
@@ -1882,7 +1994,8 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) if (page) put_page(page); else - pr_err("share pool: vmalloc to hugepage failed\n"); + pr_err("share pool: vmalloc %pK to page/hugepage failed\n", + (void *)addr); }
vunmap((void *)kva_aligned); @@ -1944,7 +2057,7 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, get_task_struct(tsk); if (!mmget_not_zero(tsk->mm)) { put_task_struct(tsk); - return -EINVAL; + return -ESRCH; } down_write(&tsk->mm->mmap_sem); ret = __sp_walk_page_range(uva, size, tsk, sp_walk_data); @@ -1973,46 +2086,6 @@ void sp_walk_page_free(struct sp_walk_data *sp_walk_data) } EXPORT_SYMBOL_GPL(sp_walk_page_free);
-/** - * Walk the mm_struct of processes in the specified sp_group - * and call CALLBACK once for each mm_struct. - * @spg_id: the ID of the specified sp_group - * @data: the param for callback function - * @func: caller specific callback function - * - * Return -errno if fail. - */ -int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)) -{ - struct sp_group *spg; - int ret = -ESRCH; - - if (!func) { - if (printk_ratelimit()) - pr_err("share pool: null func pointer\n"); - return -EINVAL; - } - - mutex_lock(&sp_mutex); - spg = idr_find(&sp_group_idr, spg_id); - if (spg_valid(spg)) { - struct mm_struct *mm; - struct mm_struct *tmp; - list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { - if (func) { - ret = func(mm, data); - if (ret) - goto out_unlock; - } - } - } -out_unlock: - mutex_unlock(&sp_mutex); - - return ret; -} -EXPORT_SYMBOL_GPL(sp_group_walk); - int sp_register_notifier(struct notifier_block *nb) { return blocking_notifier_chain_register(&sp_notifier_chain, nb); @@ -2039,7 +2112,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) struct sp_group *spg;
if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || - size > MMAP_SHARE_POOL_16G_SIZE) + size> MMAP_SHARE_POOL_16G_SIZE) return false;
mutex_lock(&sp_mutex); @@ -2061,11 +2134,9 @@ EXPORT_SYMBOL_GPL(sp_config_dvpp_range); /* Check whether the address belongs to the share pool. */ bool is_sharepool_addr(unsigned long addr) { - if (host_svm_sp_enable == false) - return (addr >= MMAP_SHARE_POOL_START) && - addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); - - return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; + if (host_svm_sp_enable == false) + return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; } EXPORT_SYMBOL_GPL(is_sharepool_addr);
@@ -2109,7 +2180,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data)
mutex_lock(&sp_mutex); spg = __sp_find_spg(id, SPG_ID_DEFAULT); - if (spg) { + if (spg_valid(spg)) { seq_printf(seq, "%-12d %-10d %-18ld\n", id, spg->id, byte2kb(stat->amount)); } @@ -2130,8 +2201,7 @@ static int proc_stat_show(struct seq_file *seq, void *offset) return 0; }
-static void rb_spa_stat_show(struct seq_file *seq) -{ +static void rb_spa_stat_show(struct seq_file *seq) { struct rb_node *node; struct sp_area *spa;
@@ -2215,8 +2285,8 @@ static int idr_spg_stat_cb(int id, void *p, void *data) struct sp_group *spg = p; struct seq_file *seq = data;
- seq_printf(seq, "Group %-10d size: %13d KB, spa num: %d.\n", - id, byte2kb(atomic_read(&spg->size)), + seq_printf(seq, "Group %-10d size: %13ld KB, spa num: %d.\n", + id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num));
return 0; @@ -2227,8 +2297,8 @@ static void spg_overview_show(struct seq_file *seq) mutex_lock(&sp_mutex); idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); mutex_unlock(&sp_mutex); - seq_printf(seq, "Share pool total size: %13d KB, spa total num: %d.\n\n", - byte2kb(atomic_read(&spg_stat.spa_total_size)), + seq_printf(seq, "Share pool total size: %13ld KB, spa total num: %d.\n\n", + byte2kb(atomic64_read(&spg_stat.spa_total_size)), atomic_read(&spg_stat.spa_total_num)); }
@@ -2255,7 +2325,6 @@ void __init proc_sharepool_init(void) proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL); }
- struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Provide a free area cache for the share pool VA allocator, based on the algorithm used by linux mainline commit 89699605fe7c.
This reduces the number of rbtree operations and linear traversals over the share pool extents in order to find a free area, by starting off at the last point that a free area was found.
The free area cache is reset if areas are freed behind it, or if we are searching for a different area (such as the DVPP 16G area) than last time, so allocation patterns are not changed.
After this patch, the search starts from where it left off, bringing the cost closer to amortized O(1).
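For readability, the core of the cached lookup that the (flattened) diff below adds to sp_alloc_area() can be condensed into a single helper. sp_search_start() is a hypothetical name used only for this sketch; the patch itself keeps the logic inline and also repeats the overflow checks omitted here:

/*
 * Decide where the linear search for a free VA range should begin.
 * The cache is only reused when the request is no more permissive than
 * the previous one: same region (vstart) and no hole below the cached
 * node that is already known to be big enough for size_align.
 */
static unsigned long sp_search_start(unsigned long size_align,
				     unsigned long vstart)
{
	struct sp_area *cache;

	if (!free_sp_area_cache || size_align < cached_hole_size ||
	    vstart != cached_vstart) {
		/* cache unusable: forget it and restart from vstart */
		cached_hole_size = 0;
		free_sp_area_cache = NULL;
	}

	/* remember which region the next allocations are aimed at */
	cached_vstart = vstart;

	if (!free_sp_area_cache)
		return vstart;	/* fall back to the full rbtree descent */

	cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node);
	return cache->va_end;	/* resume just above the cached sp_area */
}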
Test environment: Huawei 1951 DC (8 CPU cores) with 21G memory, no load.
Test method: A single-thread process first calls sp_alloc() to allocate a specified number of 2M hugepages, then we measure the time taken by one more sp_alloc() of a 2M hugepage. The results are in microseconds.
test 1, first sp_alloc() 256 2M hugepages, total 512M
test 2, first sp_alloc() 512 2M hugepages, total 1G
test 3, first sp_alloc() 1024 2M hugepages, total 2G
test 4, first sp_alloc() 1536 2M hugepages, total 3G
test 5, first sp_alloc() 2048 2M hugepages, total 4G
test 6, first sp_alloc() 4096 2M hugepages, total 8G
test 7, first sp_alloc() 6072 2M hugepages, total 12G
test 8, first sp_alloc() 8192 2M hugepages, total 16G
        test1           test2           test3           test4
  231   238       240   252       279   253       315   268
  242   238       247   253       282   255       326   265
  233   234       250   243       272   251       314   258
  239   224       245   246       273   261       324   262
  234   233       252   257       277   262       326   265
  225   231       243   243       279   249       325   264
  236   261       246   248       265   262       323   266
  233   238       247   246       281   259       331   265
  239   222       243   241       270   248       325   263
  241   231       239   241       335   246       321   268
avg:  235.3 235   245.2 247       281.3 254.6     323   264.4
res:  -           -               9.49%           18.14%

        test5           test6           test7           test8
  371   280       720   458       1001  629       1547  909
  369   283       691   465       1005  718       1533  903
  374   279       954   470       1003  680       1371  908
  363   279       697   457       1004  923       1375  930
  369   286       711   464       1016  683       1395  1083
  382   280       967   491       1029  695       1413  1096
  378   284       688   823       1008  689       1419  905
  376   360       921   469       1285  696       1554  1085
  374   287       896   485       1030  682       1381  902
  380   276       706   545       1286  717       1606  1097
avg:  373.6 289.4 791.5 512.7     1066.7 717.5    1459.4 981.8
res:  22.54%      35.52%          32.74%          32.73%
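In each test the left column appears to be the timing without this patch and the right column with it; the 'res' row is the relative reduction of the two averages, e.g. test 3: (281.3 - 254.6) / 281.3 ≈ 9.49%, and test 8: (1459.4 - 981.8) / 1459.4 ≈ 32.73%.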
Suggested-by: Zefan Li lizefan@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 104 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 29 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 4deb05441349a..6738daa4b2f4c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -730,6 +730,11 @@ static void __insert_sp_area(struct sp_area *spa) rb_insert_color(&spa->rb_node, &sp_area_root); }
+/* The sp_area cache globals are protected by sp_area_lock */ +static struct rb_node *free_sp_area_cache; +static unsigned long cached_hole_size; +static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_range() */ + /* * Allocate a region of VA from the share pool. * @size - the size of VA to allocate @@ -741,7 +746,7 @@ static void __insert_sp_area(struct sp_area *spa) static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, struct sp_group *spg, enum spa_type type) { - struct sp_area *spa, *err; + struct sp_area *spa, *first, *err; struct rb_node *n; unsigned long vstart = MMAP_SHARE_POOL_START; unsigned long vend = MMAP_SHARE_POOL_16G_START; @@ -763,8 +768,6 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, } }
- addr = vstart; - spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); if (unlikely(!spa)) { if (printk_ratelimit()) @@ -774,45 +777,75 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
spin_lock(&sp_area_lock);
- n = sp_area_root.rb_node; - if (n) { - struct sp_area *first = NULL; + /* + * Invalidate cache if we have more permissive parameters. + * cached_hole_size notes the largest hole noticed _below_ + * the sp_area cached in free_sp_area_cache: if size fits + * into that hole, we want to scan from vstart to reuse + * the hole instead of allocating above free_sp_area_cache. + * Note that sp_free_area may update free_sp_area_cache + * without updating cached_hole_size. + */ + if (!free_sp_area_cache || size_align < cached_hole_size || + vstart != cached_vstart) { + cached_hole_size = 0; + free_sp_area_cache = NULL; + } + + /* record if we encounter less permissive parameters */ + cached_vstart = vstart; + + /* find starting point for our search */ + if (free_sp_area_cache) { + first = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + } else { + addr = vstart; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; + } + + n = sp_area_root.rb_node; + first = NULL;
- do { + while (n) { struct sp_area *tmp; tmp = rb_entry(n, struct sp_area, rb_node); if (tmp->va_end >= addr) { - if (!first && tmp->va_start < addr + size_align) - first = tmp; - n = n->rb_left; - } else { first = tmp; + if (tmp->va_start <= addr) + break; + n = n->rb_left; + } else n = n->rb_right; - } - } while (n); + }
if (!first) goto found; + }
- if (first->va_end < addr) { - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct sp_area, rb_node); - else - goto found; + /* from the starting point, traverse areas until a suitable hole is found */ + while (addr + size_align > first->va_start && addr + size_align <= vend) { + if (addr + cached_hole_size < first->va_start) + cached_hole_size = first->va_start - addr; + addr = first->va_end; + if (addr + size_align < addr) { + err = ERR_PTR(-EOVERFLOW); + goto error; }
- while (addr + size_align >= first->va_start && - addr + size_align <= vend) { - addr = first->va_end; - - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct sp_area, rb_node); - else - goto found; - } + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; } + found: if (addr + size_align > vend) { err = ERR_PTR(-EOVERFLOW); @@ -833,6 +866,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, }
__insert_sp_area(spa); + free_sp_area_cache = &spa->rb_node; if (spa->spg) { atomic_inc(&spg->spa_num); atomic64_add(size, &spg->size); @@ -889,6 +923,18 @@ static void sp_free_area(struct sp_area *spa) { lockdep_assert_held(&sp_area_lock);
+ if (free_sp_area_cache) { + struct sp_area *cache; + cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node); + if (spa->va_start <= cache->va_start) { + free_sp_area_cache = rb_prev(&spa->rb_node); + /* + * We don't try to update cached_hole_size, + * but it won't go very wrong. + */ + } + } + spa_dec_usage(spa->type, spa->real_size); /* won't fail */ if (spa->spg) { atomic_dec(&spa->spg->spa_num);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
DVPP process channel destroy procedure: do_exit() -> exit_mm() (mm no longer in spg) -> exit_task_work() -> task_work_run() -> __fput() -> ... -> vdec_close() -> sp_unshare(uva, ..., SPG_ID_DEFAULT).
So when sp_unshare() is called, current->mm is NULL, just like in the buff_module_guard_work kthread.
To handle this, we skip the corresponding sanity checks.
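The relaxed check can be condensed into a small predicate; sp_unshare_allowed() is a hypothetical helper name used only for illustration, the hunks below keep the test inline:

/*
 * A caller without an mm (a kthread such as the guard worker, or a task
 * whose exit_mm() has already run during do_exit()) is always allowed to
 * unshare; only a live caller whose mm differs from the mapping owner is
 * rejected.
 */
static bool sp_unshare_allowed(struct mm_struct *owner_mm)
{
	return !current->mm || current->mm == owner_mm;
}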
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 94 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 33 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6738daa4b2f4c..4b9d1e28c9227 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -530,7 +530,8 @@ int sp_group_add_task(int pid, int spg_id) } }
- mm = get_task_mm(tsk); + /* current thread may be exiting in a multithread process */ + mm = get_task_mm(tsk->group_leader); if (!mm) { ret = -ESRCH; goto out_drop_group; @@ -583,6 +584,8 @@ int sp_group_add_task(int pid, int spg_id) pr_warn("share pool: task add group failed when mm populate " "failed (potential no enough memory): %d\n", ret); sp_munmap_task_areas(mm, spa->link.next); + spin_lock(&sp_area_lock); + break; } }
@@ -859,6 +862,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->spg = spg; atomic_set(&spa->use_count, 1); spa->type = type; + spa->mm = NULL;
if (spa_inc_usage(type, size)) { err = ERR_PTR(-EINVAL); @@ -1292,7 +1296,6 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) pr_warn("share pool: allocation failed due to mm populate failed" "(potential no enough memory when -12): %d\n", ret); p = ERR_PTR(ret); - __sp_area_drop(spa);
mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; offset = sp_addr - MMAP_SHARE_POOL_START; @@ -1840,11 +1843,29 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) } EXPORT_SYMBOL_GPL(sp_make_share_u2k);
+/* + * Input parameters uva and spg_id are now useless. spg_id will be useful when + * supporting a process in multiple sp groups. + * Always use process pid. Using thread pid is hard to check sanity. + * + * Procedure of unshare uva must be compatible with: + * + * 1. DVPP channel destroy procedure: + * do_exit() -> exit_mm() (mm no longer in spg and current->mm == NULL) -> + * exit_task_work() -> task_work_run() -> __fput() -> ... -> vdec_close() -> + * sp_unshare(uva, SPG_ID_DEFAULT) + * + * 2. Process A once was the target of k2u(to group), then it exits. + * Guard worker kthread tries to free this uva and it must succeed, otherwise + * spa of this uva leaks. + * + * This also means we must trust DVPP channel destroy and guard worker code. + */ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) { int ret = 0; struct task_struct *tsk; - struct sp_group *spg; + struct mm_struct *mm; struct sp_area *spa; unsigned long uva_aligned; unsigned long size_aligned; @@ -1861,7 +1882,8 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (!spa) { ret = -EINVAL; if (printk_ratelimit()) - pr_err("share pool: invalid input uva %pK in unshare uva\n", (void *)uva); + pr_err("share pool: invalid input uva %pK in unshare uva\n", + (void *)uva); goto out_unlock; } } @@ -1904,7 +1926,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to task), " "target process not found or do_exit\n"); - ret = -EINVAL; rcu_read_unlock(); sp_dump_stack(); goto out_drop_area; @@ -1912,35 +1933,51 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp get_task_struct(tsk); rcu_read_unlock();
- if (!spa->mm || - (current->mm && (current->mm != tsk->mm || tsk->mm != spa->mm))) { + if (!spa->mm) { if (printk_ratelimit()) pr_err("share pool: unshare uva(to task) failed, " - "wrong pid or invalid spa\n"); + "none spa owner\n"); ret = -EINVAL; + put_task_struct(tsk); goto out_drop_area; }
- if (spa->mm != tsk->mm) { + /* current thread may be exiting in a multithread process */ + mm = get_task_mm(tsk->group_leader); + if (!mm) { + if (printk_ratelimit()) + pr_info("share pool: no need to unshare uva(to task), " + "target process mm is exiting\n"); + put_task_struct(tsk); + goto out_drop_area; + } + + if (spa->mm != mm) { if (printk_ratelimit()) pr_err("share pool: unshare uva(to task) failed, " "spa not belong to the task\n"); ret = -EINVAL; + mmput(mm); + put_task_struct(tsk); goto out_drop_area; }
- if (!mmget_not_zero(tsk->mm)) { - put_task_struct(tsk); + /* alway allow kthread and dvpp channel destroy procedure */ + if (current->mm && current->mm != mm) { if (printk_ratelimit()) - pr_info("share pool: no need to unshare uva(to task), " - "target process mm is not existing\n"); - sp_dump_stack(); + pr_err("share pool: unshare uva(to task failed, caller " + "process %d not match target process %d\n)", + current->pid, pid); + ret = -EINVAL; + mmput(mm); + put_task_struct(tsk); goto out_drop_area; } - down_write(&tsk->mm->mmap_sem); + + down_write(&mm->mmap_sem); ret = do_munmap(tsk->mm, uva_aligned, size_aligned, NULL); - up_write(&tsk->mm->mmap_sem); - mmput(tsk->mm); + up_write(&mm->mmap_sem); + mmput(mm); if (ret) { /* we are not supposed to fail */ pr_err("share pool: failed to unmap VA %pK when munmap in unshare uva\n", @@ -1948,7 +1985,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp } put_task_struct(tsk); } else if (spa->type == SPA_TYPE_K2SPG) { - if (!spa->spg || spg_id == SPG_ID_NONE) { + if (spg_id < 0) { if (printk_ratelimit()) pr_err("share pool: unshare uva(to group) failed, " "invalid spg id %d\n", spg_id); @@ -1956,24 +1993,15 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
- spg = __sp_find_spg(pid, SPG_ID_DEFAULT); - if (!spg_valid(spg)) { + if (!spg_valid(spa->spg)) { if (printk_ratelimit()) - pr_err("share pool: unshare uva(to group) invalid pid, " - "process not in sp group or group is dead\n"); - ret = -EINVAL; - goto out_drop_area; - } - - if (spa->spg != spg) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to group) failed, " - "spa not belong to the group\n"); - ret = -EINVAL; + pr_info("share pool: no need to unshare uva(to group), " + "spa doesn't belong to a sp group or group is dead\n"); goto out_drop_area; }
- if (current->mm && current->mm->sp_group != spg) { + /* alway allow kthread and dvpp channel destroy procedure */ + if (current->mm && current->mm->sp_group != spa->spg) { if (printk_ratelimit()) pr_err("share pool: unshare uva(to group) failed, " "caller process doesn't belong to target group\n"); @@ -1981,7 +2009,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
- __sp_free(spg, uva_aligned, size_aligned, NULL); + __sp_free(spa->spg, uva_aligned, size_aligned, NULL); }
if (!ret) {
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
1. sp_group_add_task: when spg is invalid, the low 32 bits of the pointer spg are returned and may be mistaken for a valid spg id.
2. sp_alloc: the error branch jumps to the out label, where sp_mutex is unlocked even though it was never locked.
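For issue 1, the corrected pattern (as it appears in the hunk below, comments added here) separates a real error pointer from a structurally valid but dead group, so pointer bits are never returned as an spg id or errno:

	spg = find_or_alloc_sp_group(spg_id);
	if (IS_ERR(spg)) {
		ret = PTR_ERR(spg);	/* genuine -errno encoded in the pointer */
		goto out_put_task;
	}
	if (!spg_valid(spg)) {
		ret = -ENODEV;		/* valid pointer, but the group is dead */
		goto out_put_task;
	}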
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 4b9d1e28c9227..10967dcb7df9e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -516,10 +516,15 @@ int sp_group_add_task(int pid, int spg_id) goto out_unlock;
spg = find_or_alloc_sp_group(spg_id); - if (IS_ERR(spg) || !spg_valid(spg)) { + if (IS_ERR(spg)) { ret = PTR_ERR(spg); goto out_put_task; } + + if (!spg_valid(spg)) { + ret = -ENODEV; + goto out_put_task; + } atomic_inc(&spg->use_count);
/* access control permission check */ @@ -1191,8 +1196,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (ret < 0 && (ret != -EEXIST)) { pr_err("share pool: allocation failed due to add group error %d in DVPP pass through scenario", ret); - p = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } mutex_lock(&sp_mutex); spg = current->mm->sp_group;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
In fact, the input parameters *size*, *pid* and *spg_id* of sp_unshare_uva() can now be treated as unused. They are reserved to support multiple sp groups in the future.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 47 +++++++++++------------------------------------ 1 file changed, 11 insertions(+), 36 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 10967dcb7df9e..44e8f3c4aa967 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1848,9 +1848,8 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) EXPORT_SYMBOL_GPL(sp_make_share_u2k);
/* - * Input parameters uva and spg_id are now useless. spg_id will be useful when - * supporting a process in multiple sp groups. - * Always use process pid. Using thread pid is hard to check sanity. + * Input parameters uva, pid and spg_id are now useless. spg_id will be useful + * when supporting a process in multiple sp groups. * * Procedure of unshare uva must be compatible with: * @@ -1868,7 +1867,6 @@ EXPORT_SYMBOL_GPL(sp_make_share_u2k); static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) { int ret = 0; - struct task_struct *tsk; struct mm_struct *mm; struct sp_area *spa; unsigned long uva_aligned; @@ -1924,35 +1922,26 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
- rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk || !tsk->mm || (tsk->flags & PF_EXITING)) { - if (printk_ratelimit()) - pr_info("share pool: no need to unshare uva(to task), " - "target process not found or do_exit\n"); - rcu_read_unlock(); - sp_dump_stack(); - goto out_drop_area; - } - get_task_struct(tsk); - rcu_read_unlock(); - if (!spa->mm) { if (printk_ratelimit()) pr_err("share pool: unshare uva(to task) failed, " "none spa owner\n"); ret = -EINVAL; - put_task_struct(tsk); goto out_drop_area; }
- /* current thread may be exiting in a multithread process */ - mm = get_task_mm(tsk->group_leader); + /* + * current thread may be exiting in a multithread process + * + * 1. never need a kthread to make unshare when process has exited + * 2. in dvpp channel destroy procedure, exit_mm() has been called + * and don't need to make unshare + */ + mm = get_task_mm(current->group_leader); if (!mm) { if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to task), " "target process mm is exiting\n"); - put_task_struct(tsk); goto out_drop_area; }
@@ -1962,24 +1951,11 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp "spa not belong to the task\n"); ret = -EINVAL; mmput(mm); - put_task_struct(tsk); - goto out_drop_area; - } - - /* alway allow kthread and dvpp channel destroy procedure */ - if (current->mm && current->mm != mm) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to task failed, caller " - "process %d not match target process %d\n)", - current->pid, pid); - ret = -EINVAL; - mmput(mm); - put_task_struct(tsk); goto out_drop_area; }
down_write(&mm->mmap_sem); - ret = do_munmap(tsk->mm, uva_aligned, size_aligned, NULL); + ret = do_munmap(mm, uva_aligned, size_aligned, NULL); up_write(&mm->mmap_sem); mmput(mm); if (ret) { @@ -1987,7 +1963,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp pr_err("share pool: failed to unmap VA %pK when munmap in unshare uva\n", (void *)uva_aligned); } - put_task_struct(tsk); } else if (spa->type == SPA_TYPE_K2SPG) { if (spg_id < 0) { if (printk_ratelimit())
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- We have two independent memory regions: the normal 8T region and the 16G DVPP region. When an spa is freed, the new cache node found by rb_prev() may be located in the other region.
When this happens, cached_vstart should be updated accordingly, otherwise the next newly allocated spa may end up in the wrong region. A sketch of the fix follows.
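For illustration only: the essence of the fix in sp_free_area(), assuming the region_vstart field added by this patch. Whenever rb_prev() moves the cache node, cached_vstart is refreshed from that node so the next allocation searches the region the cache actually points into.

    if (free_sp_area_cache && spa->va_start <= cache->va_start) {
            free_sp_area_cache = rb_prev(&spa->rb_node);
            if (free_sp_area_cache) {
                    /*
                     * rb_prev() may have crossed from the 16G DVPP region
                     * into the normal 8T region (or back); keep the cached
                     * start address consistent with the new cache node.
                     */
                    cache = rb_entry(free_sp_area_cache,
                                     struct sp_area, rb_node);
                    cached_vstart = cache->region_vstart;
            }
    }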
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 44e8f3c4aa967..bf572383b901d 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -164,6 +164,7 @@ struct sp_area { unsigned long va_start; unsigned long va_end; /* va_end always align to hugepage */ unsigned long real_size; /* real size with alignment */ + unsigned long region_vstart; /* belong to normal region or DVPP region */ bool is_hugepage; atomic_t use_count; /* How many vmas use this VA region */ struct rb_node rb_node; /* address sorted rbtree */ @@ -863,6 +864,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->va_start = addr; spa->va_end = addr + size_align; spa->real_size = size; + spa->region_vstart = vstart; spa->is_hugepage = (flags & SP_HUGEPAGE); spa->spg = spg; atomic_set(&spa->use_count, 1); @@ -937,6 +939,15 @@ static void sp_free_area(struct sp_area *spa) cache = rb_entry(free_sp_area_cache, struct sp_area, rb_node); if (spa->va_start <= cache->va_start) { free_sp_area_cache = rb_prev(&spa->rb_node); + /* + * the new cache node may be changed to another region, + * i.e. from DVPP region to normal region + */ + if (free_sp_area_cache) { + cache = rb_entry(free_sp_area_cache, + struct sp_area, rb_node); + cached_vstart = cache->region_vstart; + } /* * We don't try to update cached_hole_size, * but it won't go very wrong.
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- The spa overview in /proc/sharepool/spa_stat now reports two new fields. Spa dvpp size: physical memory usage in the DVPP region. Spa dvpp va size: virtual memory usage in the DVPP region.
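For illustration only, with made-up values: after this patch the spa overview section of /proc/sharepool/spa_stat carries two extra lines, printed by the seq_printf() calls in the diff below.

    Spa dvpp size:            40960 KB
    Spa dvpp va size:            40 MB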
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 52 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 37 insertions(+), 15 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index bf572383b901d..a51179630e23b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -50,6 +50,7 @@ #define ESPGMMEXIT 4000
#define byte2kb(size) ((size) / 1024) +#define byte2mb(size) ((size) / 1024 / 1024)
/* mdc scene hack */ int enable_mdc_default_group; @@ -135,6 +136,8 @@ struct sp_spa_stat { unsigned long alloc_size; unsigned long k2u_task_size; unsigned long k2u_spg_size; + unsigned long dvpp_size; + unsigned long dvpp_va_size; };
static struct sp_spa_stat spa_stat = {0}; @@ -165,6 +168,7 @@ struct sp_area { unsigned long va_end; /* va_end always align to hugepage */ unsigned long real_size; /* real size with alignment */ unsigned long region_vstart; /* belong to normal region or DVPP region */ + unsigned long flags; bool is_hugepage; atomic_t use_count; /* How many vmas use this VA region */ struct rb_node rb_node; /* address sorted rbtree */ @@ -193,14 +197,8 @@ static struct file *spa_file(struct sp_area *spa) }
/* the caller should hold sp_area_lock */ -static int spa_inc_usage(enum spa_type type, unsigned long size) +static int spa_inc_usage(enum spa_type type, unsigned long size, bool is_dvpp) { - /* - * all the calculations won't overflow due to system limitation and - * parameter checking in sp_alloc_area() - */ - spa_stat.total_num += 1; - spa_stat.total_size += size; switch (type) { case SPA_TYPE_ALLOC: spa_stat.alloc_num += 1; @@ -218,11 +216,23 @@ static int spa_inc_usage(enum spa_type type, unsigned long size) /* usually impossible, perhaps a developer's mistake */ return -EINVAL; } + + if (is_dvpp) { + spa_stat.dvpp_size += size; + spa_stat.dvpp_va_size += PMD_ALIGN(size); + } + + /* + * all the calculations won't overflow due to system limitation and + * parameter checking in sp_alloc_area() + */ + spa_stat.total_num += 1; + spa_stat.total_size += size; return 0; }
/* the caller should hold sp_area_lock */ -static int spa_dec_usage(enum spa_type type, unsigned long size) +static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp) { switch (type) { case SPA_TYPE_ALLOC: @@ -239,9 +249,14 @@ static int spa_dec_usage(enum spa_type type, unsigned long size) break; default: /* usually impossible, perhaps a developer's mistake */ - spin_unlock(&sp_area_lock); return -EINVAL; } + + if (is_dvpp) { + spa_stat.dvpp_size -= size; + spa_stat.dvpp_va_size -= PMD_ALIGN(size); + } + spa_stat.total_num -= 1; spa_stat.total_size -= size; return 0; @@ -760,7 +775,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, unsigned long vstart = MMAP_SHARE_POOL_START; unsigned long vend = MMAP_SHARE_POOL_16G_START; unsigned long addr; - unsigned long size_align = ALIGN(size, 1 << 21); /* align to 2M */ + unsigned long size_align = PMD_ALIGN(size); /* va aligned to 2M */
if ((flags & SP_DVPP)) { if (host_svm_sp_enable == false) { @@ -865,13 +880,14 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->va_end = addr + size_align; spa->real_size = size; spa->region_vstart = vstart; + spa->flags = flags; spa->is_hugepage = (flags & SP_HUGEPAGE); spa->spg = spg; atomic_set(&spa->use_count, 1); spa->type = type; spa->mm = NULL;
- if (spa_inc_usage(type, size)) { + if (spa_inc_usage(type, size, (flags & SP_DVPP))) { err = ERR_PTR(-EINVAL); goto error; } @@ -955,7 +971,7 @@ static void sp_free_area(struct sp_area *spa) } }
- spa_dec_usage(spa->type, spa->real_size); /* won't fail */ + spa_dec_usage(spa->type, spa->real_size, (spa->flags & SP_DVPP)); /* won't fail */ if (spa->spg) { atomic_dec(&spa->spg->spa_num); atomic64_sub(spa->real_size, &spa->spg->size); @@ -2321,6 +2337,7 @@ static void spa_overview_show(struct seq_file *seq) { unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; + unsigned long dvpp_size, dvpp_va_size;
spin_lock(&sp_area_lock); total_num = spa_stat.total_num; @@ -2331,6 +2348,8 @@ static void spa_overview_show(struct seq_file *seq) alloc_size = spa_stat.alloc_size; k2u_task_size = spa_stat.k2u_task_size; k2u_spg_size = spa_stat.k2u_spg_size; + dvpp_size = spa_stat.dvpp_size; + dvpp_va_size = spa_stat.dvpp_va_size; spin_unlock(&sp_area_lock);
seq_printf(seq, "Spa total num %u.\n", total_num); @@ -2340,6 +2359,8 @@ static void spa_overview_show(struct seq_file *seq) seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); + seq_printf(seq, "Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); seq_printf(seq, "\n"); }
@@ -2358,12 +2379,13 @@ static int idr_spg_stat_cb(int id, void *p, void *data)
static void spg_overview_show(struct seq_file *seq) { + seq_printf(seq, "Share pool total size: %13ld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); mutex_lock(&sp_mutex); idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); mutex_unlock(&sp_mutex); - seq_printf(seq, "Share pool total size: %13ld KB, spa total num: %d.\n\n", - byte2kb(atomic64_read(&spg_stat.spa_total_size)), - atomic_read(&spg_stat.spa_total_num)); + seq_printf(seq, "\n"); }
static int spa_stat_show(struct seq_file *seq, void *offset)
From: Weilong Chen chenweilong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Sharepool gets a dedicated page-fault interface for huge pages, sharepool_no_page(), which makes memory allocation more efficient.
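For illustration only: the dispatch this patch ends up with in hugetlb_fault(), in essence. Share-pool VMAs are routed to the new sharepool_no_page() instead of hugetlb_no_page(), so the node-local fallback allocation lives in share-pool code rather than in generic hugetlb code.

    entry = huge_ptep_get(ptep);
    if (huge_pte_none(entry)) {
    #ifdef CONFIG_ASCEND_SHARE_POOL
            if (sp_check_vm_share_pool(vma->vm_flags))
                    ret = sharepool_no_page(mm, vma, mapping, idx,
                                            address, ptep, flags);
            else
    #endif
                    ret = hugetlb_no_page(mm, vma, mapping, idx,
                                          address, ptep, flags);
            goto out_mutex;
    }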
Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/hugetlb.h | 4 ++ include/linux/share_pool.h | 5 +++ mm/hugetlb.c | 22 +++++++--- mm/share_pool.c | 90 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2383d81ca2d6d..be07888005241 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -660,4 +660,8 @@ static inline int hugetlb_insert__hugepage_pte_by_pa(struct mm_struct *mm, } #endif
+#ifdef CONFIG_ASCEND_SHARE_POOL +pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable); +#endif + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 2557ef1381221..47d8579c23ece 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -220,6 +220,11 @@ static inline void sp_dump_stack(void) dump_stack(); }
+vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags); + #else
static inline int sp_group_add_task(int pid, int spg_id) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7d57d6a943c25..77d586cdd5c7b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3401,8 +3401,13 @@ const struct vm_operations_struct hugetlb_vm_ops = { .pagesize = hugetlb_vm_op_pagesize, };
+#ifdef CONFIG_ASCEND_SHARE_POOL +pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, + int writable) +#else static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, int writable) +#endif { pte_t entry;
@@ -3419,6 +3424,9 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
return entry; } +#ifdef CONFIG_ASCEND_SHARE_POOL +EXPORT_SYMBOL(make_huge_pte); +#endif
static void set_huge_ptep_writable(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) @@ -4011,12 +4019,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, }
page = alloc_huge_page(vma, haddr, 0); - if (IS_ERR(page) && sp_check_vm_share_pool(vma->vm_flags)) { - page = alloc_huge_page_node(hstate_file(vma->vm_file), - numa_mem_id()); - if (!page) - page = ERR_PTR(-ENOMEM); - } if (IS_ERR(page)) { /* * Returning error will result in faulting task being @@ -4204,7 +4206,15 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { +#ifdef CONFIG_ASCEND_SHARE_POOL + if (sp_check_vm_share_pool(vma->vm_flags)) { + ret = sharepool_no_page(mm, vma, mapping, idx, address, ptep, flags); + } else { + ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); + } +#else ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags); +#endif goto out_mutex; }
diff --git a/mm/share_pool.c b/mm/share_pool.c index a51179630e23b..2f4a4cef1fc93 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -41,6 +41,8 @@ #include <linux/idr.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> +#include <linux/rmap.h> +#include <linux/hugetlb.h>
/* access control mode macros */ #define AC_NONE 0 @@ -2399,6 +2401,94 @@ static int spa_stat_show(struct seq_file *seq, void *offset) return 0; }
+vm_fault_t sharepool_no_page(struct mm_struct *mm, + struct vm_area_struct *vma, + struct address_space *mapping, pgoff_t idx, + unsigned long address, pte_t *ptep, unsigned int flags) +{ + struct hstate *h = hstate_vma(vma); + vm_fault_t ret = VM_FAULT_SIGBUS; + unsigned long size; + struct page *page; + pte_t new_pte; + spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); + bool new_page = false; + int err; + +retry: + page = find_lock_page(mapping, idx); + if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; + + page = alloc_huge_page(vma, haddr, 0); + if (IS_ERR(page)) { + page = alloc_huge_page_node(hstate_file(vma->vm_file), + numa_mem_id()); + if (!page) + page = ERR_PTR(-ENOMEM); + } + if (IS_ERR(page)) { + ptl = huge_pte_lock(h, mm, ptep); + if (!huge_pte_none(huge_ptep_get(ptep))) { + ret = 0; + spin_unlock(ptl); + goto out; + } + spin_unlock(ptl); + ret = vmf_error(PTR_ERR(page)); + goto out; + } + __SetPageUptodate(page); + new_page = true; + + /* sharepool pages are all shared */ + err = huge_add_to_page_cache(page, mapping, idx); + if (err) { + put_page(page); + if (err == -EEXIST) + goto retry; + goto out; + } + } + + + ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + + ret = 0; + if (!huge_pte_none(huge_ptep_get(ptep))) + goto backout; + + page_dup_rmap(page, true); + new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, haddr, ptep, new_pte); + + hugetlb_count_add(pages_per_huge_page(h), mm); + + spin_unlock(ptl); + + if (new_page) { + SetPagePrivate(&page[1]); + } + + unlock_page(page); +out: + return ret; + +backout: + spin_unlock(ptl); + unlock_page(page); + put_page(page); + goto out; +} +EXPORT_SYMBOL(sharepool_no_page); + /* * Called by proc_root_init() to initialize the /proc/sharepool subtree */
From: Weilong Chen chenweilong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Add a flag VM_SHAREPOOL to prevent vfree() of a shared kva.
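For illustration only: the intended lifecycle of the flag, pieced together from the hunks below. The kva is tagged when it is shared out via k2u, untagged when the uva is unshared, and __vunmap() refuses to free a vm area that is still tagged.

    /* sp_make_share_k2u(): associate the vm area with the spa */
    area = find_vm_area((void *)kva);
    if (area)
            area->flags |= VM_SHAREPOOL;

    /* sp_unshare_uva(): drop the association when the uva goes away */
    area = find_vm_area((void *)spa->kva);
    if (area)
            area->flags &= ~VM_SHAREPOOL;

    /* __vunmap(): a kva that is still shared must not be freed */
    if (area->flags & VM_SHAREPOOL) {
            WARN(1, "Memory leak due to vfree() sharepool vm area (%p) !\n", addr);
            return;
    }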
Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/hugetlb.h | 3 +++ include/linux/share_pool.h | 2 -- include/linux/vmalloc.h | 4 +++ mm/hugetlb.c | 7 +++++ mm/memory.c | 3 ++- mm/share_pool.c | 53 +++++++++++++++++++++++++++++++++----- mm/vmalloc.c | 8 ++++++ 7 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index be07888005241..5e7cc7bd616a7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -402,6 +402,9 @@ static inline int hugetlb_insert_hugepage_pte(struct mm_struct *mm, return -EPERM; } #endif +int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, + unsigned long vir_addr, + pgprot_t prot, unsigned long phy_addr); int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, struct page *hpage, pgprot_t prot);
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 47d8579c23ece..933b77be8ff84 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -24,8 +24,6 @@
#define MAX_DEVID 1 /* the max num of Da-vinci devices */
-#define VM_HUGE_PAGES 0x00001000 /* use for huge pages */ - /* to align the pointer to the (next) PMD boundary */ #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 244eedb7591a7..6383d6989c0fc 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -23,6 +23,10 @@ struct notifier_block; /* in notifier.h */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ #define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ +#ifdef CONFIG_ASCEND_SHARE_POOL +#define VM_HUGE_PAGES 0x00001000 /* use for huge pages */ +#define VM_SHAREPOOL 0x00002000 /* remapped to sharepool */ +#endif /* bits [20..32] reserved for arch specific ioremap internals */
/* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 77d586cdd5c7b..89404e28b2514 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3647,6 +3647,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
pte = huge_ptep_get_and_clear(mm, address, ptep); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); + + /* sharepool k2u mapped pages are marked special */ + if (sp_check_vm_share_pool(vma->vm_flags) && pte_special(pte)) { + spin_unlock(ptl); + continue; + } + if (huge_pte_dirty(pte)) set_page_dirty(page);
diff --git a/mm/memory.c b/mm/memory.c index 6530d76a40af8..054e62292902a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1543,7 +1543,8 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, }
if (sp_check_hugepage(page)) - return hugetlb_insert_hugepage(vma, addr, page, vma->vm_page_prot); + return hugetlb_insert_hugepage_pte_by_pa(vma->vm_mm, addr, + vma->vm_page_prot, page_to_phys(page)); else return insert_page(vma, addr, page, vma->vm_page_prot); } diff --git a/mm/share_pool.c b/mm/share_pool.c index 2f4a4cef1fc93..fa63d8e6a3012 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -178,6 +178,7 @@ struct sp_area { struct sp_group *spg; enum spa_type type; /* where spa born from */ struct mm_struct *mm; /* owner of k2u(task) */ + unsigned long kva; /* shared kva */ }; static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; @@ -1393,6 +1394,17 @@ static int is_vmap_hugepage(unsigned long addr) return 0; }
+static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn; + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + else + pfn = virt_to_pfn(kva); + + return pfn; +} + static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, struct mm_struct *mm) { @@ -1403,6 +1415,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, int ret = 0; struct user_struct *user = NULL; int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + unsigned long addr, buf, offset;
if (spa->is_hugepage) { file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, @@ -1437,13 +1450,23 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret_addr = ret; goto out; } + vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; } else { - ret = remap_vmalloc_range(vma, (void *)kva, 0); - if (ret) { - pr_err("share pool: remap vmalloc failed, ret %d\n", ret); - ret_addr = ret; - goto out; - } + buf = ret_addr; + addr = kva; + offset = 0; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + pr_err("share pool: remap_pfn_range failed, ret %d\n", ret); + ret_addr = ret; + goto out; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while (offset < spa_size(spa)); }
out: @@ -1551,6 +1574,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned int page_size = PAGE_SIZE; enum spa_type type; int ret; + struct vm_struct *area;
if (sp_flags & ~SP_DVPP) { if (printk_ratelimit()) @@ -1632,6 +1656,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, type, current->comm, current->tgid, current->pid, spg_id, (void *)spa->va_start, spa->real_size); sp_dump_stack(); + + /* associate vma and spa */ + area = find_vm_area((void *)kva); + if (area) + area->flags |= VM_SHAREPOOL; + spa->kva = kva; }
return uva; @@ -1901,6 +1931,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; + struct vm_struct *area;
mutex_lock(&sp_mutex); /* @@ -2031,6 +2062,10 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp }
out_drop_area: + /* deassociate vma and spa */ + area = find_vm_area((void *)spa->kva); + if (area) + area->flags &= ~VM_SHAREPOOL; __sp_area_drop(spa); out_unlock: mutex_unlock(&sp_mutex); @@ -2045,6 +2080,7 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) unsigned long step; bool is_hugepage = true; int ret; + struct vm_struct *area;
ret = is_vmap_hugepage(kva); if (ret > 0) { @@ -2080,6 +2116,11 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) (void *)addr); }
+ /* deassociate vma and spa */ + area = find_vm_area((void *)kva_aligned); + if (area) + area->flags &= ~VM_SHAREPOOL; + vunmap((void *)kva_aligned);
return 0; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 511578705e0d8..2f39cc83a5a4d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2369,6 +2369,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; }
+#ifdef CONFIG_ASCEND_SHARE_POOL + /* unmap a sharepool vm area will cause meamleak! */ + if (area->flags & VM_SHAREPOOL) { + WARN(1, KERN_ERR "Memory leak due to vfree() sharepool vm area (%p) !\n", addr); + return; + } +#endif + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- 1. Fix the error handling of sp_free().
2. When sp_alloc(..., SP_HUGEPAGE, ...) falls back to normal pages, we need to call vfs_fallocate(), otherwise the huge-page memory leaks until the sp group is dead.
3. When sp_alloc(..., SP_HUGEPAGE, ...) falls back to normal pages, we also need to clear the SP_HUGEPAGE bit in sp_flags so that the spa_stat interface correctly shows the spa as a normal-page spa (see the sketch after this list).
4. Take the reference count of an spg in find_or_alloc_sp_group(), where it is more closely tied to group lookup.
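For illustration only: the shape of the corrected fallback path in sp_alloc(), using the variables from the diff below. The hole is punched before the retry decision, so the pages already populated are returned on every failure, and SP_HUGEPAGE is cleared so the retried spa is accounted as a normal-page spa.

    /* undo whatever got populated for this attempt */
    mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
    offset = sp_addr - MMAP_SHARE_POOL_START;
    ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa));

    if (file == spg->file_hugetlb) {
            spg->hugepage_failures++;
            if (!(sp_flags & SP_HUGEPAGE_ONLY)) {
                    /* retry with normal pages; account the spa as such */
                    file = spg->file;
                    size_aligned = ALIGN(size, PAGE_SIZE);
                    sp_flags &= ~SP_HUGEPAGE;
                    __sp_area_drop(spa);
                    mmput(mm);
                    goto try_again;
            }
    }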
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index fa63d8e6a3012..5717ac85ab483 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -358,7 +358,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; spg->owner = current->group_leader; - atomic_set(&spg->use_count, 0); + atomic_set(&spg->use_count, 1); INIT_LIST_HEAD(&spg->procs); INIT_LIST_HEAD(&spg->spa_list);
@@ -391,6 +391,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) ret = PTR_ERR(spg->file_hugetlb); goto out_fput; } + } else { + if (!spg_valid(spg)) + return ERR_PTR(-ENODEV); + atomic_inc(&spg->use_count); }
return spg; @@ -540,12 +544,6 @@ int sp_group_add_task(int pid, int spg_id) goto out_put_task; }
- if (!spg_valid(spg)) { - ret = -ENODEV; - goto out_put_task; - } - atomic_inc(&spg->use_count); - /* access control permission check */ if (sysctl_ac_mode == AC_SINGLE_OWNER) { if (spg->owner != current->group_leader) { @@ -1102,6 +1100,7 @@ int sp_free(unsigned long addr) if (printk_ratelimit()) pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n", (void *)addr); + goto drop_spa; }
if (!spg_valid(spa->spg)) @@ -1312,31 +1311,32 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) __sp_free(spg, sp_addr, size_aligned, list_next_entry(mm, sp_node));
+ if (printk_ratelimit()) + pr_warn("share pool: allocation failed due to mm populate failed" + "(potential no enough memory when -12): %d\n", ret); + p = ERR_PTR(ret); + + mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + offset = sp_addr - MMAP_SHARE_POOL_START; + + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + pr_err("share pool: sp alloc normal page fallocate failed %d\n", ret); + if (file == spg->file_hugetlb) { spg->hugepage_failures++;
/* fallback to small pages */ if (!(sp_flags & SP_HUGEPAGE_ONLY)) { file = spg->file; - spa->is_hugepage = false; size_aligned = ALIGN(size, PAGE_SIZE); + sp_flags &= ~SP_HUGEPAGE; __sp_area_drop(spa); mmput(mm); goto try_again; } }
- if (printk_ratelimit()) - pr_warn("share pool: allocation failed due to mm populate failed" - "(potential no enough memory when -12): %d\n", ret); - p = ERR_PTR(ret); - - mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; - offset = sp_addr - MMAP_SHARE_POOL_START; - ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); - if (ret) - pr_err("share pool: fallocate failed %d\n", ret); - mmput(mm); break; }
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The svm driver needs to share some memory with other processes in the same group, so use the share pool functions to support it.
The svm driver also exports a new feature that provides va-to-pa translation for special use. A user-space sketch of the new share-pool allocation ioctls follows.
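For illustration only: a minimal user-space sketch of the new allocation ioctls. The struct layout and command values are taken from the diff below; the device node path /dev/svm is an assumption and may differ on a real system.

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    #define SVM_IOCTL_SP_ALLOC 0xfff2
    #define SVM_IOCTL_SP_FREE  0xfff1

    struct spalloc {
            unsigned long addr;
            unsigned long size;
            unsigned long flag;
    };

    int main(void)
    {
            struct spalloc req = { .size = 2 * 1024 * 1024, .flag = 0 };
            int fd = open("/dev/svm", O_RDWR);  /* device path is an assumption */

            if (fd < 0)
                    return 1;
            if (ioctl(fd, SVM_IOCTL_SP_ALLOC, &req) == 0) {
                    printf("sp_alloc returned va 0x%lx\n", req.addr);
                    ioctl(fd, SVM_IOCTL_SP_FREE, &req);
            }
            close(fd);
            return 0;
    }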
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Zefan Li lizefan@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Wu Peng wupeng58@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/char/svm.c | 547 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 452 insertions(+), 95 deletions(-)
diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 6ffeff4601c39..5005e5b83f373 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -33,6 +33,7 @@ #include <linux/msi.h> #include <linux/acpi.h> #include <linux/ascend_smmu.h> +#include <linux/share_pool.h>
#define SVM_DEVICE_NAME "svm" #define ASID_SHIFT 48 @@ -40,11 +41,11 @@ #define SVM_IOCTL_PROCESS_BIND 0xffff #define SVM_IOCTL_GET_PHYS 0xfff9 #define SVM_IOCTL_SET_RC 0xfffc -#define SVM_IOCTL_GET_L2PTE_BASE 0xfffb #define SVM_IOCTL_LOAD_FLAG 0xfffa #define SVM_IOCTL_PIN_MEMORY 0xfff7 #define SVM_IOCTL_UNPIN_MEMORY 0xfff5 #define SVM_IOCTL_GETHUGEINFO 0xfff6 +#define SVM_IOCTL_GET_PHYMEMINFO 0xfff8 #define SVM_IOCTL_REMAP_PROC 0xfff4
#define SVM_REMAP_MEM_LEN_MAX (16 * 1024 * 1024) @@ -52,6 +53,9 @@ #define SVM_IOCTL_RELEASE_PHYS32 0xfff3 #define MMAP_PHY32_MAX (16 * 1024 * 1024)
+#define SVM_IOCTL_SP_ALLOC 0xfff2 +#define SVM_IOCTL_SP_FREE 0xfff1 +#define SPG_DEFAULT_ID 0 #define CORE_SID 0 static int probe_index; static LIST_HEAD(child_list); @@ -141,6 +145,24 @@ struct svm_mpam { int user_mpam_en; };
+struct phymeminfo { + unsigned long normal_total; + unsigned long normal_free; + unsigned long huge_total; + unsigned long huge_free; +}; + +struct phymeminfo_ioctl { + struct phymeminfo *info; + unsigned long nodemask; +}; + +struct spalloc { + unsigned long addr; + unsigned long size; + unsigned long flag; +}; + static struct bus_type svm_bus_type = { .name = "svm_bus", }; @@ -154,14 +176,14 @@ static char *svm_cmd_to_string(unsigned int cmd) return "get phys"; case SVM_IOCTL_SET_RC: return "set rc"; - case SVM_IOCTL_GET_L2PTE_BASE: - return "get l2pte base"; case SVM_IOCTL_PIN_MEMORY: return "pin memory"; case SVM_IOCTL_UNPIN_MEMORY: return "unpin memory"; case SVM_IOCTL_GETHUGEINFO: return "get hugeinfo"; + case SVM_IOCTL_GET_PHYMEMINFO: + return "get physical memory info"; case SVM_IOCTL_REMAP_PROC: return "remap proc"; case SVM_IOCTL_LOAD_FLAG: @@ -177,6 +199,223 @@ static char *svm_cmd_to_string(unsigned int cmd)
extern void sysrq_sched_debug_tidy(void);
+/* + * image word of slot + * SVM_IMAGE_WORD_INIT: initial value, indicating that the slot is not used. + * SVM_IMAGE_WORD_VALID: valid data is filled in the slot + * SVM_IMAGE_WORD_DONE: the DMA operation is complete when the TS uses this address, + so, this slot can be freed. + */ +#define SVM_IMAGE_WORD_INIT 0x0 +#define SVM_IMAGE_WORD_VALID 0xaa55aa55 +#define SVM_IMAGE_WORD_DONE 0x55ff55ff + +/* + * The length of this structure must be 64 bytes, which is the agreement with the TS. + * And the data type and sequence cannot be changed, because the TS core reads data + * based on the data type and sequence. + * image_word: slot status. For details, see SVM_IMAGE_WORD_xxx + * pid: pid of process which ioctl svm device to get physical addr, it is used for + verification by TS. + * data_type: used to determine the data type by TS. Currently, data type must be + SVM_VA2PA_TYPE_DMA. + * char data[48]: for the data type SVM_VA2PA_TYPE_DMA, the DMA address is stored. + */ +struct svm_va2pa_slot { + int image_word; + int resv; + int pid; + int data_type; + char data[48]; +}; + +struct svm_va2pa_trunk { + struct svm_va2pa_slot *slots; + int slot_total; + int slot_used; + unsigned long *bitmap; + struct mutex mutex; +}; + +struct svm_va2pa_trunk va2pa_trunk; + +#define SVM_VA2PA_TRUNK_SIZE_MAX 0x3200000 +#define SVM_VA2PA_MEMORY_ALIGN 64 +#define SVM_VA2PA_SLOT_SIZE sizeof(struct svm_va2pa_slot) +#define SVM_VA2PA_TYPE_DMA 0x1 +#define SVM_MEM_REG "va2pa trunk" +#define SVM_VA2PA_CLEAN_BATCH_NUM 0x80 + +struct device_node *svm_find_mem_reg_node(struct device *dev, const char *compat) +{ + int index = 0; + struct device_node *tmp = NULL; + struct device_node *np = dev->of_node; + + for (; ; index++) { + tmp = of_parse_phandle(np, "memory-region", index); + if (!tmp) + break; + + if (of_device_is_compatible(tmp, compat)) + return tmp; + + of_node_put(tmp); + } + + return NULL; +} + +static int svm_parse_trunk_memory(struct device *dev, phys_addr_t *base, unsigned long *size) +{ + int err; + struct resource r; + struct device_node *trunk = NULL; + + trunk = svm_find_mem_reg_node(dev, SVM_MEM_REG); + if (!trunk) { + dev_err(dev, "Didn't find reserved memory\n"); + return -EINVAL; + } + + err = of_address_to_resource(trunk, 0, &r); + of_node_put(trunk); + if (err) { + dev_err(dev, "Couldn't address to resource for reserved memory\n"); + return -ENOMEM; + } + + *base = r.start; + *size = resource_size(&r); + + return 0; +} + +static int svm_setup_trunk(struct device *dev, phys_addr_t base, unsigned long size) +{ + int slot_total; + unsigned long *bitmap = NULL; + struct svm_va2pa_slot *slot = NULL; + + if (!IS_ALIGNED(base, SVM_VA2PA_MEMORY_ALIGN)) { + dev_err(dev, "Didn't aligned to %u\n", SVM_VA2PA_MEMORY_ALIGN); + return -EINVAL; + } + + if ((size == 0) || (size > SVM_VA2PA_TRUNK_SIZE_MAX)) { + dev_err(dev, "Size of reserved memory is not right\n"); + return -EINVAL; + } + + slot_total = size / SVM_VA2PA_SLOT_SIZE; + if (slot_total < BITS_PER_LONG) + return -EINVAL; + + bitmap = kvcalloc(slot_total / BITS_PER_LONG, sizeof(unsigned long), GFP_KERNEL); + if (!bitmap) { + dev_err(dev, "alloc memory failed\n"); + return -ENOMEM; + } + + slot = ioremap(base, size); + if (!slot) { + kvfree(bitmap); + dev_err(dev, "Ioremap trunk failed\n"); + return -ENXIO; + } + + va2pa_trunk.slots = slot; + va2pa_trunk.slot_used = 0; + va2pa_trunk.slot_total = slot_total; + va2pa_trunk.bitmap = bitmap; + mutex_init(&va2pa_trunk.mutex); + + return 0; +} + +static void svm_remove_trunk(struct device *dev) +{ + 
iounmap(va2pa_trunk.slots); + kvfree(va2pa_trunk.bitmap); + + va2pa_trunk.slots = NULL; + va2pa_trunk.bitmap = NULL; +} + +static void svm_set_slot_valid(unsigned long index, unsigned long phys) +{ + struct svm_va2pa_slot *slot = &va2pa_trunk.slots[index]; + + *((unsigned long *)slot->data) = phys; + slot->image_word = SVM_IMAGE_WORD_VALID; + slot->pid = current->pid; + slot->data_type = SVM_VA2PA_TYPE_DMA; + __bitmap_set(va2pa_trunk.bitmap, index, 1); + va2pa_trunk.slot_used++; +} + +static void svm_set_slot_init(unsigned long index) +{ + struct svm_va2pa_slot *slot = &va2pa_trunk.slots[index]; + + slot->image_word = SVM_IMAGE_WORD_INIT; + __bitmap_clear(va2pa_trunk.bitmap, index, 1); + va2pa_trunk.slot_used--; +} + +static void svm_clean_done_slots(void) +{ + int used = va2pa_trunk.slot_used; + int count = 0; + long temp = -1; + phys_addr_t addr; + unsigned long *bitmap = va2pa_trunk.bitmap; + + for (; count < used && count < SVM_VA2PA_CLEAN_BATCH_NUM;) { + temp = find_next_bit(bitmap, va2pa_trunk.slot_total, temp + 1); + if (temp == va2pa_trunk.slot_total) + break; + + count++; + if (va2pa_trunk.slots[temp].image_word != SVM_IMAGE_WORD_DONE) + continue; + + addr = *((phys_addr_t *)(va2pa_trunk.slots[temp].data)); + put_page(pfn_to_page(PHYS_PFN(addr))); + svm_set_slot_init(temp); + } +} + +static int svm_find_slot_init(unsigned long *index) +{ + int temp; + unsigned long *bitmap = va2pa_trunk.bitmap; + + temp = find_first_zero_bit(bitmap, va2pa_trunk.slot_total); + if (temp == va2pa_trunk.slot_total) + return -ENOSPC; + + *index = temp; + return 0; +} + +static int svm_va2pa_trunk_init(struct device *dev) +{ + int err; + phys_addr_t base; + unsigned long size; + + err = svm_parse_trunk_memory(dev, &base, &size); + if (err) + return err; + + err = svm_setup_trunk(dev, base, size); + if (err) + return err; + + return 0; +} + void sysrq_sched_debug_show_export(void) { #ifdef CONFIG_SCHED_DEBUG @@ -1100,56 +1339,91 @@ static pte_t *svm_get_pte(struct vm_area_struct *vma, return pte; }
+/* Must be called with mmap_sem held */ static pte_t *svm_walk_pt(unsigned long addr, unsigned long *page_size, unsigned long *offset) { pgd_t *pgd = NULL; pud_t *pud = NULL; - pte_t *pte = NULL; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL;
- down_read(&mm->mmap_sem); vma = find_vma(mm, addr); if (!vma) - goto err; + return NULL;
pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) - goto err; + return NULL;
pud = pud_offset(pgd, addr); if (pud_none_or_clear_bad(pud)) - goto err; - - pte = svm_get_pte(vma, pud, addr, page_size, offset); + return NULL;
-err: - up_read(&mm->mmap_sem); - return pte; + return svm_get_pte(vma, pud, addr, page_size, offset); }
static int svm_get_phys(unsigned long __user *arg) { - pte_t *pte = NULL; + int err; + pte_t *ptep = NULL; + pte_t pte; + unsigned long index = 0; + struct page *page; unsigned long addr, phys, offset; + struct mm_struct *mm = current->mm;
if (!acpi_disabled) return -EPERM;
- if (arg == NULL) - return -EINVAL; - if (get_user(addr, arg)) return -EFAULT;
- pte = svm_walk_pt(addr, NULL, &offset); - if (pte && pte_present(*pte)) { - phys = PFN_PHYS(pte_pfn(*pte)) + offset; - return put_user(phys, arg); + down_read(&mm->mmap_sem); + ptep = svm_walk_pt(addr, NULL, &offset); + if (!ptep) { + up_read(&mm->mmap_sem); + return -EINVAL; }
- return -EINVAL; + pte = READ_ONCE(*ptep); + if (!pte_present(pte) || !(pfn_present(pte_pfn(pte)))) { + up_read(&mm->mmap_sem); + return -EINVAL; + } + + page = pte_page(pte); + get_page(page); + + phys = PFN_PHYS(pte_pfn(pte)) + offset; + up_read(&mm->mmap_sem); + + mutex_lock(&va2pa_trunk.mutex); + svm_clean_done_slots(); + if (va2pa_trunk.slot_used == va2pa_trunk.slot_total) { + err = -ENOSPC; + goto err_mutex_unlock; + } + + err = svm_find_slot_init(&index); + if (err) + goto err_mutex_unlock; + + svm_set_slot_valid(index, phys); + + err = put_user(index * SVM_VA2PA_SLOT_SIZE, (unsigned long __user *)arg); + if (err) + goto err_slot_init; + + mutex_unlock(&va2pa_trunk.mutex); + return 0; + +err_slot_init: + svm_set_slot_init(index); +err_mutex_unlock: + mutex_unlock(&va2pa_trunk.mutex); + put_page(page); + return err; }
int svm_get_pasid(pid_t vpid, int dev_id __maybe_unused) @@ -1414,6 +1688,7 @@ static int svm_set_rc(unsigned long __user *arg) unsigned long addr, size, rc; unsigned long end, page_size, offset; pte_t *pte = NULL; + struct mm_struct *mm = current->mm;
if (acpi_disabled) return -EPERM; @@ -1434,24 +1709,25 @@ static int svm_set_rc(unsigned long __user *arg) if (addr >= end) return -EINVAL;
+ down_read(&mm->mmap_sem); while (addr < end) { pte = svm_walk_pt(addr, &page_size, &offset); - if (!pte) + if (!pte) { + up_read(&mm->mmap_sem); return -ESRCH; + } pte->pte |= (rc & (u64)0x0f) << 59; addr += page_size - offset; } + up_read(&mm->mmap_sem);
return 0; }
-static int svm_get_l2pte_base(struct svm_device *sdev, - unsigned long __user *arg) +static long svm_get_hugeinfo(unsigned long __user *arg) { - int i = 0, err = -EINVAL; - unsigned long *base = NULL; - unsigned long vaddr, size; - struct mm_struct *mm = current->mm; + struct hstate *h = &default_hstate; + struct meminfo info;
if (!acpi_disabled) return -EPERM; @@ -1459,87 +1735,81 @@ static int svm_get_l2pte_base(struct svm_device *sdev, if (arg == NULL) return -EINVAL;
- if (get_user(vaddr, arg)) - return -EFAULT; + if (!hugepages_supported()) + return -ENOTSUPP;
- if (!IS_ALIGNED(vaddr, sdev->l2size)) - return -EINVAL; + info.hugetlbfree = h->free_huge_pages; + info.hugetlbtotal = h->nr_huge_pages;
- if (get_user(size, arg + 1)) + if (copy_to_user((void __user *)arg, &info, sizeof(info))) return -EFAULT;
- if (size != sdev->l2size || size != sdev->l2size) - return -EINVAL; + pr_info("svm get hugetlb info: order(%u), max_huge_pages(%lu)," + "nr_huge_pages(%lu), free_huge_pages(%lu), resv_huge_pages(%lu)", + h->order, + h->max_huge_pages, + h->nr_huge_pages, + h->free_huge_pages, + h->resv_huge_pages);
- size = ALIGN(size, PMD_SIZE) / PMD_SIZE; - base = kmalloc_array(size, sizeof(*base), GFP_KERNEL); - if (base == NULL) - return -ENOMEM; + return 0; +}
- while (size) { - pgd_t *pgd = NULL; - pud_t *pud = NULL; - pmd_t *pmd = NULL; +static void svm_get_node_memory_info_inc(unsigned long nid, struct phymeminfo *info) +{ + struct sysinfo i; + struct hstate *h = &default_hstate; + unsigned long huge_free = 0; + unsigned long huge_total = 0;
- pgd = pgd_offset(mm, vaddr); - if (pgd_none(*pgd) || pgd_bad(*pgd)) - goto err_out; + if (hugepages_supported()) { + huge_free = h->free_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); + huge_total = h->nr_huge_pages_node[nid] * (PAGE_SIZE << huge_page_order(h)); + }
- pud = pud_offset(pgd, vaddr); - if (pud_none(*pud) || pud_bad(*pud)) - goto err_out; +#ifdef CONFIG_NUMA + si_meminfo_node(&i, nid); +#else + si_meminfo(&i); +#endif + info->normal_free += i.freeram * PAGE_SIZE; + info->normal_total += i.totalram * PAGE_SIZE - huge_total; + info->huge_total += huge_total; + info->huge_free += huge_free; +}
- pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd) || pmd_bad(*pmd)) - goto err_out; +static void __svm_get_memory_info(unsigned long nodemask, struct phymeminfo *info) +{ + memset(info, 0x0, sizeof(struct phymeminfo));
- /* - * For small page base address, it should use pte_pfn - * instead of pmd_pfn. - */ - base[i] = PFN_PHYS(pte_pfn(*((pte_t *)pmd))); - vaddr += PMD_SIZE; - size--; - i++; - } + nodemask = nodemask & ((1UL << MAX_NUMNODES) - 1);
- /* lint !e647 */ - err = copy_to_user((void __user *)arg, base, i * sizeof(*base)); - if (err) - err = -EFAULT; -err_out: - kfree(base); - return err; + while (nodemask) { + unsigned long nid = find_first_bit(&nodemask, BITS_PER_LONG); + if (node_isset(nid, node_online_map)) { + (void)svm_get_node_memory_info_inc(nid, info); + } + + nodemask &= ~(1UL << nid); + } }
-static long svm_get_hugeinfo(unsigned long __user *arg) +static long svm_get_phy_memory_info(unsigned long __user *arg) { - struct hstate *h = &default_hstate; - struct meminfo info; - - if (!acpi_disabled) - return -EPERM; + struct phymeminfo info; + struct phymeminfo_ioctl para;
if (arg == NULL) return -EINVAL;
- if (!hugepages_supported()) - return -ENOTSUPP; + if (copy_from_user(¶, (void __user *)arg, sizeof(para))) + return -EFAULT;
- info.hugetlbfree = h->free_huge_pages; - info.hugetlbtotal = h->nr_huge_pages; + __svm_get_memory_info(para.nodemask, &info);
- if (copy_to_user((void __user *)arg, &info, sizeof(info))) + if (copy_to_user((void __user *)para.info, &info, sizeof(info))) return -EFAULT;
- pr_info("svm get hugetlb info: order(%u), max_huge_pages(%lu)," - "nr_huge_pages(%lu), free_huge_pages(%lu), resv_huge_pages(%lu)", - h->order, - h->max_huge_pages, - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages); - return 0; }
@@ -1835,13 +2105,15 @@ static int svm_release_phys32(unsigned long __user *arg) if (get_user(addr, arg)) return -EFAULT;
+ down_read(&mm->mmap_sem); pte = svm_walk_pt(addr, NULL, &offset); - if (pte && pte_present(*pte)) + if (pte && pte_present(*pte)) { phys = PFN_PHYS(pte_pfn(*pte)) + offset; - else + } else { + up_read(&mm->mmap_sem); return -EINVAL; + }
- down_read(&mm->mmap_sem); vma = find_vma(mm, addr); if (!vma) { up_read(&mm->mmap_sem); @@ -1858,6 +2130,77 @@ static int svm_release_phys32(unsigned long __user *arg) return 0; }
+static unsigned long svm_sp_alloc_mem(unsigned long __user *arg) +{ + struct spalloc spallocinfo; + void *addr; + int ret; + + if (arg == NULL) { + pr_err("arg is invalid value.\n"); + return EFAULT; + } + + ret = copy_from_user(&spallocinfo, (void __user *)arg, sizeof(spallocinfo)); + if (ret) { + pr_err("failed to copy args from user space.\n"); + return EFAULT; + } + + addr = sp_alloc(spallocinfo.size, spallocinfo.flag, SPG_DEFAULT_ID); + if (IS_ERR_VALUE(addr)) { + pr_err("svm: sp alloc failed with %ld\n", PTR_ERR(addr)); + return EFAULT; + } + + pr_notice("svm: [sp alloc] caller %s(%d/%d); return addr 0x%pK, size %lu\n", + current->comm, current->tgid, current->pid, addr, spallocinfo.size); + sp_dump_stack(); + + spallocinfo.addr = (uintptr_t)addr; + if (copy_to_user((void __user *)arg, &spallocinfo, sizeof(struct spalloc))) { + sp_free(spallocinfo.addr); + return EFAULT; + } + + return 0; +} + +static int svm_sp_free_mem(unsigned long __user *arg) +{ + int ret; + struct spalloc spallocinfo; + + if (arg == NULL) { + pr_err("arg ivalue.\n"); + return -EFAULT; + } + + ret = copy_from_user(&spallocinfo, (void __user *)arg, sizeof(spallocinfo)); + if (ret) { + pr_err("failed to copy args from user space.\n"); + return -EFAULT; + } + + ret = is_sharepool_addr(spallocinfo.addr); + if (ret == FALSE){ + pr_err("svm: sp free failed because the addr is not from sp.\n"); + return -EINVAL; + } + + ret = sp_free(spallocinfo.addr); + if (ret != 0) { + pr_err("svm: sp free failed with %d.\n", ret); + return -EFAULT; + } + + pr_notice("svm: [sp free] caller %s(%d/%d); addr 0x%pK\n", + current->comm, current->tgid, current->pid, (void *)spallocinfo.addr); + sp_dump_stack(); + + return 0; +} + /*svm ioctl will include some case for HI1980 and HI1910*/ static long svm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -1909,9 +2252,6 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_SET_RC: err = svm_set_rc((unsigned long __user *)arg); break; - case SVM_IOCTL_GET_L2PTE_BASE: - err = svm_get_l2pte_base(sdev, (unsigned long __user *)arg); - break; case SVM_IOCTL_PIN_MEMORY: err = svm_pin_memory((unsigned long __user *)arg); break; @@ -1921,6 +2261,9 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_GETHUGEINFO: err = svm_get_hugeinfo((unsigned long __user *)arg); break; + case SVM_IOCTL_GET_PHYMEMINFO: + err = svm_get_phy_memory_info((unsigned long __user *)arg); + break; case SVM_IOCTL_REMAP_PROC: err = svm_remap_proc((unsigned long __user *)arg); break; @@ -1930,6 +2273,12 @@ static long svm_ioctl(struct file *file, unsigned int cmd, case SVM_IOCTL_RELEASE_PHYS32: err = svm_release_phys32((unsigned long __user *)arg); break; + case SVM_IOCTL_SP_ALLOC: + err = svm_sp_alloc_mem((unsigned long __user *)arg); + break; + case SVM_IOCTL_SP_FREE: + err = svm_sp_free_mem((unsigned long __user *)arg); + break; default: err = -EINVAL; } @@ -2041,10 +2390,15 @@ static int svm_device_probe(struct platform_device *pdev) if (err) dev_warn(dev, "Cannot get l2buff\n");
+ if (svm_va2pa_trunk_init(dev)) { + dev_err(dev, "failed to init va2pa trunk\n"); + goto err_unregister_misc; + } + err = svm_dt_init_core(sdev, np); if (err) { dev_err(dev, "failed to init dt cores\n"); - goto err_unregister_misc; + goto err_remove_trunk; }
probe_index++; @@ -2054,6 +2408,9 @@ static int svm_device_probe(struct platform_device *pdev)
return err;
+err_remove_trunk: + svm_remove_trunk(dev); + err_unregister_misc: misc_deregister(&sdev->miscdev);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- Since we must use "%pK" when printing addresses for security reasons, the memleak debug printing is useless and can be removed.
Additionally, we change the permission mode of spa_stat and proc_stat to 400 for security reasons.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/char/svm.c | 4 ---- mm/share_pool.c | 44 +++++++++----------------------------------- 2 files changed, 9 insertions(+), 39 deletions(-)
diff --git a/drivers/char/svm.c b/drivers/char/svm.c index 5005e5b83f373..9c2965f6d4636 100644 --- a/drivers/char/svm.c +++ b/drivers/char/svm.c @@ -2153,8 +2153,6 @@ static unsigned long svm_sp_alloc_mem(unsigned long __user *arg) return EFAULT; }
- pr_notice("svm: [sp alloc] caller %s(%d/%d); return addr 0x%pK, size %lu\n", - current->comm, current->tgid, current->pid, addr, spallocinfo.size); sp_dump_stack();
spallocinfo.addr = (uintptr_t)addr; @@ -2194,8 +2192,6 @@ static int svm_sp_free_mem(unsigned long __user *arg) return -EFAULT; }
- pr_notice("svm: [sp free] caller %s(%d/%d); addr 0x%pK\n", - current->comm, current->tgid, current->pid, (void *)spallocinfo.addr); sp_dump_stack();
return 0; diff --git a/mm/share_pool.c b/mm/share_pool.c index 5717ac85ab483..1d26d99ea2410 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1106,10 +1106,6 @@ int sp_free(unsigned long addr) if (!spg_valid(spa->spg)) goto drop_spa;
- pr_notice("share pool: [sp free] caller %s(%d/%d); " - "group id %d addr 0x%pK, size %ld\n", - current->comm, current->tgid, current->pid, spa->spg->id, - (void *)spa->va_start, spa->real_size); sp_dump_stack();
__sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); @@ -1356,13 +1352,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (spa && !IS_ERR(spa)) __sp_area_drop(spa);
- if (!IS_ERR(p)) { - pr_notice("share pool: [sp alloc] caller %s(%d/%d); group id %d; " - "return addr 0x%pK, size %ld\n", - current->comm, current->tgid, current->pid, spa->spg->id, - (void *)spa->va_start, spa->real_size); - sp_dump_stack(); - } + sp_dump_stack(); return p; } EXPORT_SYMBOL_GPL(sp_alloc); @@ -1572,7 +1562,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long kva_aligned; unsigned long size_aligned; unsigned int page_size = PAGE_SIZE; - enum spa_type type; int ret; struct vm_struct *area;
@@ -1599,14 +1588,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { - type = SPA_TYPE_K2TASK; + /* k2u to task */ if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); return ERR_PTR(-EINVAL); } - spa = sp_alloc_area(size_aligned, sp_flags, NULL, type); + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) @@ -1618,14 +1607,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); mutex_unlock(&sp_mutex); } else if (spg_valid(spg)) { - type = SPA_TYPE_K2SPG; + /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); return ERR_PTR(-EINVAL); } - spa = sp_alloc_area(size_aligned, sp_flags, spg, type); + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); if (IS_ERR(spa)) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) @@ -1649,20 +1638,13 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, __sp_area_drop(spa);
if (!IS_ERR(uva)) { - if (spg_valid(spa->spg)) - spg_id = spa->spg->id; - pr_notice("share pool: [sp k2u type %d] caller %s(%d/%d); group id %d; " - "return addr 0x%pK size %ld\n", - type, current->comm, current->tgid, current->pid, spg_id, - (void *)spa->va_start, spa->real_size); - sp_dump_stack(); - /* associate vma and spa */ area = find_vm_area((void *)kva); if (area) area->flags |= VM_SHAREPOOL; spa->kva = kva; } + sp_dump_stack();
return uva; } @@ -2051,15 +2033,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp __sp_free(spa->spg, uva_aligned, size_aligned, NULL); }
- if (!ret) { - if (spg_valid(spa->spg)) - spg_id = spa->spg->id; - pr_notice("share pool: [sp unshare uva type %d] caller %s(%d/%d); " - "group id %d addr 0x%pK size %ld\n", - spa->type, current->comm, current->tgid, current->pid, - spg_id, (void *)spa->va_start, spa->real_size); - sp_dump_stack(); - } + sp_dump_stack();
out_drop_area: /* deassociate vma and spa */ @@ -2538,8 +2512,8 @@ void __init proc_sharepool_init(void) if (!proc_mkdir("sharepool", NULL)) return;
- proc_create_single_data("sharepool/proc_stat", 0, NULL, proc_stat_show, NULL); - proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL); + proc_create_single_data("sharepool/proc_stat", S_IRUSR, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", S_IRUSR, NULL, spa_stat_show, NULL); }
struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Tang Yizhou t00467064@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- If an spg id is generated automatically, we must ensure it is freed when something fails in sp_group_add_task().
Notice that the spg id is bound to a struct sp_group spg once find_or_alloc_sp_group() succeeds; spg and spg id are then freed together in __sp_group_drop_locked(). So we only need to free spg_id by hand on the failure paths hit before that point, as sketched below.
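For illustration only: where the automatically generated id is now released on failure, per the hunks below. Once find_or_alloc_sp_group() has succeeded the id belongs to the spg and is released together with it in __sp_group_drop_locked(); only the two earlier failure points free it by hand.

    if (ret) {
            /* task lookup failed: the auto id is not bound to any spg yet */
            free_sp_group_id((unsigned int)spg_id);
            goto out_unlock;
    }

    spg = find_or_alloc_sp_group(spg_id);
    if (IS_ERR(spg)) {
            /* no spg was created, so nobody else will free the id */
            ret = PTR_ERR(spg);
            free_sp_group_id((unsigned int)spg_id);
            goto out_put_task;
    }
    /* from here on, free_sp_group() releases the id via free_sp_group_id() */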
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1d26d99ea2410..1370e656320f2 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -268,15 +268,20 @@ static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp) static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate);
+static void free_sp_group_id(unsigned int spg_id) +{ + if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) || + (spg_id >= SPG_ID_DVPP_PASS_THROUGH_MIN && + spg_id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) + ida_free(&sp_group_id_ida, spg_id); +} + static void free_sp_group(struct sp_group *spg) { fput(spg->file); fput(spg->file_hugetlb); idr_remove(&sp_group_idr, spg->id); - if ((spg->id >= SPG_ID_AUTO_MIN && spg->id <= SPG_ID_AUTO_MAX) || - (spg->id >= SPG_ID_DVPP_PASS_THROUGH_MIN && - spg->id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) - ida_free(&sp_group_id_ida, (unsigned int)spg->id); + free_sp_group_id((unsigned int)spg->id); kfree(spg); }
@@ -535,12 +540,15 @@ int sp_group_add_task(int pid, int spg_id) get_task_struct(tsk);
rcu_read_unlock(); - if (ret) + if (ret) { + free_sp_group_id((unsigned int)spg_id); goto out_unlock; + }
spg = find_or_alloc_sp_group(spg_id); if (IS_ERR(spg)) { ret = PTR_ERR(spg); + free_sp_group_id((unsigned int)spg_id); goto out_put_task; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- v2: Fix a compilation warning.
1. Set errno when sp_free() fails.
2. Remove the redundant deassociation of uva and kva in sp_unshare_kva(). One of the reasons is that this keeps the vmalloc + k2u + unshare kva sequence legal.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1370e656320f2..b36d336b21656 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -895,6 +895,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, atomic_set(&spa->use_count, 1); spa->type = type; spa->mm = NULL; + spa->kva = 0; /* NULL pointer */
if (spa_inc_usage(type, size, (flags & SP_DVPP))) { err = ERR_PTR(-EINVAL); @@ -1105,6 +1106,7 @@ int sp_free(unsigned long addr) }
if (spa->type != SPA_TYPE_ALLOC) { + ret = -EINVAL; if (printk_ratelimit()) pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n", (void *)addr); @@ -2062,7 +2064,6 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) unsigned long step; bool is_hugepage = true; int ret; - struct vm_struct *area;
ret = is_vmap_hugepage(kva); if (ret > 0) { @@ -2098,11 +2099,6 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) (void *)addr); }
- /* deassociate vma and spa */ - area = find_vm_area((void *)kva_aligned); - if (area) - area->flags &= ~VM_SHAREPOOL; - vunmap((void *)kva_aligned);
return 0;
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
In sp_remap_kva_to_vma(), if sp_mmap() itself fails, the caller does not need to free the memory area in the target process. But if an operation fails after sp_mmap() has succeeded, the mapping must be rolled back.
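A minimal sketch of the rollback rule (illustrative only; locking, file setup and the non-hugepage path are omitted, and the helper name is an assumption):

static unsigned long example_remap(struct mm_struct *mm, struct file *file,
                                   struct sp_area *spa, unsigned long kva)
{
        unsigned long populate = 0;
        unsigned long addr;
        struct vm_area_struct *vma;
        int ret;

        addr = sp_mmap(mm, file, spa, &populate);
        if (IS_ERR_VALUE(addr))
                return addr;            /* nothing mapped, nothing to undo */

        vma = find_vma(mm, addr);
        ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0);
        if (ret) {
                do_munmap(mm, addr, spa_size(spa), NULL); /* roll back the vma */
                return ret;
        }

        return addr;
}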
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index b36d336b21656..f98d2a0f5dcc8 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1435,7 +1435,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: k2u mmap failed %lx\n", ret_addr); - goto out; + goto put_mm; } BUG_ON(ret_addr != spa->va_start);
@@ -1446,9 +1446,10 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (is_vm_hugetlb_page(vma)) { ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { + do_munmap(mm, ret_addr, spa_size(spa), NULL); pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); ret_addr = ret; - goto out; + goto put_mm; } vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; } else { @@ -1459,9 +1460,10 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, __pgprot(vma->vm_page_prot.pgprot)); if (ret) { + do_munmap(mm, ret_addr, spa_size(spa), NULL); pr_err("share pool: remap_pfn_range failed, ret %d\n", ret); ret_addr = ret; - goto out; + goto put_mm; } offset += PAGE_SIZE; buf += PAGE_SIZE; @@ -1469,7 +1471,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, } while (offset < spa_size(spa)); }
-out: +put_mm: up_write(&mm->mmap_sem); mmput(mm); put_file: @@ -1501,7 +1503,6 @@ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm); if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr); - sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); p = ERR_PTR(ret_addr); goto out; } @@ -1509,10 +1510,12 @@ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, p = (void *)ret_addr;
task_lock(tsk); - if (tsk->mm == NULL) + if (tsk->mm == NULL) { + sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); p = ERR_PTR(-ESRCH); - else + } else { spa->mm = tsk->mm; + } task_unlock(tsk); out: put_task_struct(tsk); @@ -1532,8 +1535,7 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, ret_addr = sp_remap_kva_to_vma(kva, spa, mm); if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); - __sp_free(spg, spa->va_start, spa_size(spa), - list_next_entry(mm, sp_node)); + __sp_free(spg, spa->va_start, spa_size(spa), mm); p = ERR_PTR(ret_addr); goto out; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- We encountered a NULL pointer dereference on mm, as shown below:
[25853.542104] Unable to handle kernel NULL pointer dereference at virtual address 000000000000004c [25853.542105] Mem abort info: [25853.542106] ESR = 0x96000006 [25853.542108] Exception class = DABT (current EL), IL = 32 bits [25853.542109] SET = 0, FnV = 0 [25853.542110] EA = 0, S1PTW = 0 [25853.542110] Data abort info: [25853.542111] ISV = 0, ISS = 0x00000006 [25853.542112] CM = 0, WnR = 0 [25853.542115] user pgtable: 4k pages, 48-bit VAs, pgdp = 0000000090ec4b55 [25853.542116] [000000000000004c] pgd=0000000d49b38003, pud=0000000d49be6003, pmd=0000000000000000 [25853.542121] Internal error: Oops: 96000006 [#1] SMP [25853.542123] Process dds_common_app (pid: 29004, stack limit = 0x000000001fcac39f) [25853.542127] CPU: 8 PID: 29004 Comm: dds_common_app Tainted: G O 4.19.95-1.h1.AOS2.0.aarch64 #1 [25853.542128] Hardware name: asic (DT) [25853.542129] pstate: 60400009 (nZCv daif +PAN -UAO) [25853.542136] pc : mmput+0x20/0x170 [25853.542141] lr : sp_make_share_u2k+0x1c0/0x2d8 [25853.542141] sp : ffff00000d263b80 [25853.542142] pmr_save: 000000e0 [25853.542143] x29: ffff00000d263b80 x28: 0000000000000007 [25853.542145] x27: 0000000000000007 x26: ffff00000d263cb0 [25853.542147] x25: ffff800d3d32d000 x24: ffff00000107f000 [25853.542148] x23: 0000000000007143 x22: 0000e80006000000 [25853.542150] x21: ffff00009b140000 x20: ffff0000095fa000 [25853.542151] x19: 0000000000000000 x18: ffff00000961b588 [25853.542153] x17: 0000000000000000 x16: 000000000000000e [25853.542154] x15: 0000000000000000 x14: ffff800d44d66418 [25853.542155] x13: 0000000000000000 x12: 0140000000000000 [25853.542157] x11: ffff0000db140000 x10: ffff0000096468f8 [25853.542158] x9 : ffff80089a517e80 x8 : 0000000000000008 [25853.542160] x7 : 00000000ffffffff x6 : 000000c4c5d56692 [25853.542161] x5 : 0000000000000000 x4 : 0000000000000020 [25853.542163] x3 : 0000000000000010 x2 : 000000000000004c [25853.542164] x1 : 0000000000000000 x0 : ffff00000831c718 [25853.542166] Call trace: [25853.542168] mmput+0x20/0x170 [25853.542169] sp_make_share_u2k+0x1c0/0x2d8 [25853.542180] mz_create+0x7c/0x328 [drv_buff_module] [25853.542185] buff_req_ioctl_mz_create+0xf8/0x338 [drv_buff_module] [25853.542189] buff_ioctl+0xc8/0x3f8 [drv_buff_module] [25853.542193] do_vfs_ioctl+0xc4/0x8c0 [25853.542194] ksys_ioctl+0x8c/0xa0 [25853.542196] __arm64_sys_ioctl+0x28/0x38 [25853.542200] el0_svc_common+0x8c/0x218 [25853.542202] el0_svc_handler+0x38/0x88 [25853.542205] el0_svc+0x14/0x40
The offset of mm_users in mm_struct is 0x4c, so the NULL pointer is mm.
The concurrency scenario can be described as follows: at first, mmget_not_zero() increases the refcount of mm successfully, but then the target process is killed. In exit_mm(), the exiting process sets
current->mm = NULL;
so when mmput() is later called in u2k, tsk->mm is already NULL.
To fix the problem, use get_task_mm() and keep the mm pointer in a local variable.
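A minimal sketch of the fixed pattern, assuming the post-patch signature of __sp_walk_page_range() that takes an mm:

static int example_walk(struct task_struct *tsk, unsigned long uva,
                        unsigned long size, struct sp_walk_data *data)
{
        struct mm_struct *mm;
        int ret;

        mm = get_task_mm(tsk);          /* pins mm, or returns NULL if exiting */
        if (!mm)
                return -ESRCH;

        down_write(&mm->mmap_sem);
        ret = __sp_walk_page_range(uva, size, mm, data);
        up_write(&mm->mmap_sem);

        mmput(mm);                      /* drop the reference taken above */
        return ret;
}

Because the local mm holds its own reference, a concurrent exit_mm() that clears tsk->mm can no longer turn the later mmput() into a NULL dereference.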
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f98d2a0f5dcc8..fb492679473c0 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1746,7 +1746,7 @@ static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, * ALIGN(uva+size) - uva_aligned */ static int __sp_walk_page_range(unsigned long uva, unsigned long size, - struct task_struct *tsk, struct sp_walk_data *sp_walk_data) + struct mm_struct *mm, struct sp_walk_data *sp_walk_data) { int ret = 0; struct vm_area_struct *vma; @@ -1767,7 +1767,7 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, * In this situation, the correctness of the parameters is mainly * guaranteed by the caller. */ - vma = find_vma(tsk->mm, uva); + vma = find_vma(mm, uva); if (!vma) { if (printk_ratelimit()) pr_err("share pool: u2k input uva %pK is invalid\n", (void *)uva); @@ -1811,7 +1811,7 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, } sp_walk_data->pages = pages;
- sp_walk.mm = tsk->mm; + sp_walk.mm = mm; sp_walk.private = sp_walk_data;
ret = walk_page_range(uva_aligned, uva_aligned + size_aligned, @@ -1835,6 +1835,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) { int ret = 0; struct task_struct *tsk; + struct mm_struct *mm; void *p = ERR_PTR(-ENODEV); struct sp_walk_data sp_walk_data = { .page_count = 0, @@ -1853,14 +1854,15 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) goto out; }
- if (!mmget_not_zero(tsk->mm)) + mm = get_task_mm(tsk); + if (mm == NULL) goto out_put_task; - down_write(&tsk->mm->mmap_sem); - ret = __sp_walk_page_range(uva, size, tsk, &sp_walk_data); + down_write(&mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data); if (ret) { pr_err("share pool: walk page range failed, ret %d\n", ret); - up_write(&tsk->mm->mmap_sem); - mmput(tsk->mm); + up_write(&mm->mmap_sem); + mmput(mm); p = ERR_PTR(ret); goto out_put_task; } @@ -1871,8 +1873,8 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) else p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, PAGE_KERNEL); - up_write(&tsk->mm->mmap_sem); - mmput(tsk->mm); + up_write(&mm->mmap_sem); + mmput(mm);
if (!p) { if (printk_ratelimit()) @@ -2145,6 +2147,7 @@ EXPORT_SYMBOL_GPL(sp_unshare); int sp_walk_page_range(unsigned long uva, unsigned long size, struct task_struct *tsk, struct sp_walk_data *sp_walk_data) { + struct mm_struct *mm; int ret = 0;
if (unlikely(!sp_walk_data)) { @@ -2155,17 +2158,19 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, if (!tsk || (tsk->flags & PF_EXITING)) return -ESRCH;
- sp_walk_data->page_count = 0; - get_task_struct(tsk); - if (!mmget_not_zero(tsk->mm)) { + mm = get_task_mm(tsk); + if (!mm) { put_task_struct(tsk); return -ESRCH; } - down_write(&tsk->mm->mmap_sem); - ret = __sp_walk_page_range(uva, size, tsk, sp_walk_data); - up_write(&tsk->mm->mmap_sem); - mmput(tsk->mm); + + sp_walk_data->page_count = 0; + down_write(&mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); + up_write(&mm->mmap_sem); + + mmput(mm); put_task_struct(tsk);
return ret;
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- When memory is insufficient or fragmentation is severe, 2MB hugepage allocation will perform direct reclaim and compaction.
Direct reclaim and compaction may take a long time. As a result, the sp mutex can be held for so long that hung-task warnings are triggered. In this case, set the PF_MEMALLOC flag to prevent direct reclaim and compaction from being executed.
Since direct compaction is not allowed during hugepage allocation, a 2MB hugepage request may fail.
During sp_alloc(), if the 2MB hugepage cannot be allocated or the total free memory is less than 1/3 of total memory, a work item is queued to compact memory asynchronously.
During sp_free(), if the total free memory is less than 1/3 of total memory, memory compaction is triggered as well.
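The two mechanisms above, in a compact sketch (names taken from the patch; this is illustrative, not the full sp_alloc() path):

static void example_populate_hugepage(struct mm_struct *mm, unsigned long addr,
                                      unsigned long populate)
{
        unsigned int noreclaim_flag;
        int ret;

        noreclaim_flag = memalloc_noreclaim_save();     /* sets PF_MEMALLOC */
        ret = do_mm_populate(mm, addr, populate, 0);
        memalloc_noreclaim_restore(noreclaim_flag);

        if (ret)
                sp_add_work_compact();  /* compact asynchronously instead */
}

static void example_try_to_compact(void)
{
        unsigned long totalram = totalram_pages;
        unsigned long freeram = global_zone_page_state(NR_FREE_PAGES);

        /* freeram * 3 > totalram, i.e. free >= total / 3: nothing to do */
        if ((freeram + (freeram << 1)) > totalram)
                return;

        sp_add_work_compact();
}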
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index fb492679473c0..df859f066a973 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -43,6 +43,7 @@ #include <linux/seq_file.h> #include <linux/rmap.h> #include <linux/hugetlb.h> +#include <linux/compaction.h>
/* access control mode macros */ #define AC_NONE 0 @@ -1036,6 +1037,45 @@ void sp_area_drop(struct vm_area_struct *vma) spin_unlock(&sp_area_lock); }
+static unsigned long last_jiffies; +static void sp_compact_nodes(struct work_struct *work) +{ + sysctl_compaction_handler(NULL, 1, NULL, NULL, NULL); + + kfree(work); +} + +static void sp_add_work_compact(void) +{ + struct work_struct *compact_work; + + if (!time_after(jiffies, last_jiffies + 10 * HZ)) + return; + + compact_work = kzalloc(sizeof(*compact_work), GFP_KERNEL); + if (!compact_work) + return; + + last_jiffies = jiffies; + INIT_WORK(compact_work, sp_compact_nodes); + schedule_work(compact_work); +} + +static void sp_try_to_compact(void) +{ + unsigned long totalram; + unsigned long freeram; + + totalram = totalram_pages; + freeram = global_zone_page_state(NR_FREE_PAGES); + + /* free < total / 3 */ + if ((freeram + (freeram << 1)) > totalram) + return; + + sp_add_work_compact(); +} + /* The caller must hold sp_mutex. */ static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size) @@ -1143,6 +1183,7 @@ int sp_free(unsigned long addr) out: mutex_unlock(&sp_mutex);
+ sp_try_to_compact(); return ret; } EXPORT_SYMBOL_GPL(sp_free); @@ -1194,6 +1235,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) int ret = 0; struct mm_struct *tmp; unsigned long mode, offset; + unsigned int noreclaim_flag;
/* mdc scene hack */ if (enable_mdc_default_group) @@ -1306,6 +1348,21 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) /* clean PTE_RDONLY flags or trigger SMMU event */ vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); up_write(&mm->mmap_sem); + + /* + * The direct reclaim and compact may take a long + * time. As a result, sp mutex will be hold for too + * long time to casue the hung task problem. In this + * case, set the PF_MEMALLOC flag to prevent the + * direct reclaim and compact from being executed. + * Since direct reclaim and compact are not performed + * when the fragmentation is severe or the memory is + * insufficient, 2MB continuous physical pages fail + * to be allocated. This situation is allowed. + */ + if (spa->is_hugepage) + noreclaim_flag = memalloc_noreclaim_save(); + /* * We are not ignoring errors, so if we fail to allocate * physical memory we just return failure, so we won't encounter @@ -1313,6 +1370,11 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) * depends on this feature (and MAP_LOCKED) to work correctly. */ ret = do_mm_populate(mm, sp_addr, populate, 0); + if (spa->is_hugepage) { + memalloc_noreclaim_restore(noreclaim_flag); + if (ret) + sp_add_work_compact(); + } if (ret) { __sp_free(spg, sp_addr, size_aligned, list_next_entry(mm, sp_node)); @@ -1363,6 +1425,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) __sp_area_drop(spa);
sp_dump_stack(); + sp_try_to_compact(); return p; } EXPORT_SYMBOL_GPL(sp_alloc);
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
share_k2u_to_spg was designed to translate a kernel virtual address into a user-level address and share it with the whole group, but the function used an incorrect file to mmap the memory region, so it did not really work for the whole group and consumed more memory. Fix this by using the correct hugepage file and sharing the region with each task in the group.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 117 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 30 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index df859f066a973..2e55404b5bae7 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -267,7 +267,10 @@ static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp) }
static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, - struct sp_area *spa, unsigned long *populate); + struct sp_area *spa, unsigned long *populate); +static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm);
static void free_sp_group_id(unsigned int spg_id) { @@ -596,6 +599,15 @@ int sp_group_add_task(int pid, int spg_id) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
+ if (spa->type == SPA_TYPE_K2SPG && spa->kva) { + addr = sp_remap_kva_to_vma(spa->kva, spa, mm); + if (IS_ERR_VALUE(addr)) + pr_warn("share pool: task add group remap k2u failed, ret %ld\n", addr); + + spin_lock(&sp_area_lock); + continue; + } + down_write(&mm->mmap_sem); addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(addr)) { @@ -611,9 +623,11 @@ int sp_group_add_task(int pid, int spg_id) if (populate) { ret = do_mm_populate(mm, spa->va_start, populate, 0); if (ret) { - if (printk_ratelimit()) + if (printk_ratelimit()) { pr_warn("share pool: task add group failed when mm populate " - "failed (potential no enough memory): %d\n", ret); + "failed (potential no enough memory): %d " + "spa flag is %d\n", ret, spa->type); + } sp_munmap_task_areas(mm, spa->link.next); spin_lock(&sp_area_lock); break; @@ -1480,12 +1494,16 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; unsigned long addr, buf, offset;
- if (spa->is_hugepage) { - file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, hsize_log); - if (IS_ERR(file)) { - pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); - return PTR_ERR(file); + if (spg_valid(spa->spg)) { + file = spa_file(spa); + } else { + if (spa->is_hugepage) { + file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(file)) { + pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } } }
@@ -1510,7 +1528,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { do_munmap(mm, ret_addr, spa_size(spa), NULL); - pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); + pr_err("share pool: remap vmalloc hugepage failed, " + "ret %d, kva is %lx\n", ret, kva); ret_addr = ret; goto put_mm; } @@ -1538,7 +1557,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, up_write(&mm->mmap_sem); mmput(mm); put_file: - if (file) + if (!spa->spg && file) fput(file);
return ret_addr; @@ -1615,6 +1634,35 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, return p; }
+static bool vmalloc_area_set_flag(struct sp_area *spa, unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + area = find_vm_area((void *)kva); + if (area) { + area->flags |= flags; + spa->kva = kva; + return true; + } + + return false; +} + +static bool vmalloc_area_clr_flag(struct sp_area *spa, unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + spa->kva = 0; + + area = find_vm_area((void *)kva); + if (area) { + area->flags &= ~flags; + return true; + } + + return false; +} + /** * Share kernel memory to a specified process or sp_group * @kva: the VA of shared kernel memory @@ -1638,7 +1686,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long size_aligned; unsigned int page_size = PAGE_SIZE; int ret; - struct vm_struct *area;
if (sp_flags & ~SP_DVPP) { if (printk_ratelimit()) @@ -1679,8 +1726,13 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, PTR_ERR(spa)); return spa; } + + if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + goto out; + } + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); - mutex_unlock(&sp_mutex); } else if (spg_valid(spg)) { /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { @@ -1699,26 +1751,31 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, return spa; }
+ if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + goto out; + } + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); - mutex_unlock(&sp_mutex); } else { mutex_unlock(&sp_mutex); pr_err("share pool: failed to make k2u\n"); return NULL; }
- if (!IS_ERR(uva)) + if (!IS_ERR(uva)) { uva = uva + (kva - kva_aligned); + } else { + /* associate vma and spa */ + if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) + pr_warn("share pool: %s: the kva %ld is not valid \n", + __func__, kva_aligned); + }
+out: __sp_area_drop(spa); + mutex_unlock(&sp_mutex);
- if (!IS_ERR(uva)) { - /* associate vma and spa */ - area = find_vm_area((void *)kva); - if (area) - area->flags |= VM_SHAREPOOL; - spa->kva = kva; - } sp_dump_stack();
return uva; @@ -1990,7 +2047,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; - struct vm_struct *area;
mutex_lock(&sp_mutex); /* @@ -2061,7 +2117,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to task), " "target process mm is exiting\n"); - goto out_drop_area; + goto out_clr_flag; }
if (spa->mm != mm) { @@ -2095,7 +2151,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to group), " "spa doesn't belong to a sp group or group is dead\n"); - goto out_drop_area; + goto out_clr_flag; }
/* alway allow kthread and dvpp channel destroy procedure */ @@ -2112,11 +2168,12 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
sp_dump_stack();
-out_drop_area: +out_clr_flag: /* deassociate vma and spa */ - area = find_vm_area((void *)spa->kva); - if (area) - area->flags &= ~VM_SHAREPOOL; + if (!vmalloc_area_clr_flag(spa, spa->kva, VM_SHAREPOOL)) + pr_warn("share pool: %s: the spa->kva %ld is not valid\n", __func__, spa->kva); + +out_drop_area: __sp_area_drop(spa); out_unlock: mutex_unlock(&sp_mutex); @@ -2162,7 +2219,7 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) if (page) put_page(page); else - pr_err("share pool: vmalloc %pK to page/hugepage failed\n", + pr_warn("share pool: vmalloc %pK to page/hugepage failed\n", (void *)addr); }
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The uva from k2u is currently shared with the whole group, which is useless for the application and wastes cycles for the API caller. Disable it by default for performance, and only enable it when users really need it in the future.
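As a usage note, group-wide k2u sharing can still be requested explicitly by booting with enable_sp_share_k2u_spg on the kernel command line, which sets enable_share_k2u_spg through the __setup() handler added below.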
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 2e55404b5bae7..93b532c21c142 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -59,6 +59,9 @@ int enable_mdc_default_group; static const int mdc_default_group_id = 1;
+/* share the uva to the whole group */ +int enable_share_k2u_spg; + /* access control mode */ int sysctl_ac_mode = AC_NONE; /* debug mode */ @@ -1741,7 +1744,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, pr_err("share pool: k2spg invalid spg id %d\n", spg_id); return ERR_PTR(-EINVAL); } - spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + + if (enable_share_k2u_spg) + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + else + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + if (IS_ERR(spa)) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) @@ -1756,7 +1764,10 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, goto out; }
- uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + if (spa->spg) + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + else + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); } else { mutex_unlock(&sp_mutex); pr_err("share pool: failed to make k2u\n"); @@ -2375,6 +2386,13 @@ static int __init mdc_default_group(char *s) } __setup("enable_mdc_default_group", mdc_default_group);
+static int __init enable_share_k2u_to_group(char *s) +{ + enable_share_k2u_spg = 1; + return 1; +} +__setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group); + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- The reason for exporting buff_vzalloc_user() is that the gfp flag __GFP_ACCOUNT can be used to limit memory usage with the memory cgroup.
The same reason applies to buff_vzalloc_hugepage_user(), its hugepage version.
By selecting HAVE_ARCH_HUGE_VMALLOC and enabling the boot arg enable_share_pool, buff_vzalloc_user() and vmalloc_hugepage_user() can allocate hugepage memory. Also, vmalloc() will allocate hugepage memory if possible. Reference: https://lwn.net/Articles/839107/
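A driver-side sketch of how buff_vzalloc_user() might be consumed (illustrative only; the function names, the 4 MB size and the mmap handler are assumptions, and module boilerplate is omitted):

static void *example_buf;

static int example_mmap(struct file *file, struct vm_area_struct *vma)
{
        /* the buffer is VM_USERMAP, so it can be handed to userspace */
        return remap_vmalloc_range(vma, example_buf, 0);
}

static int example_init(void)
{
        /* zeroed, and charged to the caller's memory cgroup via __GFP_ACCOUNT */
        example_buf = buff_vzalloc_user(4 * 1024 * 1024);
        if (!example_buf)
                return -ENOMEM;
        return 0;
}

static void example_exit(void)
{
        vfree(example_buf);
}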
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/Kconfig | 4 ++ arch/arm64/Kconfig | 2 + include/linux/share_pool.h | 29 ++++++++++++ include/linux/vmalloc.h | 3 +- mm/share_pool.c | 91 ++++++++++++++++++++++++++++++++++++++ mm/vmalloc.c | 52 +--------------------- 6 files changed, 129 insertions(+), 52 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig index 00f55932ba781..e877b083238de 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -562,6 +562,10 @@ config HAVE_ARCH_HUGE_VMAP config HAVE_ARCH_HUGE_VMALLOC depends on HAVE_ARCH_HUGE_VMAP bool + help + Archs that select this would be capable of PMD-sized vmaps (i.e., + arch_vmap_pmd_supported() returns true), and they must make no + assumptions that vmalloc memory is mapped with PAGE_SIZE ptes.
config HAVE_ARCH_SOFT_DIRTY bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b0b19554359fa..f2106f2cb6edf 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -105,6 +105,7 @@ config ARM64 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HUGE_VMAP + select HAVE_ARCH_HUGE_VMALLOC select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB @@ -1466,6 +1467,7 @@ config ASCEND_SHARE_POOL default n select ARCH_USES_HIGH_VMA_FLAGS select MM_OWNER + depends on HAVE_ARCH_HUGE_VMALLOC help This feature allows multiple processes to share virtual memory both in kernel and user level, which is only enabled for ascend platform. diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 933b77be8ff84..3c5a41ae5bd14 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -38,6 +38,10 @@ extern int sysctl_sp_debug_mode;
extern int enable_ascend_share_pool;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +extern bool vmap_allow_huge; +#endif + /* Processes in the same sp_group can share memory. * Memory layout for share pool: * @@ -223,6 +227,11 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, unsigned int flags);
+extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); +extern void *buff_vzalloc_user(unsigned long size); +extern void *buff_vzalloc_hugepage_user(unsigned long size); + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -365,6 +374,26 @@ static inline bool sp_mmap_check(unsigned long flags) static inline void sp_dump_stack(void) { } + +static inline void *vmalloc_hugepage(unsigned long size) +{ + return NULL; +} + +static inline void *vmalloc_hugepage_user(unsigned long size) +{ + return NULL; +} + +static inline void *buff_vzalloc_user(unsigned long size) +{ + return NULL; +} + +static inline void *buff_vzalloc_hugepage_user(unsigned long size) +{ + return NULL; +} #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6383d6989c0fc..bb814f6418fd9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -99,8 +99,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -extern void *vmalloc_hugepage(unsigned long size); -extern void *vmalloc_hugepage_user(unsigned long size); + #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, diff --git a/mm/share_pool.c b/mm/share_pool.c index 93b532c21c142..dad82727ee896 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2674,11 +2674,102 @@ struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, return alloc_pages_node(node, mask, page_order); }
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL, + VM_HUGE_PAGES, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + +/** + * buff_vzalloc_user - allocate zeroed virtually contiguous memory + * for userspace + * @size: allocation size + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Compare to vmalloc_user(), this is a customized function because + * __GFP_ACCOUNT is used to limit memory usage. + */ +void *buff_vzalloc_user(unsigned long size) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(buff_vzalloc_user); + +/** + * buff_vzalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + * + * Compare to vmalloc_hugepage_user(), this is a customized function because + * __GFP_ACCOUNT is used to limit memory usage. + */ +void *buff_vzalloc_hugepage_user(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, + VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(buff_vzalloc_hugepage_user); + int enable_ascend_share_pool;
static int __init enable_share_pool(char *s) { enable_ascend_share_pool = 1; + vmap_allow_huge = true;
pr_info("Ascend enable share pool features\n");
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2f39cc83a5a4d..813befec2bf07 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -43,7 +43,7 @@ #include "internal.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC -static bool __ro_after_init vmap_allow_huge = true; +bool __ro_after_init vmap_allow_huge;
static int __init set_nohugevmalloc(char *str) { @@ -52,7 +52,7 @@ static int __init set_nohugevmalloc(char *str) } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ -static const bool vmap_allow_huge = false; +static const bool vmap_allow_huge; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
struct vfree_deferred { @@ -2933,54 +2933,6 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
-/** - * vmalloc_hugepage - allocate virtually contiguous hugetlb memory - * @size: allocation size - * - * Allocate enough huge pages to cover @size and map them into - * contiguous kernel virtual space. - * - * The allocation size is aligned to PMD_SIZE automatically - */ -void *vmalloc_hugepage(unsigned long size) -{ - /* PMD hugepage aligned */ - size = PMD_ALIGN(size); - - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_hugepage); - -/** - * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory - * for userspace - * @size: allocation size - * - * Allocate enough huge pages to cover @size and map them into - * contiguous kernel virtual space. The resulting memory area - * is zeroed so it can be mapped to userspace without leaking data. - * - * The allocation size is aligned to PMD_SIZE automatically - */ -void *vmalloc_hugepage_user(unsigned long size) -{ - struct vm_struct *area; - void *ret; - - /* 2M hugepa aligned */ - size = PMD_ALIGN(size); - - ret = __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; -} -EXPORT_SYMBOL(vmalloc_hugepage_user); -
/* * small helper routine , copy contents to buf from addr.
From: Fang Lijun fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
--------------------------------------------------
The "area" is already removed and freed in __vmalloc_area_node() when it returns NULL, so we need not call free_vm_area() to remove and free this area again.
Fixes: 59a57a82fb2a ("mm/vmalloc: Hugepage vmalloc mappings") Signed-off-by: Fang Lijun fanglijun3@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmalloc.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 813befec2bf07..4b0970d6fc913 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2709,7 +2709,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
fail: if (shift > PAGE_SHIFT) { - free_vm_area(area); shift = PAGE_SHIFT; align = real_align; size = real_size;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- Statistical functions of share pool are scattered across several places and should be put together.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 86 ++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 41 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index dad82727ee896..eb5fe5a6d616e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2393,6 +2393,8 @@ static int __init enable_share_k2u_to_group(char *s) } __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
+/*** Statistical and maintenance functions ***/ + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2418,35 +2420,6 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, return 0; }
-static int idr_proc_stat_cb(int id, void *p, void *data) -{ - struct sp_group *spg; - struct sp_proc_stat *stat = p; - struct seq_file *seq = data; - - mutex_lock(&sp_mutex); - spg = __sp_find_spg(id, SPG_ID_DEFAULT); - if (spg_valid(spg)) { - seq_printf(seq, "%-12d %-10d %-18ld\n", - id, spg->id, byte2kb(stat->amount)); - } - mutex_unlock(&sp_mutex); - - return 0; -} - -static int proc_stat_show(struct seq_file *seq, void *offset) -{ - /* print the file header */ - seq_printf(seq, "%-12s %-10s %-18s\n", - "Process ID", "Group ID", "Aligned Apply(KB)"); - /* print kthread buff_module_guard_work */ - seq_printf(seq, "%-12s %-10s %-18ld\n", - "guard", "-", byte2kb(kthread_stat.amount)); - idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); - return 0; -} - static void rb_spa_stat_show(struct seq_file *seq) { struct rb_node *node; struct sp_area *spa; @@ -2565,6 +2538,49 @@ static int spa_stat_show(struct seq_file *seq, void *offset) return 0; }
+static int idr_proc_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg; + struct sp_proc_stat *stat = p; + struct seq_file *seq = data; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (spg_valid(spg)) { + seq_printf(seq, "%-12d %-10d %-18ld\n", + id, spg->id, byte2kb(stat->amount)); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int proc_stat_show(struct seq_file *seq, void *offset) +{ + /* print the file header */ + seq_printf(seq, "%-12s %-10s %-18s\n", + "Process ID", "Group ID", "Aligned Apply(KB)"); + /* print kthread buff_module_guard_work */ + seq_printf(seq, "%-12s %-10s %-18ld\n", + "guard", "-", byte2kb(kthread_stat.amount)); + idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + return 0; +} + +/* + * Called by proc_root_init() to initialize the /proc/sharepool subtree + */ +void __init proc_sharepool_init(void) +{ + if (!proc_mkdir("sharepool", NULL)) + return; + + proc_create_single_data("sharepool/proc_stat", 0400, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", 0400, NULL, spa_stat_show, NULL); +} + +/*** End of tatistical and maintenance functions ***/ + vm_fault_t sharepool_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -2653,18 +2669,6 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, } EXPORT_SYMBOL(sharepool_no_page);
-/* - * Called by proc_root_init() to initialize the /proc/sharepool subtree - */ -void __init proc_sharepool_init(void) -{ - if (!proc_mkdir("sharepool", NULL)) - return; - - proc_create_single_data("sharepool/proc_stat", S_IRUSR, NULL, proc_stat_show, NULL); - proc_create_single_data("sharepool/spa_stat", S_IRUSR, NULL, spa_stat_show, NULL); -} - struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- spg_overview() will show how much normal-page memory and hugepage memory is allocated for an sp_group.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 12 ++++++++---- mm/share_pool.c | 28 ++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 10 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 3c5a41ae5bd14..eb0358bb6633b 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -66,6 +66,8 @@ extern bool vmap_allow_huge; */ struct sp_group { int id; + /* record the number of hugepage allocation failures */ + int hugepage_failures; struct file *file; struct file *file_hugetlb; /* list head of processes */ @@ -76,12 +78,14 @@ struct sp_group { atomic_t spa_num; /* total size of all sp_area from sp_alloc and k2u(spg) */ atomic64_t size; - /* record the number of hugepage allocation failures */ - int hugepage_failures; - /* is_alive == false means it's being destroyed */ - bool is_alive; + /* total size of all sp_area from sp_alloc normal page */ + atomic64_t alloc_nsize; + /* total size of all sp_area from sp_alloc hugepage */ + atomic64_t alloc_hsize; /* we define the creator process of a sp_group as owner */ struct task_struct *owner; + /* is_alive == false means it's being destroyed */ + bool is_alive; /* dvpp_multi_spaces == true means multiple dvpp 16G spaces are set */ bool dvpp_multi_spaces; unsigned long dvpp_va_start; diff --git a/mm/share_pool.c b/mm/share_pool.c index eb5fe5a6d616e..fb0a8f1b54405 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -52,8 +52,8 @@ #define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) #define ESPGMMEXIT 4000
-#define byte2kb(size) ((size) / 1024) -#define byte2mb(size) ((size) / 1024 / 1024) +#define byte2kb(size) ((size) >> 10) +#define byte2mb(size) ((size) >> 20)
/* mdc scene hack */ int enable_mdc_default_group; @@ -366,6 +366,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) spg->id = spg_id; atomic_set(&spg->spa_num, 0); atomic64_set(&spg->size, 0); + atomic64_set(&spg->alloc_nsize, 0); + atomic64_set(&spg->alloc_hsize, 0); spg->is_alive = true; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; @@ -925,6 +927,12 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, if (spa->spg) { atomic_inc(&spg->spa_num); atomic64_add(size, &spg->size); + if (type == SPA_TYPE_ALLOC) { + if (spa->is_hugepage) + atomic64_add(size, &spg->alloc_hsize); + else + atomic64_add(size, &spg->alloc_nsize); + } atomic_inc(&spg_stat.spa_total_num); atomic64_add(size, &spg_stat.spa_total_size); list_add_tail(&spa->link, &spg->spa_list); @@ -1003,6 +1011,12 @@ static void sp_free_area(struct sp_area *spa) if (spa->spg) { atomic_dec(&spa->spg->spa_num); atomic64_sub(spa->real_size, &spa->spg->size); + if (spa->type == SPA_TYPE_ALLOC) { + if (spa->is_hugepage) + atomic64_sub(spa->real_size, &spa->spg->alloc_hsize); + else + atomic64_sub(spa->real_size, &spa->spg->alloc_nsize); + } atomic_dec(&spg_stat.spa_total_num); atomic64_sub(spa->real_size, &spg_stat.spa_total_size); list_del(&spa->link); @@ -2509,16 +2523,18 @@ static int idr_spg_stat_cb(int id, void *p, void *data) struct sp_group *spg = p; struct seq_file *seq = data;
- seq_printf(seq, "Group %-10d size: %13ld KB, spa num: %d.\n", - id, byte2kb(atomic64_read(&spg->size)), - atomic_read(&spg->spa_num)); + seq_printf(seq, "Group %6d size: %ld KB, spa num: %d, normal alloc: %ld KB, " + "huge alloc: %ld KB\n", + id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num), + byte2kb(atomic64_read(&spg->alloc_nsize)), + byte2kb(atomic64_read(&spg->alloc_hsize)));
return 0; }
static void spg_overview_show(struct seq_file *seq) { - seq_printf(seq, "Share pool total size: %13ld KB, spa total num: %d.\n", + seq_printf(seq, "Share pool total size: %ld KB, spa total num: %d.\n", byte2kb(atomic64_read(&spg_stat.spa_total_size)), atomic_read(&spg_stat.spa_total_num)); mutex_lock(&sp_mutex);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- Some new fields will be shown in /proc/sharepool/proc_stat, including: 1. VIRT, the virtual memory amount. 2. RES, resident memory size. 3. Shm, resident shared memory size, which is a part of RES. 4. Non-SP_RES, resident memory size excluding share pool memory. 5. Non-SP_Shm, resident shared memory size excluding share pool memory.
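As an illustrative example of the arithmetic (numbers made up): a task with RES = 1024 KB, Shm = 512 KB and 256 KB of normal-page sp_alloc memory in its group reports Non-SP_RES = 1024 - 256 = 768 KB and Non-SP_Shm = 512 - 256 = 256 KB; both columns subtract the group's normal-page sp_alloc size, and Non-SP_Shm is clamped at 0 if the subtraction goes negative.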
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 72 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 21 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index fb0a8f1b54405..eb77bc882b19b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -54,6 +54,7 @@
#define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) +#define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10))
/* mdc scene hack */ int enable_mdc_default_group; @@ -83,12 +84,13 @@ static DEFINE_IDR(sp_stat_idr);
/* per process memory usage statistics indexed by tgid */ struct sp_proc_stat { + struct mm_struct *mm; char comm[TASK_COMM_LEN]; /* * alloc amount minus free amount, may be negative when freed by * another task in the same sp group. */ - long amount; + long alloc_size; };
/* for kthread buff_module_guard_work */ @@ -100,7 +102,8 @@ static struct sp_proc_stat kthread_stat = {0}; */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { struct sp_proc_stat *stat; - int id = tsk->mm->sp_stat_id; + struct mm_struct *mm = tsk->mm; + int id = mm->sp_stat_id; int tgid = tsk->tgid; int ret;
@@ -118,7 +121,8 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { return ERR_PTR(-ENOMEM); }
- stat->amount = 0; + stat->alloc_size = 0; + stat->mm = mm; get_task_comm(stat->comm, tsk); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); if (ret < 0) { @@ -128,7 +132,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { return ERR_PTR(ret); }
- tsk->mm->sp_stat_id = ret; + mm->sp_stat_id = ret; return stat; }
@@ -746,11 +750,11 @@ void sp_group_post_exit(struct mm_struct *mm) * * We decide to print a info when seeing both of the scenarios. */ - if (stat && stat->amount != 0) + if (stat && stat->alloc_size != 0) pr_info("share pool: process %s(%d) of sp group %d exits. " "It applied %ld aligned KB\n", stat->comm, mm->sp_stat_id, - mm->sp_group->id, byte2kb(stat->amount)); + mm->sp_group->id, byte2kb(stat->alloc_size));
idr_remove(&sp_stat_idr, mm->sp_stat_id);
@@ -1200,11 +1204,11 @@ int sp_free(unsigned long addr)
/* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { - kthread_stat.amount -= spa->real_size; + kthread_stat.alloc_size -= spa->real_size; } else { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->amount -= spa->real_size; + stat->alloc_size -= spa->real_size; else BUG(); } @@ -1445,7 +1449,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (!IS_ERR(p)) { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->amount += size_aligned; + stat->alloc_size += size_aligned; }
out: @@ -2424,10 +2428,10 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, mutex_unlock(&sp_mutex); return 0; } - seq_printf(m, "%-10s %-18s %-15s\n", - "Group ID", "Aligned Apply(KB)", "HugePage Fails"); - seq_printf(m, "%-10d %-18ld %-15d\n", - spg->id, byte2kb(stat->amount), spg->hugepage_failures); + seq_printf(m, "%-8s %-9s %-13s\n", + "Group_ID", "SP_ALLOC", "HugePage Fail"); + seq_printf(m, "%-8d %-9ld %-13d\n", + spg->id, byte2kb(stat->alloc_size), spg->hugepage_failures); } mutex_unlock(&sp_mutex);
@@ -2559,13 +2563,36 @@ static int idr_proc_stat_cb(int id, void *p, void *data) struct sp_group *spg; struct sp_proc_stat *stat = p; struct seq_file *seq = data; + struct mm_struct *mm = stat->mm; + unsigned long anon, file, shmem, total_rss; + /* + * non_sp_res: resident memory size excluding share pool memory + * non_sp_shm: resident shared memory size size excluding share pool + * memory + */ + long sp_alloc_nsize, non_sp_res, non_sp_shm;
mutex_lock(&sp_mutex); spg = __sp_find_spg(id, SPG_ID_DEFAULT); - if (spg_valid(spg)) { - seq_printf(seq, "%-12d %-10d %-18ld\n", - id, spg->id, byte2kb(stat->amount)); - } + if (!spg_valid(spg) || !mmget_not_zero(mm)) + goto out_unlock; + + sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + total_rss = anon + file + shmem; + non_sp_res = page2kb(total_rss) - sp_alloc_nsize; + non_sp_shm = page2kb(shmem) - sp_alloc_nsize; + non_sp_shm = non_sp_shm < 0 ? 0 : non_sp_shm; /* to be investigated */ + + seq_printf(seq, "%-8d %-8d %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", + id, spg->id, byte2kb(stat->alloc_size), non_sp_res, + page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), + non_sp_shm); + mmput(mm); + +out_unlock: mutex_unlock(&sp_mutex);
return 0; @@ -2573,12 +2600,15 @@ static int idr_proc_stat_cb(int id, void *p, void *data)
static int proc_stat_show(struct seq_file *seq, void *offset) { + spg_overview_show(seq); + spa_overview_show(seq); /* print the file header */ - seq_printf(seq, "%-12s %-10s %-18s\n", - "Process ID", "Group ID", "Aligned Apply(KB)"); + seq_printf(seq, "%-8s %-8s %-9s %-10s %-8s %-7s %-7s %-10s\n", + "PID", "Group_ID", "SP_ALLOC", "Non-SP_RES", "VIRT", "RES", + "Shm", "Non-SP_Shm"); /* print kthread buff_module_guard_work */ - seq_printf(seq, "%-12s %-10s %-18ld\n", - "guard", "-", byte2kb(kthread_stat.amount)); + seq_printf(seq, "%-8s %-8s %-9ld\n", + "guard", "-", byte2kb(kthread_stat.alloc_size)); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- 1. Addresses can't be printed because of security concerns. 2. Some small printing fixes.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index eb77bc882b19b..85d631677d76c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -635,7 +635,7 @@ int sp_group_add_task(int pid, int spg_id) if (printk_ratelimit()) { pr_warn("share pool: task add group failed when mm populate " "failed (potential no enough memory): %d " - "spa flag is %d\n", ret, spa->type); + "spa type is %d\n", ret, spa->type); } sp_munmap_task_areas(mm, spa->link.next); spin_lock(&sp_area_lock); @@ -1516,8 +1516,10 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, unsigned long addr, buf, offset;
if (spg_valid(spa->spg)) { + /* k2u to group */ file = spa_file(spa); } else { + /* k2u to task */ if (spa->is_hugepage) { file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, hsize_log); @@ -1550,7 +1552,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (ret) { do_munmap(mm, ret_addr, spa_size(spa), NULL); pr_err("share pool: remap vmalloc hugepage failed, " - "ret %d, kva is %lx\n", ret, kva); + "ret %d, kva is %pK\n", ret, (void *)kva); ret_addr = ret; goto put_mm; } @@ -1721,7 +1723,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else if (ret == 0) { /* do nothing */ } else { - pr_err("it is not vmalloc address\n"); + pr_err("share pool: k2u kva not vmalloc address\n"); return ERR_PTR(ret); } /* aligned down kva is convenient for caller to start with any valid kva */ @@ -1749,7 +1751,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { - pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); goto out; }
@@ -1778,7 +1780,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { - pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); goto out; }
@@ -1797,8 +1799,8 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) - pr_warn("share pool: %s: the kva %ld is not valid \n", - __func__, kva_aligned); + pr_warn("share pool: %s: the kva %pK is not valid\n", + __func__, (void *)kva_aligned); }
out:
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- To collect maintenance information of k2u more conveniently in the future, we should leverage the input parameter pid. 1. Check the sanity of pid in sp_make_share_k2u(), then get the mm pointer via get_task_mm(). 2. The input parameter pid of sp_make_share_kva_to_task() is replaced by mm.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 110 +++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 53 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 85d631677d76c..745ca087a12ff 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1586,45 +1586,27 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, return ret_addr; }
+/** + * Share kernel memory to a specified task + * @kva: the VA of shared kernel memory + * @spa: the sp area associated with the shared user address + * @mm: mm_struct of target task + * + * Return: the shared user address to start at + */ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, - int pid) + struct mm_struct *mm) { - struct task_struct *tsk; unsigned long ret_addr; - void *p = ERR_PTR(-ENODEV); - int ret = 0; - - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (!tsk || (tsk->flags & PF_EXITING)) - ret = -ESRCH; - else - get_task_struct(tsk); - - rcu_read_unlock(); - if (ret) - return ERR_PTR(ret);
- ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm); + ret_addr = sp_remap_kva_to_vma(kva, spa, mm); if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr); - p = ERR_PTR(ret_addr); - goto out; + return ERR_PTR(ret_addr); }
- p = (void *)ret_addr; - - task_lock(tsk); - if (tsk->mm == NULL) { - sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); - p = ERR_PTR(-ESRCH); - } else { - spa->mm = tsk->mm; - } - task_unlock(tsk); -out: - put_task_struct(tsk); - return p; + spa->mm = mm; + return (void *)ret_addr; }
static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, @@ -1708,7 +1690,9 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long kva_aligned; unsigned long size_aligned; unsigned int page_size = PAGE_SIZE; - int ret; + struct task_struct *tsk; + struct mm_struct *mm; + int ret = 0, is_hugepage;
if (sp_flags & ~SP_DVPP) { if (printk_ratelimit()) @@ -1716,53 +1700,71 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, return ERR_PTR(-EINVAL); }
- ret = is_vmap_hugepage(kva); - if (ret > 0) { + is_hugepage = is_vmap_hugepage(kva); + if (is_hugepage > 0) { sp_flags |= SP_HUGEPAGE; page_size = PMD_SIZE; - } else if (ret == 0) { + } else if (is_hugepage == 0) { /* do nothing */ } else { pr_err("share pool: k2u kva not vmalloc address\n"); - return ERR_PTR(ret); + return ERR_PTR(is_hugepage); } + /* aligned down kva is convenient for caller to start with any valid kva */ kva_aligned = ALIGN_DOWN(kva, page_size); size_aligned = ALIGN(kva + size, page_size) - kva_aligned;
+ rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + return ERR_PTR(ret); + + mm = get_task_mm(tsk); + if (mm == NULL) { + uva = ERR_PTR(-ESRCH); + goto out_put_task; + } + mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { /* k2u to task */ if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { - mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); - return ERR_PTR(-EINVAL); + uva = ERR_PTR(-EINVAL); + goto out_unlock; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { - mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2u(task) failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); - return spa; + uva = spa; + goto out_unlock; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); - goto out; + goto out_drop_spa; }
- uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); + uva = sp_make_share_kva_to_task(kva_aligned, spa, mm); } else if (spg_valid(spg)) { /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { - mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); - return ERR_PTR(-EINVAL); + uva = ERR_PTR(-EINVAL); + goto out_unlock; }
if (enable_share_k2u_spg) @@ -1771,27 +1773,26 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK);
if (IS_ERR(spa)) { - mutex_unlock(&sp_mutex); if (printk_ratelimit()) pr_err("share pool: k2u(spg) failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); - return spa; + uva = spa; + goto out_unlock; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); - goto out; + goto out_drop_spa; }
if (spa->spg) uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); else - uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); + uva = sp_make_share_kva_to_task(kva_aligned, spa, mm); } else { - mutex_unlock(&sp_mutex); - pr_err("share pool: failed to make k2u\n"); - return NULL; + /* group is dead, return -ENODEV */ + pr_err("share pool: failed to make k2u, sp group is dead\n"); }
if (!IS_ERR(uva)) { @@ -1803,12 +1804,15 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, __func__, (void *)kva_aligned); }
-out: +out_drop_spa: __sp_area_drop(spa); +out_unlock: mutex_unlock(&sp_mutex); + mmput(mm); +out_put_task: + put_task_struct(tsk);
sp_dump_stack(); - return uva; } EXPORT_SYMBOL_GPL(sp_make_share_k2u);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- Add a new column SP_K2U to /proc/sharepool/proc_stat, which shows the k2u size of a task. If a task is the target of k2u(to task) but was never added to an sp group, its sp group id is shown as '-'.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 77 +++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 61 insertions(+), 16 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 745ca087a12ff..acae8d1d7befb 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -91,6 +91,7 @@ struct sp_proc_stat { * another task in the same sp group. */ long alloc_size; + long k2u_size; };
/* for kthread buff_module_guard_work */ @@ -121,7 +122,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { return ERR_PTR(-ENOMEM); }
- stat->alloc_size = 0; + stat->alloc_size = stat->k2u_size = 0; stat->mm = mm; get_task_comm(stat->comm, tsk); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); @@ -750,11 +751,13 @@ void sp_group_post_exit(struct mm_struct *mm) * * We decide to print a info when seeing both of the scenarios. */ - if (stat && stat->alloc_size != 0) + if (stat && (stat->alloc_size != 0 || stat->k2u_size != 0)) pr_info("share pool: process %s(%d) of sp group %d exits. " - "It applied %ld aligned KB\n", + "It applied %ld aligned KB, k2u shared %ld aligned " + "KB\n", stat->comm, mm->sp_stat_id, - mm->sp_group->id, byte2kb(stat->alloc_size)); + mm->sp_group->id, byte2kb(stat->alloc_size), + byte2kb(stat->k2u_size));
idr_remove(&sp_stat_idr, mm->sp_stat_id);
@@ -1692,6 +1695,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned int page_size = PAGE_SIZE; struct task_struct *tsk; struct mm_struct *mm; + struct sp_proc_stat *stat; int ret = 0, is_hugepage;
if (sp_flags & ~SP_DVPP) { @@ -1733,6 +1737,18 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
mutex_lock(&sp_mutex); + /* + * Process statistics initialization. if the target process has been + * added to a sp group, then stat will be returned immediately. + * I believe there is no need to free stat in error handling branches. + */ + stat = sp_init_proc_stat(tsk); + if (IS_ERR(stat)) { + uva = stat; + pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + goto out_unlock; + } + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { /* k2u to task */ @@ -1797,6 +1813,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
if (!IS_ERR(uva)) { uva = uva + (kva - kva_aligned); + stat->k2u_size += size_aligned; } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) @@ -2082,6 +2099,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; + struct sp_proc_stat *stat;
mutex_lock(&sp_mutex); /* @@ -2202,6 +2220,16 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp }
sp_dump_stack(); + /* pointer stat may be invalid because of kthread buff_module_guard_work */ + if (current->mm == NULL) { + kthread_stat.k2u_size -= spa->real_size; + } else { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->k2u_size -= spa->real_size; + else + WARN(1, "share_pool: %s: null process stat\n", __func__); + }
out_clr_flag: /* deassociate vma and spa */ @@ -2566,6 +2594,7 @@ static int spa_stat_show(struct seq_file *seq, void *offset)
static int idr_proc_stat_cb(int id, void *p, void *data) { + int spg_id; struct sp_group *spg; struct sp_proc_stat *stat = p; struct seq_file *seq = data; @@ -2579,11 +2608,21 @@ static int idr_proc_stat_cb(int id, void *p, void *data) long sp_alloc_nsize, non_sp_res, non_sp_shm;
mutex_lock(&sp_mutex); - spg = __sp_find_spg(id, SPG_ID_DEFAULT); - if (!spg_valid(spg) || !mmget_not_zero(mm)) + if (!mmget_not_zero(mm)) goto out_unlock; + /* + * a task which is the target of k2u(to task) but without adding to a + * sp group should be handled correctly. + */ + spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (!spg_valid(spg)) { + spg_id = 0; + sp_alloc_nsize = 0; + } else { + spg_id = spg->id; + sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); + }
- sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES); shmem = get_mm_counter(mm, MM_SHMEMPAGES); @@ -2592,10 +2631,15 @@ static int idr_proc_stat_cb(int id, void *p, void *data) non_sp_shm = page2kb(shmem) - sp_alloc_nsize; non_sp_shm = non_sp_shm < 0 ? 0 : non_sp_shm; /* to be investigated */
- seq_printf(seq, "%-8d %-8d %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", - id, spg->id, byte2kb(stat->alloc_size), non_sp_res, - page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), - non_sp_shm); + seq_printf(seq, "%-8d ", id); + if (spg_id == 0) + seq_printf(seq, "%-8c ", '-'); + else + seq_printf(seq, "%-8d ", spg_id); + seq_printf(seq, "%-9ld %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", + byte2kb(stat->alloc_size), byte2kb(stat->k2u_size), + non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), + page2kb(shmem), non_sp_shm); mmput(mm);
out_unlock: @@ -2609,12 +2653,13 @@ static int proc_stat_show(struct seq_file *seq, void *offset) spg_overview_show(seq); spa_overview_show(seq); /* print the file header */ - seq_printf(seq, "%-8s %-8s %-9s %-10s %-8s %-7s %-7s %-10s\n", - "PID", "Group_ID", "SP_ALLOC", "Non-SP_RES", "VIRT", "RES", - "Shm", "Non-SP_Shm"); + seq_printf(seq, "%-8s %-8s %-9s %-9s %-10s %-8s %-7s %-7s %-10s\n", + "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "Non-SP_RES", + "VIRT", "RES", "Shm", "Non-SP_Shm"); /* print kthread buff_module_guard_work */ - seq_printf(seq, "%-8s %-8s %-9ld\n", - "guard", "-", byte2kb(kthread_stat.alloc_size)); + seq_printf(seq, "%-8s %-8s %-9ld %-9ld\n", + "guard", "-", byte2kb(kthread_stat.alloc_size), + byte2kb(kthread_stat.k2u_size)); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------------------------------------------- Add a new column SP_RES to /proc/sharepool/proc_stat, which shows the sp_alloc resident size of a task. Since it is accounted at sp_group level, it also includes sp_alloc memory applied by the other tasks in the same sp_group.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 2 ++ mm/share_pool.c | 24 ++++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index eb0358bb6633b..70b841d0eb8e5 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -82,6 +82,8 @@ struct sp_group { atomic64_t alloc_nsize; /* total size of all sp_area from sp_alloc hugepage */ atomic64_t alloc_hsize; + /* total size of all sp_area from ap_alloc */ + atomic64_t alloc_size; /* we define the creator process of a sp_group as owner */ struct task_struct *owner; /* is_alive == false means it's being destroyed */ diff --git a/mm/share_pool.c b/mm/share_pool.c index acae8d1d7befb..6e13cbd032219 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -373,6 +373,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) atomic64_set(&spg->size, 0); atomic64_set(&spg->alloc_nsize, 0); atomic64_set(&spg->alloc_hsize, 0); + atomic64_set(&spg->alloc_size, 0); spg->is_alive = true; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; @@ -939,6 +940,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, atomic64_add(size, &spg->alloc_hsize); else atomic64_add(size, &spg->alloc_nsize); + atomic64_add(size, &spg->alloc_size); } atomic_inc(&spg_stat.spa_total_num); atomic64_add(size, &spg_stat.spa_total_size); @@ -1023,6 +1025,7 @@ static void sp_free_area(struct sp_area *spa) atomic64_sub(spa->real_size, &spa->spg->alloc_hsize); else atomic64_sub(spa->real_size, &spa->spg->alloc_nsize); + atomic64_sub(spa->real_size, &spa->spg->alloc_size); } atomic_dec(&spg_stat.spa_total_num); atomic64_sub(spa->real_size, &spg_stat.spa_total_size); @@ -2561,9 +2564,10 @@ static int idr_spg_stat_cb(int id, void *p, void *data) struct sp_group *spg = p; struct seq_file *seq = data;
- seq_printf(seq, "Group %6d size: %ld KB, spa num: %d, normal alloc: %ld KB, " - "huge alloc: %ld KB\n", + seq_printf(seq, "Group %6d size: %ld KB, spa num: %d, total alloc: %ld KB, " + "normal alloc: %ld KB, huge alloc: %ld KB\n", id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num), + byte2kb(atomic64_read(&spg->alloc_size)), byte2kb(atomic64_read(&spg->alloc_nsize)), byte2kb(atomic64_read(&spg->alloc_hsize)));
@@ -2602,10 +2606,12 @@ static int idr_proc_stat_cb(int id, void *p, void *data) unsigned long anon, file, shmem, total_rss; /* * non_sp_res: resident memory size excluding share pool memory + * sp_res: resident memory size of share pool, including normal + * page and hugepage memory * non_sp_shm: resident shared memory size size excluding share pool * memory */ - long sp_alloc_nsize, non_sp_res, non_sp_shm; + long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
mutex_lock(&sp_mutex); if (!mmget_not_zero(mm)) @@ -2618,9 +2624,11 @@ static int idr_proc_stat_cb(int id, void *p, void *data) if (!spg_valid(spg)) { spg_id = 0; sp_alloc_nsize = 0; + sp_res = 0; } else { spg_id = spg->id; sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); + sp_res = byte2kb(atomic64_read(&spg->alloc_size)); }
anon = get_mm_counter(mm, MM_ANONPAGES); @@ -2636,8 +2644,8 @@ static int idr_proc_stat_cb(int id, void *p, void *data) seq_printf(seq, "%-8c ", '-'); else seq_printf(seq, "%-8d ", spg_id); - seq_printf(seq, "%-9ld %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", - byte2kb(stat->alloc_size), byte2kb(stat->k2u_size), + seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", + byte2kb(stat->alloc_size), byte2kb(stat->k2u_size), sp_res, non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); mmput(mm); @@ -2653,9 +2661,9 @@ static int proc_stat_show(struct seq_file *seq, void *offset) spg_overview_show(seq); spa_overview_show(seq); /* print the file header */ - seq_printf(seq, "%-8s %-8s %-9s %-9s %-10s %-8s %-7s %-7s %-10s\n", - "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "Non-SP_RES", - "VIRT", "RES", "Shm", "Non-SP_Shm"); + seq_printf(seq, "%-8s %-8s %-9s %-9s %-9s %-10s %-8s %-7s %-7s %-10s\n", + "PID", "Group_ID", "SP_ALLOC", "SP_K2U", "SP_RES", + "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm"); /* print kthread buff_module_guard_work */ seq_printf(seq, "%-8s %-8s %-9ld %-9ld\n", "guard", "-", byte2kb(kthread_stat.alloc_size),
From: Fang Lijun fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
--------------------------------------------------
arch_vmap_p4d_supported(), arch_vmap_pud_supported() and arch_vmap_pmd_supported() are redefined when CONFIG_HAVE_ARCH_HUGE_VMAP is disabled on x86.
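For context, the clash comes from the generic header providing static inline stubs when CONFIG_HAVE_ARCH_HUGE_VMAP is off, so the x86 definitions in ioremap.c must be guarded. A sketch of the header-side arrangement, assuming this backport mirrors the upstream layout (the exact header location may differ):

    /* include/linux/vmalloc.h (sketch) */
    #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
    bool arch_vmap_p4d_supported(pgprot_t prot);
    bool arch_vmap_pud_supported(pgprot_t prot);
    bool arch_vmap_pmd_supported(pgprot_t prot);
    #else
    static inline bool arch_vmap_p4d_supported(pgprot_t prot) { return false; }
    static inline bool arch_vmap_pud_supported(pgprot_t prot) { return false; }
    static inline bool arch_vmap_pmd_supported(pgprot_t prot) { return false; }
    #endif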
Fixes: 396a313c1a6a ("mm: HUGE_VMAP arch support cleanup") Signed-off-by: Fang Lijun fanglijun3@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/mm/ioremap.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 944de9aaa0cd5..e788c58994150 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -431,6 +431,7 @@ void iounmap(volatile void __iomem *addr) } EXPORT_SYMBOL(iounmap);
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP bool arch_vmap_p4d_supported(pgprot_t prot) { return false; @@ -449,6 +450,7 @@ bool arch_vmap_pmd_supported(pgprot_t prot) { return boot_cpu_has(X86_FEATURE_PSE); } +#endif
/* * Convert a physical pointer to a virtual kernel pointer for /dev/mem
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------
When we add a task to a new group, we first create and initialize the group. The is_alive element of spg should not be set to true until the whole procedure has completed; otherwise, if any step after the initialization fails, we hit the BUG of freeing an active spg.
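A minimal sketch of the intended ordering; do_remaining_setup() is a hypothetical placeholder for the access checks and mapping work that follow the initialization:

    spg = find_or_alloc_sp_group(spg_id);   /* spg->is_alive stays false here */
    ret = do_remaining_setup(spg);          /* hypothetical: access checks, mappings, stats */
    if (ret) {
            /* dropping a not-yet-alive spg cannot trigger the BUG */
            __sp_group_drop_locked(spg);
            return ret;
    }
    spg->is_alive = true;                   /* only once everything has succeeded */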
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6e13cbd032219..f785b6ed41866 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -374,7 +374,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) atomic64_set(&spg->alloc_nsize, 0); atomic64_set(&spg->alloc_hsize, 0); atomic64_set(&spg->alloc_size, 0); - spg->is_alive = true; + spg->is_alive = false; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; spg->owner = current->group_leader; @@ -591,6 +591,8 @@ int sp_group_add_task(int pid, int spg_id) }
mm->sp_group = spg; + /* We reactive the spg even the spg exists already. */ + spg->is_alive = true; list_add_tail(&tsk->mm->sp_node, &spg->procs); /* * create mappings of existing shared memory segments into this
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------
The user could pass the pid of a daemon process when adding a task to a group. A daemon process has no mm_struct, so we should check for that before using it.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f785b6ed41866..eb15ad9a24e31 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -549,8 +549,6 @@ int sp_group_add_task(int pid, int spg_id) tsk = find_task_by_vpid(pid); if (!tsk || (tsk->flags & PF_EXITING)) ret = -ESRCH; - else if (tsk->mm->sp_group) /* if it's already in a sp_group */ - ret = -EEXIST; else get_task_struct(tsk);
@@ -560,6 +558,11 @@ int sp_group_add_task(int pid, int spg_id) goto out_unlock; }
+ if (!tsk->mm || tsk->mm->sp_group) { /* if it's already in a sp_group */ + ret = -EEXIST; + goto out_unlock; + } + spg = find_or_alloc_sp_group(spg_id); if (IS_ERR(spg)) { ret = PTR_ERR(spg);
From: Wang Wensheng wangwensheng4@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------
When we add a task that is exiting to a group, the task's mm may be reset to NULL unexpectedly even though we hold a reference on the task. If we then access the mm via the task_struct, a NULL pointer dereference can occur.
To fix the problem, get and store the mm_struct at the beginning and use it from then on.
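A minimal sketch of the safe pattern this switches to, pinning the mm up front with get_task_mm() instead of dereferencing tsk->mm later:

    struct mm_struct *mm;

    mm = get_task_mm(tsk->group_leader);    /* takes an mm_users reference; NULL if already gone */
    if (!mm)
            return -ESRCH;

    /* mm stays valid here even if the task finishes exiting concurrently */

    mmput(mm);                              /* drop the reference when done */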
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index eb15ad9a24e31..169ac88e37d2f 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -101,9 +101,10 @@ static struct sp_proc_stat kthread_stat = {0}; * The caller must hold sp_mutex and ensure no concurrency problem * for task_struct and mm_struct. */ -static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) { +static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, + struct mm_struct *mm) +{ struct sp_proc_stat *stat; - struct mm_struct *mm = tsk->mm; int id = mm->sp_stat_id; int tgid = tsk->tgid; int ret; @@ -558,16 +559,21 @@ int sp_group_add_task(int pid, int spg_id) goto out_unlock; }
- if (!tsk->mm || tsk->mm->sp_group) { /* if it's already in a sp_group */ + /* current thread may be exiting in a multithread process */ + mm = get_task_mm(tsk->group_leader); + if (!mm) { + ret = -ESRCH; + goto out_put_task; + } else if (mm->sp_group) { ret = -EEXIST; - goto out_unlock; + goto out_put_mm; }
spg = find_or_alloc_sp_group(spg_id); if (IS_ERR(spg)) { ret = PTR_ERR(spg); free_sp_group_id((unsigned int)spg_id); - goto out_put_task; + goto out_put_mm; }
/* access control permission check */ @@ -578,25 +584,18 @@ int sp_group_add_task(int pid, int spg_id) } }
- /* current thread may be exiting in a multithread process */ - mm = get_task_mm(tsk->group_leader); - if (!mm) { - ret = -ESRCH; - goto out_drop_group; - } - /* per process statistics initialization */ - stat = sp_init_proc_stat(tsk); + stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { ret = PTR_ERR(stat); pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out_put_mm; + goto out_drop_group; }
mm->sp_group = spg; /* We reactive the spg even the spg exists already. */ spg->is_alive = true; - list_add_tail(&tsk->mm->sp_node, &spg->procs); + list_add_tail(&mm->sp_node, &spg->procs); /* * create mappings of existing shared memory segments into this * new process' page table. @@ -664,11 +663,11 @@ int sp_group_add_task(int pid, int spg_id) mm->sp_group = NULL; }
-out_put_mm: - mmput(mm); out_drop_group: if (unlikely(ret)) __sp_group_drop_locked(spg); +out_put_mm: + mmput(mm); out_put_task: put_task_struct(tsk); out_unlock: @@ -1750,7 +1749,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, * added to a sp group, then stat will be returned immediately. * I believe there is no need to free stat in error handling branches. */ - stat = sp_init_proc_stat(tsk); + stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat));
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
do_mm_populate() should return an error number.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index e4a20206c3f39..7ee7214f17bee 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2448,9 +2448,10 @@ extern int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned lon int ignore_errors); #else static inline void mm_populate(unsigned long addr, unsigned long len) {} -int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, - int ignore_errors) +static inline int do_mm_populate(struct mm_struct *mm, unsigned long addr, + unsigned long len, int ignore_errors) { + return -EPERM; } #endif
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
When CONFIG_MEMCG is disabled and CONFIG_MM_OWNER is enabled, we encounter a compilation error as follows:
kernel/exit.c:399:6: error: redefinition of ‘mm_update_next_owner’
 void mm_update_next_owner(struct mm_struct *mm)
      ^~~~~~~~~~~~~~~~~~~~
In file included from kernel/exit.c:10:0:
./include/linux/sched/mm.h:128:20: note: previous definition of ‘mm_update_next_owner’ was here
 static inline void mm_update_next_owner(struct mm_struct *mm)
To fix it, make mm_update_next_owner() depend on CONFIG_MM_OWNER.
Fixes: 7560b5eec37c ("ascend: mm: add an owner for mm_struct") Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/sched/mm.h | 4 ++-- mm/Kconfig | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index ee7eada5b0164..bd762e7a2ec14 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -124,13 +124,13 @@ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *);
-#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MM_OWNER */
#ifdef CONFIG_MMU extern void arch_pick_mmap_layout(struct mm_struct *mm, diff --git a/mm/Kconfig b/mm/Kconfig index 253fb184c8f8d..12601505c4a4a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -302,7 +302,7 @@ config VIRT_TO_BUS config MM_OWNER bool "Enable the ownership the mm owner" help - This option enables mm_struct's to have an owner. + This option allows to record the canonical user of an mm struct
config MMU_NOTIFIER bool
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
__sp_area_drop_locked() already checks for a NULL spa, so remove the redundant NULL pointer checks before calling it.
Reported-by: Cui Bixuan cuibixuan@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 169ac88e37d2f..dfce9001b9e42 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -443,8 +443,7 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) if (&spa->link == stop) break;
- if (prev) - __sp_area_drop_locked(prev); + __sp_area_drop_locked(prev); prev = spa;
atomic_inc(&spa->use_count); @@ -459,8 +458,7 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop)
spin_lock(&sp_area_lock); } - if (prev) - __sp_area_drop_locked(prev); + __sp_area_drop_locked(prev);
spin_unlock(&sp_area_lock); } @@ -607,8 +605,7 @@ int sp_group_add_task(int pid, int spg_id) struct file *file = spa_file(spa); unsigned long addr;
- if (prev) - __sp_area_drop_locked(prev); + __sp_area_drop_locked(prev); prev = spa;
atomic_inc(&spa->use_count); @@ -651,8 +648,7 @@ int sp_group_add_task(int pid, int spg_id)
spin_lock(&sp_area_lock); } - if (prev) - __sp_area_drop_locked(prev); + __sp_area_drop_locked(prev); spin_unlock(&sp_area_lock);
if (unlikely(ret)) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Due to the implementation of __vmalloc_node_range(), an alignment of 1 does not cause any bugs here.
Still, changing the alignment to PMD_SIZE is more readable.
Reported-by: Xu Qiang xuqiang36@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index dfce9001b9e42..d7a256de14ce6 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2797,7 +2797,7 @@ void *vmalloc_hugepage(unsigned long size) /* PMD hugepage aligned */ size = PMD_ALIGN(size);
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, GFP_KERNEL, PAGE_KERNEL, VM_HUGE_PAGES, NUMA_NO_NODE, __builtin_return_address(0)); @@ -2820,7 +2820,7 @@ void *vmalloc_hugepage_user(unsigned long size) /* PMD hugepage aligned */ size = PMD_ALIGN(size);
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); @@ -2866,7 +2866,7 @@ void *buff_vzalloc_hugepage_user(unsigned long size) /* PMD hugepage aligned */ size = PMD_ALIGN(size);
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0));
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
None of the share pool functions should be used in interrupt context. Add a checker function and call it at the beginning of each of them.
Reported-by: Xu Qiang xuqiang36@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index d7a256de14ce6..12468f785686d 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -44,6 +44,7 @@ #include <linux/rmap.h> #include <linux/hugetlb.h> #include <linux/compaction.h> +#include <linux/preempt.h>
/* access control mode macros */ #define AC_NONE 0 @@ -275,6 +276,12 @@ static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp) return 0; }
+static inline void check_interrupt_context(void) +{ + if (unlikely(in_interrupt())) + panic("share_pool: can't be used in interrupt context\n"); +} + static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate); static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); @@ -341,6 +348,8 @@ int sp_group_id_by_pid(int pid) struct sp_group *spg; int spg_id = -ENODEV;
+ check_interrupt_context(); + mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg_valid(spg)) @@ -494,6 +503,8 @@ int sp_group_add_task(int pid, int spg_id) struct sp_area *spa, *prev = NULL; struct sp_proc_stat *stat;
+ check_interrupt_context(); + /* mdc scene hack */ if (enable_mdc_default_group) spg_id = mdc_default_group_id; @@ -1165,6 +1176,8 @@ int sp_free(unsigned long addr) loff_t offset; int ret = 0;
+ check_interrupt_context(); + mutex_lock(&sp_mutex);
/* @@ -1278,6 +1291,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) unsigned long mode, offset; unsigned int noreclaim_flag;
+ check_interrupt_context(); + /* mdc scene hack */ if (enable_mdc_default_group) spg_id = mdc_default_group_id; @@ -1701,6 +1716,8 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, struct sp_proc_stat *stat; int ret = 0, is_hugepage;
+ check_interrupt_context(); + if (sp_flags & ~SP_DVPP) { if (printk_ratelimit()) pr_err("share pool: k2u sp_flags %lu error\n", sp_flags); @@ -2017,6 +2034,8 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) }; struct vm_struct *area;
+ check_interrupt_context(); + rcu_read_lock(); tsk = find_task_by_vpid(pid); if (!tsk || (tsk->flags & PF_EXITING)) @@ -2309,6 +2328,8 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) { int ret = 0;
+ check_interrupt_context(); + if (va < TASK_SIZE) { /* user address */ ret = sp_unshare_uva(va, size, pid, spg_id); @@ -2336,6 +2357,8 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, struct mm_struct *mm; int ret = 0;
+ check_interrupt_context(); + if (unlikely(!sp_walk_data)) { if (printk_ratelimit()) pr_err("share pool: null pointer when walk page range\n"); @@ -2368,6 +2391,8 @@ void sp_walk_page_free(struct sp_walk_data *sp_walk_data) struct page *page; unsigned int i = 0;
+ check_interrupt_context(); + if (!sp_walk_data) return;
@@ -2405,6 +2430,8 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) { struct sp_group *spg;
+ check_interrupt_context(); + if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || size> MMAP_SHARE_POOL_16G_SIZE) return false;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
sp_group_add_task() may be called with a valid spg_id passed in by the caller. Such an id was not generated by us and should not be freed in the error handling branches.
Reported-by: Wang Wensheng wangwensheng4@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 12468f785686d..6cbbf2138ad6e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -500,6 +500,7 @@ int sp_group_add_task(int pid, int spg_id) struct mm_struct *mm; struct sp_group *spg; int ret = 0; + bool id_newly_generated = false; struct sp_area *spa, *prev = NULL; struct sp_proc_stat *stat;
@@ -538,6 +539,7 @@ int sp_group_add_task(int pid, int spg_id) "generate group id failed\n"); return spg_id; } + id_newly_generated = true; }
if (spg_id == SPG_ID_DVPP_PASS_THROUGH) { @@ -550,6 +552,7 @@ int sp_group_add_task(int pid, int spg_id) "generate group id failed in DVPP pass through\n"); return spg_id; } + id_newly_generated = true; }
mutex_lock(&sp_mutex); @@ -564,7 +567,8 @@ int sp_group_add_task(int pid, int spg_id)
rcu_read_unlock(); if (ret) { - free_sp_group_id((unsigned int)spg_id); + if (id_newly_generated) + free_sp_group_id((unsigned int)spg_id); goto out_unlock; }
@@ -581,7 +585,8 @@ int sp_group_add_task(int pid, int spg_id) spg = find_or_alloc_sp_group(spg_id); if (IS_ERR(spg)) { ret = PTR_ERR(spg); - free_sp_group_id((unsigned int)spg_id); + if (id_newly_generated) + free_sp_group_id((unsigned int)spg_id); goto out_put_mm; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
If vmap() or vmap_hugepage() fails in sp_make_share_u2k(), we need to decrease the reference count of the pages in struct sp_walk_data, otherwise a memory leak happens.
There are also some additional cleanups to do.
Reported-by: Zhou Guanghui zhouguanghui1@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6cbbf2138ad6e..5149864c94c09 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2019,6 +2019,22 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, return ret; }
+static void __sp_walk_page_free(struct sp_walk_data *data) +{ + int i = 0; + struct page *page; + + while (i < data->page_count) { + page = data->pages[i++]; + put_page(page); + } + + kvfree(data->pages); + /* prevent repeated release */ + data->page_count = 0; + data->pages = NULL; +} + /** * Share user memory of a specified process to kernel * @uva: the VA of shared user memory @@ -2078,8 +2094,9 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) if (!p) { if (printk_ratelimit()) pr_err("share pool: vmap(huge) in u2k failed\n"); + __sp_walk_page_free(&sp_walk_data); p = ERR_PTR(-ENOMEM); - goto out_free_pages; + goto out_put_task; } else { p = p + (uva - sp_walk_data.uva_aligned); } @@ -2092,7 +2109,6 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) area = find_vm_area(p); area->flags |= VM_USERMAP;
-out_free_pages: kvfree(sp_walk_data.pages); out_put_task: put_task_struct(tsk); @@ -2393,20 +2409,12 @@ EXPORT_SYMBOL_GPL(sp_walk_page_range);
void sp_walk_page_free(struct sp_walk_data *sp_walk_data) { - struct page *page; - unsigned int i = 0; - check_interrupt_context();
if (!sp_walk_data) return;
- while (i < sp_walk_data->page_count) { - page = sp_walk_data->pages[i++]; - put_page(page); - } - - kvfree(sp_walk_data->pages); + __sp_walk_page_free(sp_walk_data); } EXPORT_SYMBOL_GPL(sp_walk_page_free);
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The sp_mutex is used to protect all critical paths of the share pool. It has seriously affected the performance of the memory allocation and release interfaces when there are many processes in the same memory group, and it severely limits the scalability of the system. So add a new read-write semaphore to replace the big lock on the allocation and release critical paths.
The scalability is greatly improved by this modification.
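A simplified sketch of the locking scheme introduced below (the names are the ones added in the diff):

    /* struct sp_group gains a per-group lock */
    struct rw_semaphore rw_lock;            /* protects the group's internal elements */

    /* alloc/free fast paths: readers may run in parallel */
    down_read(&spg->rw_lock);
    /* walk spg->procs and mmap/unmap into each member mm */
    up_read(&spg->rw_lock);

    /* membership changes (add task, group exit): exclusive */
    down_write(&spg->rw_lock);
    list_add_tail(&mm->sp_node, &spg->procs);
    up_write(&spg->rw_lock);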
Show the test result (avg time to alloc 4M):

  number of processes    before the patch    after the patch
  1                      32us                32us
  3                      96us                40us
  10                     330us               60us
v2: fix some conflicts and clean some code.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 5 ++ kernel/fork.c | 4 +- mm/share_pool.c | 170 ++++++++++++++++++++----------------- 3 files changed, 100 insertions(+), 79 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 70b841d0eb8e5..f2d17cb85fa52 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -93,6 +93,8 @@ struct sp_group { unsigned long dvpp_va_start; unsigned long dvpp_size; atomic_t use_count; + /* protect the group internal elements */ + struct rw_semaphore rw_lock; };
struct sp_walk_data { @@ -238,6 +240,8 @@ extern void *vmalloc_hugepage_user(unsigned long size); extern void *buff_vzalloc_user(unsigned long size); extern void *buff_vzalloc_hugepage_user(unsigned long size);
+void sp_exit_mm(struct mm_struct *mm); + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -400,6 +404,7 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size) { return NULL; } + #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index c410887b502b2..22ed43ed527de 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1059,8 +1059,6 @@ static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users));
- sp_group_exit(mm); - uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); @@ -1088,6 +1086,8 @@ void mmput(struct mm_struct *mm) { might_sleep();
+ sp_group_exit(mm); + if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } diff --git a/mm/share_pool.c b/mm/share_pool.c index 5149864c94c09..d9f70526bae17 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -197,6 +197,16 @@ static bool host_svm_sp_enable = false;
int sysctl_share_pool_hugepage_enable = 1;
+static void free_sp_group(struct sp_group *spg); + +static bool sp_group_get(struct sp_group *spg) +{ + if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) + return true; + + return false; +} + static unsigned long spa_size(struct sp_area *spa) { return spa->real_size; @@ -337,7 +347,9 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id)
put_task_struct(tsk); } else { + mutex_lock(&sp_mutex); spg = idr_find(&sp_group_idr, spg_id); + mutex_unlock(&sp_mutex); }
return spg; @@ -392,6 +404,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) INIT_LIST_HEAD(&spg->procs); INIT_LIST_HEAD(&spg->spa_list);
+ init_rwsem(&spg->rw_lock); + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, GFP_KERNEL); if (ret < 0) { @@ -422,9 +436,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) goto out_fput; } } else { - if (!spg_valid(spg)) + if (!sp_group_get(spg)) return ERR_PTR(-ENODEV); - atomic_inc(&spg->use_count); }
return spg; @@ -607,6 +620,8 @@ int sp_group_add_task(int pid, int spg_id) }
mm->sp_group = spg; + + down_write(&spg->rw_lock); /* We reactive the spg even the spg exists already. */ spg->is_alive = true; list_add_tail(&mm->sp_node, &spg->procs); @@ -675,11 +690,14 @@ int sp_group_add_task(int pid, int spg_id) mm->sp_group = NULL; }
+ up_write(&spg->rw_lock); out_drop_group: if (unlikely(ret)) __sp_group_drop_locked(spg); out_put_mm: - mmput(mm); + /* No need to put the mm if the sp group add this mm success.*/ + if (unlikely(ret)) + mmput(mm); out_put_task: put_task_struct(tsk); out_unlock: @@ -712,44 +730,12 @@ static void spg_exit_unlock(bool unlock) mutex_unlock(&sp_mutex); }
-/* - * Do cleanup when a process exits. - */ -void sp_group_exit(struct mm_struct *mm) -{ - bool is_alive = true; - bool unlock; - - /* - * Nothing to do if this thread group doesn't belong to any sp_group. - * No need to protect this check with lock because we can add a task - * to a group if !PF_EXITING. - */ - if (!mm->sp_group) - return; - - spg_exit_lock(&unlock); - if (list_is_singular(&mm->sp_group->procs)) - is_alive = mm->sp_group->is_alive = false; - list_del(&mm->sp_node); - spg_exit_unlock(unlock); - - /* - * To avoid calling this with sp_mutex held, we first mark the - * sp_group as dead and then send the notification and then do - * the real cleanup in sp_group_post_exit(). - */ - if (!is_alive) - blocking_notifier_call_chain(&sp_notifier_chain, 0, - mm->sp_group); -} - void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; bool unlock;
- if (!mm->sp_group) + if (!enable_ascend_share_pool || !mm->sp_group) return;
spg_exit_lock(&unlock); @@ -1139,8 +1125,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, { int err;
- if (!mmget_not_zero(mm)) - return; down_write(&mm->mmap_sem);
err = do_munmap(mm, addr, size, NULL); @@ -1150,7 +1134,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, }
up_write(&mm->mmap_sem); - mmput(mm); }
/* The caller must hold sp_mutex. */ @@ -1183,8 +1166,6 @@ int sp_free(unsigned long addr)
check_interrupt_context();
- mutex_lock(&sp_mutex); - /* * Access control: a share pool addr can only be freed by another task * in the same spg or a kthread (such as buff_module_guard_work) @@ -1217,6 +1198,8 @@ int sp_free(unsigned long addr)
sp_dump_stack();
+ down_read(&spa->spg->rw_lock); + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL);
/* Free the memory of the backing shmem or hugetlbfs */ @@ -1226,6 +1209,9 @@ int sp_free(unsigned long addr) if (ret) pr_err("share pool: sp free fallocate failed: %d\n", ret);
+ up_read(&spa->spg->rw_lock); + + mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { kthread_stat.alloc_size -= spa->real_size; @@ -1236,12 +1222,11 @@ int sp_free(unsigned long addr) else BUG(); } + mutex_unlock(&sp_mutex);
drop_spa: __sp_area_drop(spa); out: - mutex_unlock(&sp_mutex); - sp_try_to_compact(); return ret; } @@ -1317,9 +1302,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (sp_flags & SP_HUGEPAGE_ONLY) sp_flags |= SP_HUGEPAGE;
- mutex_lock(&sp_mutex); spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); - mutex_unlock(&sp_mutex); if (!spg) { /* DVPP pass through scene: first call sp_alloc() */ /* mdc scene hack */ if (enable_mdc_default_group) @@ -1336,14 +1319,16 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) ret); return ERR_PTR(ret); } - mutex_lock(&sp_mutex); spg = current->mm->sp_group; } else { /* other scenes */ - mutex_lock(&sp_mutex); if (spg_id != SPG_ID_DEFAULT) { + mutex_lock(&sp_mutex); /* the caller should be a member of the sp group */ - if (spg != idr_find(&sp_group_idr, spg_id)) + if (spg != idr_find(&sp_group_idr, spg_id)) { + mutex_unlock(&sp_mutex); goto out; + } + mutex_unlock(&sp_mutex); } }
@@ -1352,6 +1337,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) goto out; }
+ down_read(&spg->rw_lock); if (sp_flags & SP_HUGEPAGE) { file = spg->file_hugetlb; size_aligned = ALIGN(size, PMD_SIZE); @@ -1376,31 +1362,25 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) unsigned long populate = 0; struct vm_area_struct *vma;
- if (!mmget_not_zero(mm)) - continue; - down_write(&mm->mmap_sem); mmap_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(mmap_addr)) { up_write(&mm->mmap_sem); p = (void *)mmap_addr; __sp_free(spg, sp_addr, size_aligned, mm); - mmput(mm); pr_err("share pool: allocation sp mmap failed, ret %ld\n", mmap_addr); goto out; }
- p =(void *)mmap_addr; /* success */ + p = (void *)mmap_addr; /* success */ if (populate == 0) { up_write(&mm->mmap_sem); - mmput(mm); continue; }
vma = find_vma(mm, sp_addr); if (unlikely(!vma)) { up_write(&mm->mmap_sem); - mmput(mm); pr_err("share pool: allocation failed due to find %pK vma failure\n", (void *)sp_addr); p = ERR_PTR(-EINVAL); @@ -1461,24 +1441,22 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) size_aligned = ALIGN(size, PAGE_SIZE); sp_flags &= ~SP_HUGEPAGE; __sp_area_drop(spa); - mmput(mm); goto try_again; } } - - mmput(mm); break; } - mmput(mm); }
+out: + up_read(&spg->rw_lock); + + mutex_lock(&sp_mutex); if (!IS_ERR(p)) { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) stat->alloc_size += size_aligned; } - -out: mutex_unlock(&sp_mutex);
/* this will free spa if mmap failed */ @@ -1556,10 +1534,6 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, } }
- if (!mmget_not_zero(mm)) { - ret_addr = -ESPGMMEXIT; - goto put_file; - } down_write(&mm->mmap_sem);
ret_addr = sp_mmap(mm, file, spa, &populate); @@ -1604,8 +1578,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
put_mm: up_write(&mm->mmap_sem); - mmput(mm); -put_file: + if (!spa->spg && file) fput(file);
@@ -1769,10 +1742,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { + mutex_unlock(&sp_mutex); uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); goto out_unlock; } + mutex_unlock(&sp_mutex);
spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { @@ -1794,6 +1769,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); goto out_drop_spa; } @@ -1808,12 +1784,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, goto out_unlock; }
+ down_read(&spg->rw_lock); if (enable_share_k2u_spg) spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); else spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK);
if (IS_ERR(spa)) { + up_read(&spg->rw_lock); if (printk_ratelimit()) pr_err("share pool: k2u(spg) failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", @@ -1831,14 +1809,18 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); else uva = sp_make_share_kva_to_task(kva_aligned, spa, mm); + + up_read(&spg->rw_lock); } else { /* group is dead, return -ENODEV */ pr_err("share pool: failed to make k2u, sp group is dead\n"); }
if (!IS_ERR(uva)) { + mutex_lock(&sp_mutex); uva = uva + (kva - kva_aligned); stat->k2u_size += size_aligned; + mutex_unlock(&sp_mutex); } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) @@ -1849,7 +1831,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, out_drop_spa: __sp_area_drop(spa); out_unlock: - mutex_unlock(&sp_mutex); mmput(mm); out_put_task: put_task_struct(tsk); @@ -2144,7 +2125,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp unsigned int page_size; struct sp_proc_stat *stat;
- mutex_lock(&sp_mutex); /* * at first we guess it's a hugepage addr * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u @@ -2157,7 +2137,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_err("share pool: invalid input uva %pK in unshare uva\n", (void *)uva); - goto out_unlock; + goto out; } }
@@ -2259,10 +2239,14 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
+ down_read(&spa->spg->rw_lock); __sp_free(spa->spg, uva_aligned, size_aligned, NULL); + up_read(&spa->spg->rw_lock); }
sp_dump_stack(); + + mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { kthread_stat.k2u_size -= spa->real_size; @@ -2273,6 +2257,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp else WARN(1, "share_pool: %s: null process stat\n", __func__); } + mutex_unlock(&sp_mutex);
out_clr_flag: /* deassociate vma and spa */ @@ -2281,8 +2266,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
out_drop_area: __sp_area_drop(spa); -out_unlock: - mutex_unlock(&sp_mutex); +out: return ret; }
@@ -2446,7 +2430,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) check_interrupt_context();
if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || - size> MMAP_SHARE_POOL_16G_SIZE) + size > MMAP_SHARE_POOL_16G_SIZE) return false;
mutex_lock(&sp_mutex); @@ -2468,9 +2452,10 @@ EXPORT_SYMBOL_GPL(sp_config_dvpp_range); /* Check whether the address belongs to the share pool. */ bool is_sharepool_addr(unsigned long addr) { - if (host_svm_sp_enable == false) - return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); - return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; + if (host_svm_sp_enable == false) + return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; } EXPORT_SYMBOL_GPL(is_sharepool_addr);
@@ -2515,7 +2500,8 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, return 0; }
-static void rb_spa_stat_show(struct seq_file *seq) { +static void rb_spa_stat_show(struct seq_file *seq) +{ struct rb_node *node; struct sp_area *spa;
@@ -2814,6 +2800,36 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, } EXPORT_SYMBOL(sharepool_no_page);
+#define MM_WOULD_FREE 2 + +void sp_group_exit(struct mm_struct *mm) +{ + struct sp_group *spg = NULL; + bool is_alive = true, unlock; + + if (!enable_ascend_share_pool) + return; + + spg = mm->sp_group; + + /* If the mm_users is 2, it means that the mm is ready to be freed + because the last owner of this mm is in exiting process. + */ + if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + spg_exit_lock(&unlock); + down_write(&spg->rw_lock); + if (list_is_singular(&spg->procs)) + is_alive = spg->is_alive = false; + list_del(&mm->sp_node); + up_write(&spg->rw_lock); + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, + mm->sp_group); + atomic_dec(&mm->mm_users); + spg_exit_unlock(unlock); + } +} + struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
When OOM happens, it's better to print the share pool info of each process so we can check whether it has allocated too much share pool memory.
We also print share pool system-level stats.
Suggested-by: Cui Bixuan cuibixuan@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 41 ++++++++++++++ mm/oom_kill.c | 42 +++++++++++--- mm/share_pool.c | 109 +++++++++++++++++++++++++------------ 3 files changed, 149 insertions(+), 43 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index f2d17cb85fa52..fb7237351a995 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -105,6 +105,18 @@ struct sp_walk_data { bool is_hugepage; };
+/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + struct mm_struct *mm; + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + long alloc_size; + long k2u_size; +}; + #ifdef CONFIG_ASCEND_SHARE_POOL
#define MAP_SHARE_POOL 0x100000 @@ -155,6 +167,9 @@ extern int sp_register_notifier(struct notifier_block *nb); extern int sp_unregister_notifier(struct notifier_block *nb); extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); extern bool is_sharepool_addr(unsigned long addr); +extern struct sp_proc_stat *sp_get_proc_stat(int tgid); +extern void spa_overview_show(struct seq_file *seq); +extern void spg_overview_show(struct seq_file *seq); extern void proc_sharepool_init(void);
static inline struct task_struct *sp_get_task(struct mm_struct *mm) @@ -230,6 +245,11 @@ static inline void sp_dump_stack(void) dump_stack(); }
+static inline bool ascend_sp_oom_show(void) +{ + return enable_ascend_share_pool ? true : false; +} + vm_fault_t sharepool_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -310,6 +330,7 @@ static inline int sp_walk_page_range(unsigned long uva, unsigned long size, static inline void sp_walk_page_free(struct sp_walk_data *sp_walk_data) { } + static inline int sp_register_notifier(struct notifier_block *nb) { return -EPERM; @@ -319,6 +340,7 @@ static inline int sp_unregister_notifier(struct notifier_block *nb) { return -EPERM; } + static inline bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) { return false; @@ -329,6 +351,19 @@ static inline bool is_sharepool_addr(unsigned long addr) return false; }
+static inline struct sp_proc_stat *sp_get_proc_stat(int tgid) +{ + return NULL; +} + +static inline void spa_overview_show(struct seq_file *seq) +{ +} + +static inline void spg_overview_show(struct seq_file *seq) +{ +} + static inline void proc_sharepool_init(void) { } @@ -337,6 +372,7 @@ static inline struct task_struct *sp_get_task(struct mm_struct *mm) { return current; } + static inline bool sp_check_hugepage(struct page *p) { return false; @@ -385,6 +421,11 @@ static inline void sp_dump_stack(void) { }
+static inline bool ascend_sp_oom_show(void) +{ + return false; +} + static inline void *vmalloc_hugepage(unsigned long size) { return NULL; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index d422223d2d6bf..0c77331492384 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -41,6 +41,7 @@ #include <linux/kthread.h> #include <linux/init.h> #include <linux/mmu_notifier.h> +#include <linux/share_pool.h>
#include <asm/tlb.h> #include "internal.h" @@ -454,9 +455,16 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; + struct sp_proc_stat *stat; + + if (ascend_sp_oom_show()) { + pr_info("Tasks state (memory values in pages, share pool memory values in KB):\n"); + pr_info("[ pid ] uid tgid total_vm rss sp_alloc sp_k2u pgtables_bytes swapents oom_score_adj name\n"); + } else { + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + }
- pr_info("Tasks state (memory values in pages):\n"); - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -472,12 +480,28 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; }
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", - task->pid, from_kuid(&init_user_ns, task_uid(task)), - task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_pgtables_bytes(task->mm), - get_mm_counter(task->mm, MM_SWAPENTS), - task->signal->oom_score_adj, task->comm); + if (ascend_sp_oom_show()) { + stat = sp_get_proc_stat(task->tgid); + + pr_cont("[%7d] %5d %5d %8lu %8lu ", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm)); + if (!stat) + pr_cont("%-9c %-9c ", '-', '-'); + else + pr_cont("%-9ld %-9ld ", (stat->alloc_size) >> 10, (stat->k2u_size) >> 10); /* byte to KB */ + pr_cont("%8ld %8lu %5hd %s\n", + mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_SWAPENTS), + task->signal->oom_score_adj, task->comm); + } else { + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + task->pid, from_kuid(&init_user_ns, task_uid(task)), + task->tgid, task->mm->total_vm, get_mm_rss(task->mm), + mm_pgtables_bytes(task->mm), + get_mm_counter(task->mm, MM_SWAPENTS), + task->signal->oom_score_adj, task->comm); + } task_unlock(task); } rcu_read_unlock(); @@ -1141,6 +1165,8 @@ int hisi_oom_notifier_call(unsigned long val, void *v) pr_err("OOM_NOTIFIER: oom type %lu\n", val); dump_stack(); show_mem(SHOW_MEM_FILTER_NODES, NULL); + spg_overview_show(NULL); + spa_overview_show(NULL); dump_tasks(NULL, 0); last_jiffies = jiffies; } diff --git a/mm/share_pool.c b/mm/share_pool.c index d9f70526bae17..94e2be2ce96bc 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -83,18 +83,6 @@ static DEFINE_IDA(sp_group_id_ida); /* idr of all sp_proc_stats */ static DEFINE_IDR(sp_stat_idr);
-/* per process memory usage statistics indexed by tgid */ -struct sp_proc_stat { - struct mm_struct *mm; - char comm[TASK_COMM_LEN]; - /* - * alloc amount minus free amount, may be negative when freed by - * another task in the same sp group. - */ - long alloc_size; - long k2u_size; -}; - /* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0};
@@ -2475,6 +2463,18 @@ __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
/*** Statistical and maintenance functions ***/
+struct sp_proc_stat *sp_get_proc_stat(int tgid) +{ + struct sp_proc_stat *stat; + + mutex_lock(&sp_mutex); + stat = idr_find(&sp_stat_idr, tgid); + mutex_unlock(&sp_mutex); + + /* maybe NULL or not, we always return it */ + return stat; +} + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2484,12 +2484,12 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, mutex_lock(&sp_mutex); spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); if (spg_valid(spg)) { - /* print the file header */ stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); if (!stat) { mutex_unlock(&sp_mutex); return 0; } + /* print the file header */ seq_printf(m, "%-8s %-9s %-13s\n", "Group_ID", "SP_ALLOC", "HugePage Fail"); seq_printf(m, "%-8d %-9ld %-13d\n", @@ -2553,12 +2553,15 @@ static void rb_spa_stat_show(struct seq_file *seq) spin_unlock(&sp_area_lock); }
-static void spa_overview_show(struct seq_file *seq) +void spa_overview_show(struct seq_file *seq) { unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; unsigned long dvpp_size, dvpp_va_size;
+ if (!enable_ascend_share_pool) + return; + spin_lock(&sp_area_lock); total_num = spa_stat.total_num; alloc_num = spa_stat.alloc_num; @@ -2572,16 +2575,29 @@ static void spa_overview_show(struct seq_file *seq) dvpp_va_size = spa_stat.dvpp_va_size; spin_unlock(&sp_area_lock);
- seq_printf(seq, "Spa total num %u.\n", total_num); - seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", - alloc_num, k2u_task_num, k2u_spg_num); - seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); - seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); - seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); - seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); - seq_printf(seq, "Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); - seq_printf(seq, "Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); - seq_printf(seq, "\n"); + if (seq != NULL) { + seq_printf(seq, "Spa total num %u.\n", total_num); + seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); + seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); + seq_printf(seq, "Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); + seq_puts(seq, "\n"); + } else { + pr_info("Spa total num %u.\n", total_num); + pr_info("Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + pr_info("Spa total size: %13lu KB\n", byte2kb(total_size)); + pr_info("Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + pr_info("Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + pr_info("Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + pr_info("Spa dvpp size: %13lu KB\n", byte2kb(dvpp_size)); + pr_info("Spa dvpp va size: %13lu MB\n", byte2mb(dvpp_va_size)); + pr_info("\n"); + } }
/* the caller must hold sp_mutex */ @@ -2590,25 +2606,48 @@ static int idr_spg_stat_cb(int id, void *p, void *data) struct sp_group *spg = p; struct seq_file *seq = data;
- seq_printf(seq, "Group %6d size: %ld KB, spa num: %d, total alloc: %ld KB, " - "normal alloc: %ld KB, huge alloc: %ld KB\n", - id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num), - byte2kb(atomic64_read(&spg->alloc_size)), - byte2kb(atomic64_read(&spg->alloc_nsize)), - byte2kb(atomic64_read(&spg->alloc_hsize))); + if (seq != NULL) { + seq_printf(seq, "Group %6d size: %ld KB, spa num: %d, total alloc: %ld KB, " + "normal alloc: %ld KB, huge alloc: %ld KB\n", + id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num), + byte2kb(atomic64_read(&spg->alloc_size)), + byte2kb(atomic64_read(&spg->alloc_nsize)), + byte2kb(atomic64_read(&spg->alloc_hsize))); + } else { + pr_info("Group %6d size: %ld KB, spa num: %d, total alloc: %ld KB, " + "normal alloc: %ld KB, huge alloc: %ld KB\n", + id, byte2kb(atomic64_read(&spg->size)), atomic_read(&spg->spa_num), + byte2kb(atomic64_read(&spg->alloc_size)), + byte2kb(atomic64_read(&spg->alloc_nsize)), + byte2kb(atomic64_read(&spg->alloc_hsize))); + }
return 0; }
-static void spg_overview_show(struct seq_file *seq) +void spg_overview_show(struct seq_file *seq) { - seq_printf(seq, "Share pool total size: %ld KB, spa total num: %d.\n", - byte2kb(atomic64_read(&spg_stat.spa_total_size)), - atomic_read(&spg_stat.spa_total_num)); + if (!enable_ascend_share_pool) + return; + + if (seq != NULL) { + seq_printf(seq, "Share pool total size: %ld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); + } else { + pr_info("Share pool total size: %ld KB, spa total num: %d.\n", + byte2kb(atomic64_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); + } + mutex_lock(&sp_mutex); idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); mutex_unlock(&sp_mutex); - seq_printf(seq, "\n"); + + if (seq != NULL) + seq_puts(seq, "\n"); + else + pr_info("\n"); }
static int spa_stat_show(struct seq_file *seq, void *offset)
From: Fang Lijun fanglijun3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
------------
pud_page() is not defined in asm/pgtable-2level.h, which is used when CONFIG_ARM_LPAE is disabled. This causes a compile error like this:
mm/vmalloc.c: In function 'vmalloc_to_page': include/asm-generic/pgtable-nop4d-hack.h:48:27: error: implicit declaration of function 'pud_page'; did you mean 'put_page'? [-Werror=implicit-function-declaration] ^~~~~~~~ Fixes: e02c5b9bddda ("mm/vmalloc: fix vmalloc_to_page for huge vmap mappings") Signed-off-by: Fang Lijun fanglijun3@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm/include/asm/pgtable-2level.h | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h index 12659ce5c1f38..3d3103ff6e14d 100644 --- a/arch/arm/include/asm/pgtable-2level.h +++ b/arch/arm/include/asm/pgtable-2level.h @@ -191,6 +191,7 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr) return (pmd_t *)pud; }
+#define pud_page(pud) NULL #define pmd_large(pmd) (pmd_val(pmd) & 2) #define pmd_bad(pmd) (pmd_val(pmd) & 2) #define pmd_present(pmd) (pmd_val(pmd))
From: Bixuan Cui cuibixuan@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
When the MAP_LOCKED flag is enabled, multiple tasks that map/munmap memory at the same time contend for resources, causing a performance loss.
Add sysctl_share_pool_map_lock_enable to control whether the mapping is locked (MAP_LOCKED) in sp_mmap().
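For illustration, a minimal sketch of how the sysctl is meant to gate MAP_LOCKED in sp_mmap(); the helper name sp_mmap_flags() is invented for the example, while the flag and sysctl names are taken from the hunks below.

/* Sketch only: build the mmap flags used by sp_mmap(). MAP_LOCKED is
 * added only when the sysctl is set, trading guaranteed residency for
 * less contention when many tasks map/munmap share pool memory at once.
 */
static unsigned long sp_mmap_flags(void)
{
	unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE |
			      MAP_SHARE_POOL;

	if (sysctl_share_pool_map_lock_enable)	/* 0: map_unlock, 1: map_lock */
		flags |= MAP_LOCKED;

	return flags;
}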
Signed-off-by: Bixuan Cui cuibixuan@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 2 ++ kernel/sysctl.c | 10 ++++++++++ mm/share_pool.c | 12 ++++++++++-- mm/vmalloc.c | 2 +- 4 files changed, 23 insertions(+), 3 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index fb7237351a995..c3120b7b24948 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -38,6 +38,8 @@ extern int sysctl_sp_debug_mode;
extern int enable_ascend_share_pool;
+extern int sysctl_share_pool_map_lock_enable; + #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC extern bool vmap_allow_huge; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b88e12d942166..97a24290f0750 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1280,6 +1280,16 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + /* 0: map_unlock, 1: map_lock */ + .procname = "share_pool_map_lock_enable", + .data = &sysctl_share_pool_map_lock_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 94e2be2ce96bc..36e33d1de4f8c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -69,6 +69,8 @@ int sysctl_ac_mode = AC_NONE; /* debug mode */ int sysctl_sp_debug_mode;
+int sysctl_share_pool_map_lock_enable; + /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr);
@@ -1227,11 +1229,17 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, unsigned long addr = spa->va_start; unsigned long size = spa_size(spa); unsigned long prot = PROT_READ | PROT_WRITE; - unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_LOCKED | - MAP_POPULATE | MAP_SHARE_POOL; + unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE | + MAP_SHARE_POOL; unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; unsigned long pgoff = (addr - MMAP_SHARE_POOL_START) >> PAGE_SHIFT;
+ /* Mark the mapped region to be locked. After the MAP_LOCKED is enable, + * multiple tasks will preempt resources, causing performance loss. + */ + if (sysctl_share_pool_map_lock_enable) + flags |= MAP_LOCKED; + atomic_inc(&spa->use_count); addr = __do_mmap(mm, file, addr, size, prot, flags, vm_flags, pgoff, populate, NULL); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4b0970d6fc913..da610bc88ae92 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2650,7 +2650,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) { - struct vm_struct *area; + struct vm_struct *area = NULL; void *addr; unsigned long real_size = size; unsigned long real_align = align;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Commit 59a57a82fb2a ("mm/vmalloc: Hugepage vmalloc mappings") enables hugepage vmalloc by default whenever the allocation size is larger than PMD_SIZE. This behaves like transparent hugepages for mmap: the driver cannot control hugepage use precisely and existing logic can be broken. The share pool already exports the vmalloc_hugepage_xxx functions to control hugepage vmalloc allocation explicitly, which behaves like static hugepages for vmalloc, so disable the transparent behaviour.
This patch also fixes the vm_struct kABI breakage, so users can apply it to the commercial version.
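To illustrate the intended usage after this change, a hedged sketch of a caller follows. vmalloc_hugepage() is the share pool helper declared in share_pool.h; the assumption (not shown in the hunks) is that it passes VM_HUGE_PAGES down so that __vmalloc_node_range() picks PMD-sized pages, while plain vmalloc() now always uses small pages. The function name sp_alloc_big_buffer() is made up for the example.

#include <linux/vmalloc.h>
#include <linux/share_pool.h>

/* Sketch: a driver opts in to huge mappings explicitly instead of
 * relying on vmalloc() to upgrade any allocation larger than PMD_SIZE.
 */
static void *sp_alloc_big_buffer(unsigned long size)
{
	void *buf;

	buf = vmalloc_hugepage(size);	/* PMD-sized pages via VM_HUGE_PAGES */
	if (!buf)
		buf = vmalloc(size);	/* small pages, no automatic retry */

	return buf;
}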
Fixes: 59a57a82fb2a ("mm/vmalloc: Hugepage vmalloc mappings") Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 51 ++++++++++++++++++++++++++++---------- include/linux/vmalloc.h | 1 - mm/vmalloc.c | 47 ++++++++++++----------------------- 3 files changed, 54 insertions(+), 45 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index c3120b7b24948..4a18c88d5a10e 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -211,15 +211,6 @@ static inline void sp_area_work_around(struct vm_unmapped_area_info *info)
extern struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node); - -static inline void sp_free_pages(struct page *page, struct vm_struct *area) -{ - if (PageHuge(page)) - put_page(page); - else - __free_pages(page, area->page_order); -} - static inline bool sp_check_vm_share_pool(unsigned long vm_flags) { if (enable_ascend_share_pool && (vm_flags & VM_SHARE_POOL)) @@ -264,6 +255,30 @@ extern void *buff_vzalloc_hugepage_user(unsigned long size);
void sp_exit_mm(struct mm_struct *mm);
+static inline bool is_vmalloc_huge(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_HUGE_PAGES)) + return true; + + return false; +} + +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_SHAREPOOL)) + return true; + + return false; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ + if (PageHuge(page)) + put_page(page); + else + __free_pages(page, is_vmalloc_huge(area->flags) ? PMD_SHIFT - PAGE_SHIFT : 0); +} + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -400,10 +415,6 @@ static inline struct page *sp_alloc_pages(void *area, gfp_t mask, return NULL; }
-static inline void sp_free_pages(struct page *page, struct vm_struct *area) -{ -} - static inline bool sp_check_vm_share_pool(unsigned long vm_flags) { return false; @@ -448,6 +459,20 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size) return NULL; }
+static inline bool is_vmalloc_huge(struct vm_struct *vm) +{ + return NULL; +} + +static inline bool is_vmalloc_sharepool(struct vm_struct *vm) +{ + return NULL; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ +} + #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index bb814f6418fd9..298eff5579b21 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -43,7 +43,6 @@ struct vm_struct { unsigned long size; unsigned long flags; struct page **pages; - unsigned int page_order; unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index da610bc88ae92..9bd49a700707e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2354,6 +2354,7 @@ struct vm_struct *remove_vm_area(const void *addr) static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; + unsigned int page_order = 0;
if (!addr) return; @@ -2369,13 +2370,14 @@ static void __vunmap(const void *addr, int deallocate_pages) return; }
-#ifdef CONFIG_ASCEND_SHARE_POOL /* unmap a sharepool vm area will cause meamleak! */ - if (area->flags & VM_SHAREPOOL) { + if (is_vmalloc_sharepool(area->flags)) { WARN(1, KERN_ERR "Memory leak due to vfree() sharepool vm area (%p) !\n", addr); return; } -#endif + + if (is_vmalloc_huge(area->flags)) + page_order = PMD_SHIFT - PAGE_SHIFT;
debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); @@ -2384,14 +2386,14 @@ static void __vunmap(const void *addr, int deallocate_pages) if (deallocate_pages) { int i;
- for (i = 0; i < area->nr_pages; i += 1U << area->page_order) { + for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i];
BUG_ON(!page); if (sp_is_enabled()) sp_free_pages(page, area); else - __free_pages(page, area->page_order); + __free_pages(page, page_order); }
kvfree(area->pages); @@ -2591,7 +2593,6 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
area->pages = pages; area->nr_pages = nr_pages; - area->page_order = page_order;
for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page; @@ -2659,27 +2660,17 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail;
- if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL))) { - unsigned long size_per_node; - + if (vmap_allow_huge && (pgprot_val(prot) == pgprot_val(PAGE_KERNEL)) && is_vmalloc_huge(vm_flags)) { /* - * Try huge pages. Only try for PAGE_KERNEL allocations, - * others like modules don't yet expect huge pages in - * their allocations due to apply_to_page_range not - * supporting them. + * Alloc huge pages. Only valid for PAGE_KERNEL allocations and + * VM_HUGE_PAGES flags. */
- size_per_node = size; - if (node == NUMA_NO_NODE && !sp_is_enabled()) - size_per_node /= num_online_nodes(); - if (size_per_node >= PMD_SIZE) { - shift = PMD_SHIFT; - align = max(real_align, 1UL << shift); - size = ALIGN(real_size, 1UL << shift); - } + shift = PMD_SHIFT; + align = max(real_align, 1UL << shift); + size = ALIGN(real_size, 1UL << shift); }
-again: size = PAGE_ALIGN(size); area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); @@ -2708,12 +2699,6 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, return addr;
fail: - if (shift > PAGE_SHIFT) { - shift = PAGE_SHIFT; - align = real_align; - size = real_size; - goto again; - }
if (!area) { /* Warn for area allocation, page allocations already warn */ @@ -3778,7 +3763,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " %pS", v->caller);
if (v->nr_pages) - seq_printf(m, " pages=%d order=%d", v->nr_pages, v->page_order); + seq_printf(m, " pages=%d", v->nr_pages);
if (v->phys_addr) seq_printf(m, " phys=%pa", &v->phys_addr); @@ -3798,8 +3783,8 @@ static int s_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages");
- if (sp_is_enabled()) - seq_printf(m, " order=%d", v->page_order); + if (is_vmalloc_huge(v->flags)) + seq_printf(m, " order=%d", PMD_SHIFT - PAGE_SHIFT);
show_numa_info(m, v); seq_putc(m, '\n');
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
do_munmap() is called in sp_munmap_task_areas(), so we must hold mm->mmap_sem around it.
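The rule being applied, as a minimal sketch mirroring the two-line fix below (the surrounding error handling in sp_group_add_task() is elided):

/* do_munmap() walks and modifies the VMA tree, so the target mm's
 * mmap_sem must be held for writing around the whole unmap loop.
 */
down_write(&mm->mmap_sem);
sp_munmap_task_areas(mm, spa->link.next);
up_write(&mm->mmap_sem);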
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 36e33d1de4f8c..9bf872ed680eb 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -661,7 +661,9 @@ int sp_group_add_task(int pid, int spg_id) "failed (potential no enough memory): %d " "spa type is %d\n", ret, spa->type); } + down_write(&mm->mmap_sem); sp_munmap_task_areas(mm, spa->link.next); + up_write(&mm->mmap_sem); spin_lock(&sp_area_lock); break; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
KASAN report: [ 127.094921] BUG: KASAN: use-after-free in rb_next+0x18/0xa8 [ 127.095591] Read of size 8 at addr ffff8000cffb0130 by task cat/642 [ 127.096169] [ 127.096935] CPU: 1 PID: 642 Comm: cat Tainted: G OE 4.19.170+ #168 [ 127.097499] Hardware name: linux,dummy-virt (DT) [ 127.098200] Call trace: [ 127.098508] dump_backtrace+0x0/0x268 [ 127.098885] show_stack+0x24/0x30 [ 127.099241] dump_stack+0x104/0x15c [ 127.099754] print_address_description+0x68/0x278 [ 127.100317] kasan_report+0x208/0x328 [ 127.100683] __asan_load8+0x84/0xa8 [ 127.101035] rb_next+0x18/0xa8 [ 127.101355] spa_stat_show+0x148/0x378 [ 127.101746] seq_read+0x160/0x730 [ 127.102106] proc_reg_read+0xac/0x100 [ 127.102492] do_iter_read+0x248/0x290 [ 127.102860] vfs_readv+0xe4/0x140 [ 127.103220] default_file_splice_read+0x298/0x4e0 [ 127.103765] do_splice_to+0xa8/0xe0 [ 127.104179] splice_direct_to_actor+0x180/0x3d8 [ 127.104603] do_splice_direct+0x100/0x178 [ 127.104991] do_sendfile+0x2ec/0x520 [ 127.105363] __arm64_sys_sendfile64+0x204/0x250 [ 127.105792] el0_svc_common+0xb0/0x2d0 [ 127.106168] el0_svc_handler+0x40/0x90 [ 127.106523] el0_svc+0x10/0x248
The reason is that __sp_area_drop_locked(spa) may free the spa and its corresponding rbtree node. rb_next(node) then reads the freed node: a use-after-free.
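The fix follows a common pattern for walking an rbtree when the lock has to be dropped inside the loop: pin the current entry before unlocking, and release the previous entry only after the lock is re-taken, so rb_next() never reads a freed node. A condensed sketch with the share pool names (the printing done while unlocked is elided):

struct rb_node *node;
struct sp_area *spa, *prev = NULL;

spin_lock(&sp_area_lock);
for (node = rb_first(&sp_area_root); node; node = rb_next(node)) {
	/* Safe to drop the previous entry now: the lock is held again
	 * and rb_next() has already advanced past its node.
	 */
	__sp_area_drop_locked(prev);

	spa = rb_entry(node, struct sp_area, rb_node);
	prev = spa;
	atomic_inc(&spa->use_count);	/* pin spa across the unlock */
	spin_unlock(&sp_area_lock);

	/* ... seq_printf() the spa fields while unlocked ... */

	spin_lock(&sp_area_lock);
}
__sp_area_drop_locked(prev);		/* release the last pinned entry */
spin_unlock(&sp_area_lock);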
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 9bf872ed680eb..3099365745816 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2513,12 +2513,15 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, static void rb_spa_stat_show(struct seq_file *seq) { struct rb_node *node; - struct sp_area *spa; + struct sp_area *spa, *prev = NULL;
spin_lock(&sp_area_lock);
for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + __sp_area_drop_locked(prev); + spa = rb_entry(node, struct sp_area, rb_node); + prev = spa; atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
@@ -2557,9 +2560,8 @@ static void rb_spa_stat_show(struct seq_file *seq) seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count));
spin_lock(&sp_area_lock); - __sp_area_drop_locked(spa); } - + __sp_area_drop_locked(prev); spin_unlock(&sp_area_lock); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
With the new fine-grained locking design, ESPGMMEXIT is no longer needed.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 3099365745816..7d6299743dd74 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -51,7 +51,6 @@ #define AC_SINGLE_OWNER 1
#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) -#define ESPGMMEXIT 4000
#define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) @@ -1617,18 +1616,13 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa,
list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { ret_addr = sp_remap_kva_to_vma(kva, spa, mm); - if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { + if (IS_ERR_VALUE(ret_addr)) { pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); __sp_free(spg, spa->va_start, spa_size(spa), mm); p = ERR_PTR(ret_addr); goto out; }
- if (ret_addr == -ESPGMMEXIT) { - pr_info("share pool: remap k2u, ret is -ESPGMMEXIT\n"); - continue; - } - uva = ret_addr; } p = (void *)uva;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: doc bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Some of the comments are outdated and need to be updated.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++-- mm/share_pool.c | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 4a18c88d5a10e..26e44d51fd849 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -74,7 +74,7 @@ struct sp_group { struct file *file_hugetlb; /* list head of processes */ struct list_head procs; - /* list of sp_area */ + /* list of sp_area. it is protected by spin_lock sp_area_lock */ struct list_head spa_list; /* number of sp_area */ atomic_t spa_num; @@ -95,7 +95,7 @@ struct sp_group { unsigned long dvpp_va_start; unsigned long dvpp_size; atomic_t use_count; - /* protect the group internal elements */ + /* protect the group internal elements, except spa_list */ struct rw_semaphore rw_lock; };
diff --git a/mm/share_pool.c b/mm/share_pool.c index 7d6299743dd74..44fdc693baf61 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -574,7 +574,18 @@ int sp_group_add_task(int pid, int spg_id) goto out_unlock; }
- /* current thread may be exiting in a multithread process */ + /* + * group_leader: current thread may be exiting in a multithread process + * + * DESIGN IDEA + * We increase mm->mm_users deliberately to ensure it's decreased in + * share pool under only 2 circumstances, which will simply the overall + * design as mm won't be freed unexpectedly. + * + * The corresponding refcount decrements are as follows: + * 1. the error handling branch of THIS function. + * 2. In sp_group_exit(). It's called only when process is exiting. + */ mm = get_task_mm(tsk->group_leader); if (!mm) { ret = -ESRCH; @@ -677,6 +688,7 @@ int sp_group_add_task(int pid, int spg_id) idr_remove(&sp_stat_idr, mm->sp_stat_id); kfree(stat); mm->sp_stat_id = 0; + /* spg->procs is modified, spg->rw_lock should be put below */ list_del(&mm->sp_node); mm->sp_group = NULL; } @@ -686,7 +698,7 @@ int sp_group_add_task(int pid, int spg_id) if (unlikely(ret)) __sp_group_drop_locked(spg); out_put_mm: - /* No need to put the mm if the sp group add this mm success.*/ + /* No need to put the mm if the sp group adds this mm successfully */ if (unlikely(ret)) mmput(mm); out_put_task: @@ -1504,6 +1516,7 @@ static unsigned long __sp_remap_get_pfn(unsigned long kva) return pfn; }
+/* when called by k2u to group, always make sure rw_lock of spg is down */ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, struct mm_struct *mm) { @@ -2857,8 +2870,11 @@ void sp_group_exit(struct mm_struct *mm)
spg = mm->sp_group;
- /* If the mm_users is 2, it means that the mm is ready to be freed - because the last owner of this mm is in exiting process. + /* + * Recall we add mm->users by 1 deliberately in sp_group_add_task(). + * If the mm_users is 2, it means that the mm is ready to be freed + * because the last owner of this mm is in exiting procedure: + * do_exit() -> exit_mm() -> mmput() -> THIS function. */ if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { spg_exit_lock(&unlock); @@ -2870,6 +2886,7 @@ void sp_group_exit(struct mm_struct *mm) if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); + /* match with get_task_mm() in sp_group_add_task() */ atomic_dec(&mm->mm_users); spg_exit_unlock(unlock); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
struct sp_proc_stat for a process using the share pool may be accessed concurrently. If its counters use type atomic64_t, locks are no longer needed.
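A minimal sketch of the accounting pattern after the conversion; the helper names are invented for the example, while the field names match the patch.

#include <linux/atomic.h>

/* Sketch: updates become lock-free RMW operations ... */
static void sp_stat_add_alloc(struct sp_proc_stat *stat, long size)
{
	atomic64_add(size, &stat->alloc_size);
}

/* ... and readers sample the counter without holding any lock. */
static long sp_stat_alloc_kb(struct sp_proc_stat *stat)
{
	return atomic64_read(&stat->alloc_size) >> 10;	/* byte2kb() in the real code */
}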
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++-- mm/oom_kill.c | 4 +++- mm/share_pool.c | 47 +++++++++++++++++++++++--------------- 3 files changed, 33 insertions(+), 22 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 26e44d51fd849..356781bfe3e0a 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -115,8 +115,8 @@ struct sp_proc_stat { * alloc amount minus free amount, may be negative when freed by * another task in the same sp group. */ - long alloc_size; - long k2u_size; + atomic64_t alloc_size; + atomic64_t k2u_size; };
#ifdef CONFIG_ASCEND_SHARE_POOL diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0c77331492384..9554786cfddcd 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -489,7 +489,9 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) if (!stat) pr_cont("%-9c %-9c ", '-', '-'); else - pr_cont("%-9ld %-9ld ", (stat->alloc_size) >> 10, (stat->k2u_size) >> 10); /* byte to KB */ + pr_cont("%-9ld %-9ld ", /* byte to KB */ + atomic64_read(&stat->alloc_size) >> 10, + atomic64_read(&stat->k2u_size) >> 10); pr_cont("%8ld %8lu %5hd %s\n", mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), diff --git a/mm/share_pool.c b/mm/share_pool.c index 44fdc693baf61..3bb1a3f5b4e12 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -113,7 +113,8 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return ERR_PTR(-ENOMEM); }
- stat->alloc_size = stat->k2u_size = 0; + atomic64_set(&stat->alloc_size, 0); + atomic64_set(&stat->k2u_size, 0); stat->mm = mm; get_task_comm(stat->comm, tsk); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); @@ -736,6 +737,7 @@ static void spg_exit_unlock(bool unlock) void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; + long alloc_size, k2u_size; bool unlock;
if (!enable_ascend_share_pool || !mm->sp_group) @@ -757,13 +759,15 @@ void sp_group_post_exit(struct mm_struct *mm) * * We decide to print a info when seeing both of the scenarios. */ - if (stat && (stat->alloc_size != 0 || stat->k2u_size != 0)) - pr_info("share pool: process %s(%d) of sp group %d exits. " - "It applied %ld aligned KB, k2u shared %ld aligned " - "KB\n", - stat->comm, mm->sp_stat_id, - mm->sp_group->id, byte2kb(stat->alloc_size), - byte2kb(stat->k2u_size)); + if (stat) { + alloc_size = atomic64_read(&stat->alloc_size); + k2u_size = atomic64_read(&stat->k2u_size); + if (alloc_size != 0 || k2u_size != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB, k2u shared %ld aligned KB\n", + stat->comm, mm->sp_stat_id, mm->sp_group->id, + byte2kb(alloc_size), byte2kb(k2u_size)); + }
idr_remove(&sp_stat_idr, mm->sp_stat_id);
@@ -1217,11 +1221,11 @@ int sp_free(unsigned long addr) mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { - kthread_stat.alloc_size -= spa->real_size; + atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->alloc_size -= spa->real_size; + atomic64_sub(spa->real_size, &stat->alloc_size); else BUG(); } @@ -1464,7 +1468,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (!IS_ERR(p)) { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->alloc_size += size_aligned; + atomic64_add(size_aligned, &stat->alloc_size); } mutex_unlock(&sp_mutex);
@@ -1824,7 +1828,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (!IS_ERR(uva)) { mutex_lock(&sp_mutex); uva = uva + (kva - kva_aligned); - stat->k2u_size += size_aligned; + atomic64_add(size_aligned, &stat->k2u_size); mutex_unlock(&sp_mutex); } else { /* associate vma and spa */ @@ -2254,11 +2258,11 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { - kthread_stat.k2u_size -= spa->real_size; + atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); if (stat) - stat->k2u_size -= spa->real_size; + atomic64_sub(spa->real_size, &stat->k2u_size); else WARN(1, "share_pool: %s: null process stat\n", __func__); } @@ -2510,7 +2514,9 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%-8s %-9s %-13s\n", "Group_ID", "SP_ALLOC", "HugePage Fail"); seq_printf(m, "%-8d %-9ld %-13d\n", - spg->id, byte2kb(stat->alloc_size), spg->hugepage_failures); + spg->id, + byte2kb(atomic64_read(&stat->alloc_size)), + spg->hugepage_failures); } mutex_unlock(&sp_mutex);
@@ -2729,8 +2735,10 @@ static int idr_proc_stat_cb(int id, void *p, void *data) else seq_printf(seq, "%-8d ", spg_id); seq_printf(seq, "%-9ld %-9ld %-9ld %-10ld %-8ld %-7ld %-7ld %-10ld\n", - byte2kb(stat->alloc_size), byte2kb(stat->k2u_size), sp_res, - non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), + byte2kb(atomic64_read(&stat->alloc_size)), + byte2kb(atomic64_read(&stat->k2u_size)), + sp_res, non_sp_res, + page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); mmput(mm);
@@ -2750,8 +2758,9 @@ static int proc_stat_show(struct seq_file *seq, void *offset) "Non-SP_RES", "VIRT", "RES", "Shm", "Non-SP_Shm"); /* print kthread buff_module_guard_work */ seq_printf(seq, "%-8s %-8s %-9ld %-9ld\n", - "guard", "-", byte2kb(kthread_stat.alloc_size), - byte2kb(kthread_stat.k2u_size)); + "guard", "-", + byte2kb(atomic64_read(&kthread_stat.alloc_size)), + byte2kb(atomic64_read(&kthread_stat.k2u_size))); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Introduce the rw_semaphore sp_stat_sem; it only protects the idr operations on sp_stat_idr.
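Condensed, the new rule is that idr writers take sp_stat_sem exclusively while lookups only take it shared; sp_get_proc_stat() from earlier in the series becomes the canonical reader. A sketch of the two sides (declarations elided):

/* Writer side: insertion into (or removal from) sp_stat_idr. */
down_write(&sp_stat_sem);
ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL);
up_write(&sp_stat_sem);

/* Reader side: lookup only needs shared access. */
down_read(&sp_stat_sem);
stat = idr_find(&sp_stat_idr, tgid);
up_read(&sp_stat_sem);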
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 66 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 23 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 3bb1a3f5b4e12..f8c61c1dd573b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -23,6 +23,7 @@ #include <linux/mm_types.h> #include <linux/idr.h> #include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/slab.h> #include <linux/rbtree.h> @@ -83,6 +84,8 @@ static DEFINE_IDA(sp_group_id_ida);
/* idr of all sp_proc_stats */ static DEFINE_IDR(sp_stat_idr); +/* rw semaphore for sp_stat_idr */ +static DECLARE_RWSEM(sp_stat_sem);
/* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0}; @@ -100,7 +103,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, int ret;
if (id) { - stat = idr_find(&sp_stat_idr, id); + stat = sp_get_proc_stat(id); /* other threads in the same process may have initialized it */ if (stat) return stat; @@ -117,7 +120,10 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, atomic64_set(&stat->k2u_size, 0); stat->mm = mm; get_task_comm(stat->comm, tsk); + + down_write(&sp_stat_sem); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + up_write(&sp_stat_sem); if (ret < 0) { if (printk_ratelimit()) pr_err("share pool: proc stat idr alloc failed %d\n", ret); @@ -686,15 +692,20 @@ int sp_group_add_task(int pid, int spg_id) spin_unlock(&sp_area_lock);
if (unlikely(ret)) { - idr_remove(&sp_stat_idr, mm->sp_stat_id); - kfree(stat); - mm->sp_stat_id = 0; /* spg->procs is modified, spg->rw_lock should be put below */ list_del(&mm->sp_node); mm->sp_group = NULL; } - up_write(&spg->rw_lock); + + if (unlikely(ret)) { + down_write(&sp_stat_sem); + idr_remove(&sp_stat_idr, mm->sp_stat_id); + up_write(&sp_stat_sem); + kfree(stat); + mm->sp_stat_id = 0; + } + out_drop_group: if (unlikely(ret)) __sp_group_drop_locked(spg); @@ -743,10 +754,7 @@ void sp_group_post_exit(struct mm_struct *mm) if (!enable_ascend_share_pool || !mm->sp_group) return;
- spg_exit_lock(&unlock); - - /* pointer stat must be valid, we don't need to check sanity */ - stat = idr_find(&sp_stat_idr, mm->sp_stat_id); + stat = sp_get_proc_stat(mm->sp_stat_id); /* * There are two basic scenarios when a process in the share pool is * exiting but its share pool memory usage is not 0. @@ -769,8 +777,11 @@ void sp_group_post_exit(struct mm_struct *mm) byte2kb(alloc_size), byte2kb(k2u_size)); }
+ down_write(&sp_stat_sem); idr_remove(&sp_stat_idr, mm->sp_stat_id); + up_write(&sp_stat_sem);
+ spg_exit_lock(&unlock); __sp_group_drop_locked(mm->sp_group); spg_exit_unlock(unlock);
@@ -1223,7 +1234,7 @@ int sp_free(unsigned long addr) if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->alloc_size); else @@ -1466,7 +1477,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
mutex_lock(&sp_mutex); if (!IS_ERR(p)) { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_add(size_aligned, &stat->alloc_size); } @@ -2260,7 +2271,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { - stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); else @@ -2488,9 +2499,9 @@ struct sp_proc_stat *sp_get_proc_stat(int tgid) { struct sp_proc_stat *stat;
- mutex_lock(&sp_mutex); + down_read(&sp_stat_sem); stat = idr_find(&sp_stat_idr, tgid); - mutex_unlock(&sp_mutex); + up_read(&sp_stat_sem);
/* maybe NULL or not, we always return it */ return stat; @@ -2501,22 +2512,28 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, { struct sp_group *spg = NULL; struct sp_proc_stat *stat; + int spg_id, hugepage_failures;
mutex_lock(&sp_mutex); spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); if (spg_valid(spg)) { - stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); - if (!stat) { - mutex_unlock(&sp_mutex); + spg_id = spg->id; + hugepage_failures = spg->hugepage_failures; + mutex_unlock(&sp_mutex); + + /* eliminate potential ABBA deadlock */ + stat = sp_get_proc_stat(task->mm->sp_stat_id); + if (!stat) return 0; - } + /* print the file header */ seq_printf(m, "%-8s %-9s %-13s\n", "Group_ID", "SP_ALLOC", "HugePage Fail"); seq_printf(m, "%-8d %-9ld %-13d\n", - spg->id, + spg_id, byte2kb(atomic64_read(&stat->alloc_size)), - spg->hugepage_failures); + hugepage_failures); + return 0; } mutex_unlock(&sp_mutex);
@@ -2704,11 +2721,11 @@ static int idr_proc_stat_cb(int id, void *p, void *data) long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
mutex_lock(&sp_mutex); - if (!mmget_not_zero(mm)) - goto out_unlock; /* * a task which is the target of k2u(to task) but without adding to a * sp group should be handled correctly. + * No longer mmget_not_zero(mm) but a process (k2u to task) may have + * problem */ spg = __sp_find_spg(id, SPG_ID_DEFAULT); if (!spg_valid(spg)) { @@ -2740,7 +2757,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_res, non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); - mmput(mm);
out_unlock: mutex_unlock(&sp_mutex); @@ -2761,7 +2777,11 @@ static int proc_stat_show(struct seq_file *seq, void *offset) "guard", "-", byte2kb(atomic64_read(&kthread_stat.alloc_size)), byte2kb(atomic64_read(&kthread_stat.k2u_size))); + + /* pay attention to potential ABBA deadlock */ + down_read(&sp_stat_sem); idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + up_read(&sp_stat_sem); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
This commit aims to accomplish our main goal: remove the 'big lock' sp_mutex.
We introduce the rw_semaphore sp_group_sem; it only protects the idr operations on sp_group_idr.
The critical sections originally protected by sp_mutex are divided into four main parts:
1. idr operations on sp_group_idr, now protected by sp_group_sem.
2. idr operations on sp_stat_idr, now protected by sp_stat_sem.
3. access to the non-atomic members of struct sp_group, now protected by the rw_semaphore spg->rw_lock.
4. access to the accounting members of struct sp_proc_stat, which have been converted to atomic types.
All of this work is done, so sp_mutex can be removed safely. However, we decide to keep sp_mutex; it can be used for inter-group operations in the future.
Meanwhile, we eliminate the ambiguity of spg_valid().
Currently a false return from the macro spg_valid(spg) has two meanings: 1. spg is NULL; 2. spg is not NULL but the group is dead. This is not a good design; we can make it simpler and clearer.
In the new implementation, spg_valid() only covers the second meaning. This is especially beneficial when taking spg->rw_lock before calling spg_valid(): at that point spg must already be non-NULL, so the NULL check inside spg_valid() is unnecessary.
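Put together, a hedged sketch of the expected call pattern under the new convention (mirroring sp_group_id_by_pid() in the hunks below):

struct sp_group *spg;
int spg_id = -ENODEV;

spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
if (!spg)			/* first meaning is handled by the caller */
	return -ENODEV;

down_read(&spg->rw_lock);
if (spg_valid(spg))		/* now only means "the group is alive" */
	spg_id = spg->id;
up_read(&spg->rw_lock);

return spg_id;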
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 232 ++++++++++++++++++++++++++---------------------- 1 file changed, 124 insertions(+), 108 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f8c61c1dd573b..167f990d8d9e2 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -51,7 +51,7 @@ #define AC_NONE 0 #define AC_SINGLE_OWNER 1
-#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) +#define spg_valid(spg) ((spg)->is_alive == true)
#define byte2kb(size) ((size) >> 10) #define byte2mb(size) ((size) >> 20) @@ -71,10 +71,13 @@ int sysctl_sp_debug_mode;
int sysctl_share_pool_map_lock_enable;
+/* for inter-group operations */ +static DEFINE_MUTEX(sp_mutex); + /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); - -static DEFINE_MUTEX(sp_mutex); +/* rw semaphore for sp_group_idr */ +static DECLARE_RWSEM(sp_group_sem);
static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain);
@@ -91,7 +94,7 @@ static DECLARE_RWSEM(sp_stat_sem); static struct sp_proc_stat kthread_stat = {0};
/* - * The caller must hold sp_mutex and ensure no concurrency problem + * The caller must ensure no concurrency problem * for task_struct and mm_struct. */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, @@ -195,10 +198,15 @@ int sysctl_share_pool_hugepage_enable = 1;
static void free_sp_group(struct sp_group *spg);
+/* the caller make sure spg is not NULL */ static bool sp_group_get(struct sp_group *spg) { - if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) + down_read(&spg->rw_lock); + if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) { + up_read(&spg->rw_lock); return true; + } + up_read(&spg->rw_lock);
return false; } @@ -296,6 +304,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
static void free_sp_group_id(unsigned int spg_id) { + /* ida operation is protected by an internal spin_lock */ if ((spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) || (spg_id >= SPG_ID_DVPP_PASS_THROUGH_MIN && spg_id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) @@ -306,7 +315,9 @@ static void free_sp_group(struct sp_group *spg) { fput(spg->file); fput(spg->file_hugetlb); + down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg->id); + up_write(&sp_group_sem); free_sp_group_id((unsigned int)spg->id); kfree(spg); } @@ -343,9 +354,9 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id)
put_task_struct(tsk); } else { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem); }
return spg; @@ -358,12 +369,15 @@ int sp_group_id_by_pid(int pid)
check_interrupt_context();
- mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg) + return -ENODEV; + + down_read(&spg->rw_lock); if (spg_valid(spg)) spg_id = spg->id; + up_read(&spg->rw_lock);
- mutex_unlock(&sp_mutex); return spg_id; } EXPORT_SYMBOL_GPL(sp_group_id_by_pid); @@ -375,7 +389,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) int ret; char name[20];
+ down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); + up_read(&sp_group_sem); + if (!spg) { struct user_struct *user = NULL; int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; @@ -392,7 +409,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) atomic64_set(&spg->alloc_nsize, 0); atomic64_set(&spg->alloc_hsize, 0); atomic64_set(&spg->alloc_size, 0); - spg->is_alive = false; + spg->is_alive = true; spg->hugepage_failures = 0; spg->dvpp_multi_spaces = false; spg->owner = current->group_leader; @@ -402,8 +419,10 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
init_rwsem(&spg->rw_lock);
- ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, + down_write(&sp_group_sem); + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, GFP_KERNEL); + up_write(&sp_group_sem); if (ret < 0) { if (printk_ratelimit()) pr_err("share pool: create group idr alloc failed\n"); @@ -441,7 +460,9 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) out_fput: fput(spg->file); out_idr: + down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg_id); + up_write(&sp_group_sem); out_kfree: kfree(spg); return ERR_PTR(ret); @@ -484,12 +505,8 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) /* The caller must hold sp_mutex. */ static void __sp_group_drop_locked(struct sp_group *spg) { - bool is_alive = spg->is_alive; - - if (atomic_dec_and_test(&spg->use_count)) { - BUG_ON(is_alive); + if (atomic_dec_and_test(&spg->use_count)) free_sp_group(spg); - } }
/** @@ -527,16 +544,25 @@ int sp_group_add_task(int pid, int spg_id) }
if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); + up_read(&sp_group_sem); + + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: spg %d hasn't been created\n", spg_id); + return -EINVAL; + } + + down_read(&spg->rw_lock); if (!spg_valid(spg)) { - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock); if (printk_ratelimit()) pr_err("share pool: task add group failed because group id %d " - "hasn't been create or dead\n", spg_id); + "is dead\n", spg_id); return -EINVAL; } - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock); }
if (spg_id == SPG_ID_AUTO) { @@ -629,8 +655,6 @@ int sp_group_add_task(int pid, int spg_id) mm->sp_group = spg;
down_write(&spg->rw_lock); - /* We reactive the spg even the spg exists already. */ - spg->is_alive = true; list_add_tail(&mm->sp_node, &spg->procs); /* * create mappings of existing shared memory segments into this @@ -721,37 +745,13 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-static void spg_exit_lock(bool *unlock) -{ - switch (mutex_trylock_recursive(&sp_mutex)) { - case MUTEX_TRYLOCK_RECURSIVE: - *unlock = false; - break; - case MUTEX_TRYLOCK_FAILED: - mutex_lock(&sp_mutex); - *unlock = true; - break; - case MUTEX_TRYLOCK_SUCCESS: - *unlock = true; - break; - default: - BUG(); - } -} - -static void spg_exit_unlock(bool unlock) -{ - if (unlock) - mutex_unlock(&sp_mutex); -} - void sp_group_post_exit(struct mm_struct *mm) { struct sp_proc_stat *stat; + struct sp_group *spg = mm->sp_group; long alloc_size, k2u_size; - bool unlock;
- if (!enable_ascend_share_pool || !mm->sp_group) + if (!spg || !enable_ascend_share_pool) return;
stat = sp_get_proc_stat(mm->sp_stat_id); @@ -781,9 +781,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
- spg_exit_lock(&unlock); - __sp_group_drop_locked(mm->sp_group); - spg_exit_unlock(unlock); + __sp_group_drop_locked(spg);
kfree(stat); } @@ -1229,7 +1227,6 @@ int sp_free(unsigned long addr)
up_read(&spa->spg->rw_lock);
- mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.alloc_size); @@ -1240,7 +1237,6 @@ int sp_free(unsigned long addr) else BUG(); } - mutex_unlock(&sp_mutex);
drop_spa: __sp_area_drop(spa); @@ -1346,22 +1342,23 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) spg = current->mm->sp_group; } else { /* other scenes */ if (spg_id != SPG_ID_DEFAULT) { - mutex_lock(&sp_mutex); + down_read(&sp_group_sem); /* the caller should be a member of the sp group */ if (spg != idr_find(&sp_group_idr, spg_id)) { - mutex_unlock(&sp_mutex); - goto out; + up_read(&sp_group_sem); + return ERR_PTR(-EINVAL); } - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem); } }
+ down_read(&spg->rw_lock); if (!spg_valid(spg)) { - pr_err("share pool: sp alloc failed, spg is invalid\n"); - goto out; + up_read(&spg->rw_lock); + pr_err("share pool: sp alloc failed, spg is dead\n"); + return ERR_PTR(-ENODEV); }
- down_read(&spg->rw_lock); if (sp_flags & SP_HUGEPAGE) { file = spg->file_hugetlb; size_aligned = ALIGN(size, PMD_SIZE); @@ -1475,13 +1472,11 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) out: up_read(&spg->rw_lock);
- mutex_lock(&sp_mutex); if (!IS_ERR(p)) { stat = sp_get_proc_stat(current->mm->sp_stat_id); if (stat) atomic64_add(size_aligned, &stat->alloc_size); } - mutex_unlock(&sp_mutex);
/* this will free spa if mmap failed */ if (spa && !IS_ERR(spa)) @@ -1544,7 +1539,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; unsigned long addr, buf, offset;
- if (spg_valid(spa->spg)) { + if (spa->spg != NULL) { /* k2u to group */ file = spa_file(spa); } else { @@ -1703,7 +1698,7 @@ static bool vmalloc_area_clr_flag(struct sp_area *spa, unsigned long kva, unsign void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long sp_flags, int pid, int spg_id) { - void *uva = ERR_PTR(-ENODEV); + void *uva; struct sp_group *spg; struct sp_area *spa; unsigned long kva_aligned; @@ -1754,7 +1749,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, goto out_put_task; }
- mutex_lock(&sp_mutex); /* * Process statistics initialization. if the target process has been * added to a sp group, then stat will be returned immediately. @@ -1762,12 +1756,10 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { - mutex_unlock(&sp_mutex); uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out_unlock; + goto out; } - mutex_unlock(&sp_mutex);
spg = __sp_find_spg(pid, SPG_ID_DEFAULT); if (spg == NULL) { @@ -1776,7 +1768,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out_unlock; + goto out; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { @@ -1785,7 +1777,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out_unlock; + goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { @@ -1795,16 +1787,20 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
uva = sp_make_share_kva_to_task(kva_aligned, spa, mm); - } else if (spg_valid(spg)) { + goto accounting; + } + + down_read(&spg->rw_lock); + if (spg_valid(spg)) { /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { + up_read(&spg->rw_lock); if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out_unlock; + goto out; }
- down_read(&spg->rw_lock); if (enable_share_k2u_spg) spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); else @@ -1817,10 +1813,11 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out_unlock; + goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); goto out_drop_spa; } @@ -1830,17 +1827,17 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, else uva = sp_make_share_kva_to_task(kva_aligned, spa, mm);
- up_read(&spg->rw_lock); } else { /* group is dead, return -ENODEV */ pr_err("share pool: failed to make k2u, sp group is dead\n"); + uva = ERR_PTR(-ENODEV); } + up_read(&spg->rw_lock);
+accounting: if (!IS_ERR(uva)) { - mutex_lock(&sp_mutex); uva = uva + (kva - kva_aligned); atomic64_add(size_aligned, &stat->k2u_size); - mutex_unlock(&sp_mutex); } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) @@ -1850,7 +1847,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
out_drop_spa: __sp_area_drop(spa); -out_unlock: +out: mmput(mm); out_put_task: put_task_struct(tsk); @@ -2243,12 +2240,21 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
+ if (unlikely(!spa->spg)) { + WARN(1, "share pool: unshare uva NULL spg pointer\n"); + ret = -EINVAL; + goto out_drop_area; + } + + down_read(&spa->spg->rw_lock); if (!spg_valid(spa->spg)) { + up_read(&spa->spg->rw_lock); if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to group), " - "spa doesn't belong to a sp group or group is dead\n"); + "sp group of spa is dead\n"); goto out_clr_flag; } + up_read(&spa->spg->rw_lock);
/* alway allow kthread and dvpp channel destroy procedure */ if (current->mm && current->mm->sp_group != spa->spg) { @@ -2266,7 +2272,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
sp_dump_stack();
- mutex_lock(&sp_mutex); /* pointer stat may be invalid because of kthread buff_module_guard_work */ if (current->mm == NULL) { atomic64_sub(spa->real_size, &kthread_stat.k2u_size); @@ -2275,9 +2280,8 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); else - WARN(1, "share_pool: %s: null process stat\n", __func__); + WARN(1, "share pool: %s: null process stat\n", __func__); } - mutex_unlock(&sp_mutex);
out_clr_flag: /* deassociate vma and spa */ @@ -2453,17 +2457,21 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) size > MMAP_SHARE_POOL_16G_SIZE) return false;
- mutex_lock(&sp_mutex); spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg) + return false; + + down_write(&spg->rw_lock); if (!spg_valid(spg) || spg->dvpp_multi_spaces == true) { - mutex_unlock(&sp_mutex); + up_write(&spg->rw_lock); return false; } spg->dvpp_va_start = start; spg->dvpp_size = size; spg->dvpp_multi_spaces = true; + up_write(&spg->rw_lock); + host_svm_sp_enable = true; - mutex_unlock(&sp_mutex);
return true; } @@ -2514,12 +2522,15 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct sp_proc_stat *stat; int spg_id, hugepage_failures;
- mutex_lock(&sp_mutex); spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); + if (!spg) + return 0; + + down_read(&spg->rw_lock); if (spg_valid(spg)) { spg_id = spg->id; hugepage_failures = spg->hugepage_failures; - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock);
/* eliminate potential ABBA deadlock */ stat = sp_get_proc_stat(task->mm->sp_stat_id); @@ -2535,7 +2546,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, hugepage_failures); return 0; } - mutex_unlock(&sp_mutex); + up_read(&spg->rw_lock);
return 0; } @@ -2555,12 +2566,16 @@ static void rb_spa_stat_show(struct seq_file *seq) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
- mutex_lock(&sp_mutex); - if (spg_valid(spa->spg)) - seq_printf(seq, "%-10d ", spa->spg->id); - else /* k2u for task or spg is dead */ + if (!spa->spg) /* k2u to task */ seq_printf(seq, "%-10s ", "None"); - mutex_unlock(&sp_mutex); + else { + down_read(&spa->spg->rw_lock); + if (spg_valid(spa->spg)) /* k2u to group */ + seq_printf(seq, "%-10d ", spa->spg->id); + else /* spg is dead */ + seq_printf(seq, "%-10s ", "Dead"); + up_read(&spa->spg->rw_lock); + }
seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", "0x", spa->va_start, @@ -2682,9 +2697,9 @@ void spg_overview_show(struct seq_file *seq) atomic_read(&spg_stat.spa_total_num)); }
- mutex_lock(&sp_mutex); + down_read(&sp_group_sem); idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); - mutex_unlock(&sp_mutex); + up_read(&sp_group_sem);
if (seq != NULL) seq_puts(seq, "\n"); @@ -2720,7 +2735,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) */ long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
- mutex_lock(&sp_mutex); /* * a task which is the target of k2u(to task) but without adding to a * sp group should be handled correctly. @@ -2728,6 +2742,10 @@ static int idr_proc_stat_cb(int id, void *p, void *data) * problem */ spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (!spg) + goto out; + + down_read(&spg->rw_lock); if (!spg_valid(spg)) { spg_id = 0; sp_alloc_nsize = 0; @@ -2737,6 +2755,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_alloc_nsize = byte2kb(atomic64_read(&spg->alloc_nsize)); sp_res = byte2kb(atomic64_read(&spg->alloc_size)); } + up_read(&spg->rw_lock);
anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES); @@ -2758,9 +2777,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm);
-out_unlock: - mutex_unlock(&sp_mutex); - +out: return 0; }
@@ -2891,34 +2908,33 @@ EXPORT_SYMBOL(sharepool_no_page);
void sp_group_exit(struct mm_struct *mm) { - struct sp_group *spg = NULL; - bool is_alive = true, unlock; + struct sp_group *spg = mm->sp_group; + bool is_alive;
- if (!enable_ascend_share_pool) + if (!spg || !enable_ascend_share_pool) return;
- spg = mm->sp_group; - /* * Recall we add mm->users by 1 deliberately in sp_group_add_task(). * If the mm_users is 2, it means that the mm is ready to be freed * because the last owner of this mm is in exiting procedure: * do_exit() -> exit_mm() -> mmput() -> THIS function. */ + down_write(&spg->rw_lock); if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { - spg_exit_lock(&unlock); - down_write(&spg->rw_lock); if (list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; - list_del(&mm->sp_node); + list_del(&mm->sp_node); /* affect spg->procs */ up_write(&spg->rw_lock); + if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); /* match with get_task_mm() in sp_group_add_task() */ atomic_dec(&mm->mm_users); - spg_exit_unlock(unlock); + return; } + up_write(&spg->rw_lock); }
struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
__sp_group_drop_locked() actually doesn't need to be protected by any lock, so rename it to sp_group_drop().
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 167f990d8d9e2..16aadd3ebefa3 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -503,7 +503,7 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) }
/* The caller must hold sp_mutex. */ -static void __sp_group_drop_locked(struct sp_group *spg) +static void sp_group_drop(struct sp_group *spg) { if (atomic_dec_and_test(&spg->use_count)) free_sp_group(spg); @@ -732,7 +732,7 @@ int sp_group_add_task(int pid, int spg_id)
out_drop_group: if (unlikely(ret)) - __sp_group_drop_locked(spg); + sp_group_drop(spg); out_put_mm: /* No need to put the mm if the sp group adds this mm successfully */ if (unlikely(ret)) @@ -781,7 +781,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
- __sp_group_drop_locked(spg); + sp_group_drop(spg);
kfree(stat); } @@ -2922,6 +2922,7 @@ void sp_group_exit(struct mm_struct *mm) */ down_write(&spg->rw_lock); if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + /* a dead group should NOT be reactive again */ if (list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; list_del(&mm->sp_node); /* affect spg->procs */
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: doc bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Update outdated comments.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 16aadd3ebefa3..3e77992d954c2 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -322,7 +322,6 @@ static void free_sp_group(struct sp_group *spg) kfree(spg); }
-/* The caller must hold sp_mutex. */ static struct sp_group *__sp_find_spg(int pid, int spg_id) { struct sp_group *spg; @@ -382,7 +381,6 @@ int sp_group_id_by_pid(int pid) } EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
-/* The caller must hold sp_mutex. */ static struct sp_group *find_or_alloc_sp_group(int spg_id) { struct sp_group *spg; @@ -470,7 +468,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
static void __sp_area_drop_locked(struct sp_area *spa);
-/* The caller must hold sp_mutex. */ +/* The caller must down_write(&mm->mmap_sem) */ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) { struct sp_area *spa, *prev = NULL; @@ -502,7 +500,6 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) spin_unlock(&sp_area_lock); }
-/* The caller must hold sp_mutex. */ static void sp_group_drop(struct sp_group *spg) { if (atomic_dec_and_test(&spg->use_count)) @@ -818,8 +815,6 @@ static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_r * Allocate a region of VA from the share pool. * @size - the size of VA to allocate * - * The caller must hold must sp_mutex when input parameter spg is not NULL - * * Return NULL if fail. */ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, @@ -1135,7 +1130,12 @@ static void sp_try_to_compact(void) sp_add_work_compact(); }
-/* The caller must hold sp_mutex. */ +/* + * The function calls of do_munmap() won't change any non-atomic member + * of struct sp_group. Please review the following chain: + * do_munmap -> remove_vma_list -> remove_vma -> sp_area_drop -> + * __sp_area_drop_locked -> sp_free_area + */ static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size) { @@ -1152,7 +1152,6 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, up_write(&mm->mmap_sem); }
-/* The caller must hold sp_mutex. */ static void __sp_free(struct sp_group *spg, unsigned long addr, unsigned long size, struct mm_struct *stop) { @@ -2657,7 +2656,7 @@ void spa_overview_show(struct seq_file *seq) } }
-/* the caller must hold sp_mutex */ +/* the caller must hold sp_group_sem */ static int idr_spg_stat_cb(int id, void *p, void *data) { struct sp_group *spg = p;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
After obtaining a pointer to an sp_group spg via __sp_find_spg(), the memory of spg may be released if spg is dead and free_sp_group() is called.
To solve this problem, we increase the refcount of spg when calling __sp_find_spg(). Users should call sp_group_drop() when they have finished using it.
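To make the new lifetime rule concrete, here is a minimal sketch of the intended pattern, condensed from the diff below (error handling, the pid-based lookup path and the rw_lock details are omitted): a lookup only succeeds when the group's use_count can be raised from a non-zero value, and every successful lookup is paired with sp_group_drop().

/* illustrative sketch only, condensed from mm/share_pool.c in this patch */
static struct sp_group *__sp_find_spg_locked(int pid, int spg_id)
{
	struct sp_group *spg = idr_find(&sp_group_idr, spg_id);

	/* don't revive a dead group: take a reference only while use_count > 0 */
	if (!spg || !atomic_inc_not_zero(&spg->use_count))
		return NULL;
	return spg;
}

static void sp_group_drop(struct sp_group *spg)
{
	if (atomic_dec_and_test(&spg->use_count))
		free_sp_group(spg);	/* the last reference frees the group */
}

/* every caller now follows the get/use/drop shape */
spg = __sp_find_spg(pid, SPG_ID_DEFAULT);
if (spg) {
	/* ... use spg, typically under spg->rw_lock ... */
	sp_group_drop(spg);
}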
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 131 +++++++++++++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 51 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 3e77992d954c2..445ee392b585d 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -196,21 +196,6 @@ static bool host_svm_sp_enable = false;
int sysctl_share_pool_hugepage_enable = 1;
-static void free_sp_group(struct sp_group *spg); - -/* the caller make sure spg is not NULL */ -static bool sp_group_get(struct sp_group *spg) -{ - down_read(&spg->rw_lock); - if (spg_valid(spg) && atomic_inc_not_zero(&spg->use_count)) { - up_read(&spg->rw_lock); - return true; - } - up_read(&spg->rw_lock); - - return false; -} - static unsigned long spa_size(struct sp_area *spa) { return spa->real_size; @@ -322,7 +307,8 @@ static void free_sp_group(struct sp_group *spg) kfree(spg); }
-static struct sp_group *__sp_find_spg(int pid, int spg_id) +/* user must call sp_group_drop() after use */ +static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) { struct sp_group *spg; int ret = 0; @@ -347,20 +333,41 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) task_lock(tsk); if (tsk->mm == NULL) spg = NULL; - else + else { spg = tsk->mm->sp_group; + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; + } task_unlock(tsk);
put_task_struct(tsk); } else { - down_read(&sp_group_sem); spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + /* don't revive a dead group */ + if (!spg || !atomic_inc_not_zero(&spg->use_count)) + spg = NULL; }
return spg; }
+static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + + down_read(&sp_group_sem); + spg = __sp_find_spg_locked(pid, spg_id); + up_read(&sp_group_sem); + return spg; +} + +static void sp_group_drop(struct sp_group *spg) +{ + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group(spg); +} + int sp_group_id_by_pid(int pid) { struct sp_group *spg; @@ -377,6 +384,7 @@ int sp_group_id_by_pid(int pid) spg_id = spg->id; up_read(&spg->rw_lock);
+ sp_group_drop(spg); return spg_id; } EXPORT_SYMBOL_GPL(sp_group_id_by_pid); @@ -387,9 +395,8 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) int ret; char name[20];
- down_read(&sp_group_sem); - spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + down_write(&sp_group_sem); + spg = __sp_find_spg_locked(current->pid, spg_id);
if (!spg) { struct user_struct *user = NULL; @@ -401,6 +408,15 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) pr_err("share pool: alloc spg failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); } + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, + GFP_KERNEL); + up_write(&sp_group_sem); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: create group idr alloc failed\n"); + goto out_kfree; + } + spg->id = spg_id; atomic_set(&spg->spa_num, 0); atomic64_set(&spg->size, 0); @@ -417,16 +433,6 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
init_rwsem(&spg->rw_lock);
- down_write(&sp_group_sem); - ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, - GFP_KERNEL); - up_write(&sp_group_sem); - if (ret < 0) { - if (printk_ratelimit()) - pr_err("share pool: create group idr alloc failed\n"); - goto out_kfree; - } - sprintf(name, "sp_group_%d", spg_id); spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, VM_NORESERVE); @@ -449,8 +455,15 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) goto out_fput; } } else { - if (!sp_group_get(spg)) + up_write(&sp_group_sem); + down_read(&spg->rw_lock); + if (!spg_valid(spg)) { + up_read(&spg->rw_lock); + sp_group_drop(spg); return ERR_PTR(-ENODEV); + } + up_read(&spg->rw_lock); + /* spg->use_count has increased due to __sp_find_spg() */ }
return spg; @@ -500,12 +513,6 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) spin_unlock(&sp_area_lock); }
-static void sp_group_drop(struct sp_group *spg) -{ - if (atomic_dec_and_test(&spg->use_count)) - free_sp_group(spg); -} - /** * sp_group_add_task - add a process to an sp_group * @pid: the pid of the task to be added @@ -541,9 +548,7 @@ int sp_group_add_task(int pid, int spg_id) }
if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { - down_read(&sp_group_sem); - spg = idr_find(&sp_group_idr, spg_id); - up_read(&sp_group_sem); + spg = __sp_find_spg(pid, spg_id);
if (!spg) { if (printk_ratelimit()) @@ -557,9 +562,12 @@ int sp_group_add_task(int pid, int spg_id) if (printk_ratelimit()) pr_err("share pool: task add group failed because group id %d " "is dead\n", spg_id); + sp_group_drop(spg); return -EINVAL; } up_read(&spg->rw_lock); + + sp_group_drop(spg); }
if (spg_id == SPG_ID_AUTO) { @@ -778,6 +786,7 @@ void sp_group_post_exit(struct mm_struct *mm) idr_remove(&sp_stat_idr, mm->sp_stat_id); up_write(&sp_stat_sem);
+ /* match with sp_group_add_task -> find_or_alloc_sp_group */ sp_group_drop(spg);
kfree(stat); @@ -1286,12 +1295,12 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, */ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) { - struct sp_group *spg = NULL; + struct sp_group *spg, *spg_tmp; struct sp_area *spa = NULL; struct sp_proc_stat *stat; unsigned long sp_addr; unsigned long mmap_addr; - void *p = ERR_PTR(-ENODEV); + void *p; /* return value */ struct mm_struct *mm; struct file *file; unsigned long size_aligned; @@ -1339,21 +1348,28 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) return ERR_PTR(ret); } spg = current->mm->sp_group; + /* + * increase use_count deliberately, due to __sp_find_spg is + * matched with sp_group_drop + */ + atomic_inc(&spg->use_count); } else { /* other scenes */ if (spg_id != SPG_ID_DEFAULT) { - down_read(&sp_group_sem); - /* the caller should be a member of the sp group */ - if (spg != idr_find(&sp_group_idr, spg_id)) { - up_read(&sp_group_sem); - return ERR_PTR(-EINVAL); + spg_tmp = __sp_find_spg(current->pid, spg_id); + if (spg != spg_tmp) { + sp_group_drop(spg); + if (spg_tmp) + sp_group_drop(spg_tmp); + return ERR_PTR(-ENODEV); } - up_read(&sp_group_sem); + sp_group_drop(spg_tmp); } }
down_read(&spg->rw_lock); if (!spg_valid(spg)) { up_read(&spg->rw_lock); + sp_group_drop(spg); pr_err("share pool: sp alloc failed, spg is dead\n"); return ERR_PTR(-ENODEV); } @@ -1481,6 +1497,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (spa && !IS_ERR(spa)) __sp_area_drop(spa);
+ sp_group_drop(spg); + sp_dump_stack(); sp_try_to_compact(); return p; @@ -1797,6 +1815,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); + sp_group_drop(spg); goto out; }
@@ -1812,12 +1831,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; + sp_group_drop(spg); goto out; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); + sp_group_drop(spg); goto out_drop_spa; }
@@ -1832,6 +1853,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = ERR_PTR(-ENODEV); } up_read(&spg->rw_lock); + sp_group_drop(spg);
accounting: if (!IS_ERR(uva)) { @@ -2472,6 +2494,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid)
host_svm_sp_enable = true;
+ sp_group_drop(spg); return true; } EXPORT_SYMBOL_GPL(sp_config_dvpp_range); @@ -2533,8 +2556,10 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns,
/* eliminate potential ABBA deadlock */ stat = sp_get_proc_stat(task->mm->sp_stat_id); - if (!stat) + if (unlikely(!stat)) { + sp_group_drop(spg); return 0; + }
/* print the file header */ seq_printf(m, "%-8s %-9s %-13s\n", @@ -2543,10 +2568,13 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, spg_id, byte2kb(atomic64_read(&stat->alloc_size)), hugepage_failures); + + sp_group_drop(spg); return 0; } up_read(&spg->rw_lock);
+ sp_group_drop(spg); return 0; }
@@ -2755,6 +2783,7 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_res = byte2kb(atomic64_read(&spg->alloc_size)); } up_read(&spg->rw_lock); + sp_group_drop(spg);
anon = get_mm_counter(mm, MM_ANONPAGES); file = get_mm_counter(mm, MM_FILEPAGES);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
After obtaining a pointer to a struct sp_proc_stat via sp_get_proc_stat(), the memory of stat may be released if the target process exits.
To solve this problem, we increase the refcount of stat when calling sp_get_proc_stat(). Users should call sp_proc_stat_drop() when they have finished using it.
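The pattern mirrors the earlier sp_group change; the one subtlety worth spelling out is the initial reference count of 2 set in sp_init_proc_stat(): one reference belongs to sp_stat_idr and one is returned to the caller, so the two sp_proc_stat_drop() calls in the exit path really do free the object. A condensed sketch of the lifetime, pieced together from the hunks below (not a drop-in replacement):

/* creation: use_count = 2, one for sp_stat_idr, one for the caller */
atomic_set(&stat->use_count, 2);
idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL);

/* any later user: get raises the count, drop releases it */
stat = sp_get_proc_stat(tgid);
if (stat) {
	atomic64_sub(spa->real_size, &stat->k2u_size);
	sp_proc_stat_drop(stat);
}

/* the last drop removes the idr entry and frees the stat */
static void free_sp_proc_stat(struct sp_proc_stat *stat)
{
	stat->mm->sp_stat_id = 0;
	down_write(&sp_stat_sem);
	idr_remove(&sp_stat_idr, stat->tgid);
	up_write(&sp_stat_sem);
	kfree(stat);
}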
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 7 +++ mm/oom_kill.c | 4 +- mm/share_pool.c | 120 +++++++++++++++++++++++++------------ 3 files changed, 91 insertions(+), 40 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 356781bfe3e0a..d94d48f57798c 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -109,6 +109,8 @@ struct sp_walk_data {
/* per process memory usage statistics indexed by tgid */ struct sp_proc_stat { + atomic_t use_count; + int tgid; struct mm_struct *mm; char comm[TASK_COMM_LEN]; /* @@ -170,6 +172,7 @@ extern int sp_unregister_notifier(struct notifier_block *nb); extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); extern bool is_sharepool_addr(unsigned long addr); extern struct sp_proc_stat *sp_get_proc_stat(int tgid); +extern void sp_proc_stat_drop(struct sp_proc_stat *stat); extern void spa_overview_show(struct seq_file *seq); extern void spg_overview_show(struct seq_file *seq); extern void proc_sharepool_init(void); @@ -373,6 +376,10 @@ static inline struct sp_proc_stat *sp_get_proc_stat(int tgid) return NULL; }
+static inline void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ +} + static inline void spa_overview_show(struct seq_file *seq) { } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9554786cfddcd..0fc1c15062825 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -488,10 +488,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) task->tgid, task->mm->total_vm, get_mm_rss(task->mm)); if (!stat) pr_cont("%-9c %-9c ", '-', '-'); - else + else { pr_cont("%-9ld %-9ld ", /* byte to KB */ atomic64_read(&stat->alloc_size) >> 10, atomic64_read(&stat->k2u_size) >> 10); + sp_proc_stat_drop(stat); + } pr_cont("%8ld %8lu %5hd %s\n", mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), diff --git a/mm/share_pool.c b/mm/share_pool.c index 445ee392b585d..8727a096107b3 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -87,47 +87,72 @@ static DEFINE_IDA(sp_group_id_ida);
/* idr of all sp_proc_stats */ static DEFINE_IDR(sp_stat_idr); -/* rw semaphore for sp_stat_idr */ +/* rw semaphore for sp_stat_idr and mm->sp_stat_id */ static DECLARE_RWSEM(sp_stat_sem);
/* for kthread buff_module_guard_work */ static struct sp_proc_stat kthread_stat = {0};
+/* The caller must hold sp_stat_sem */ +static struct sp_proc_stat *sp_get_proc_stat_locked(int tgid) +{ + struct sp_proc_stat *stat; + + stat = idr_find(&sp_stat_idr, tgid); + if (stat) + atomic_inc(&stat->use_count); + + /* maybe NULL or not, we always return it */ + return stat; +} + /* * The caller must ensure no concurrency problem * for task_struct and mm_struct. + * + * The user must call sp_proc_stat_drop() after use. */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, struct mm_struct *mm) { struct sp_proc_stat *stat; - int id = mm->sp_stat_id; - int tgid = tsk->tgid; + int id, tgid = tsk->tgid; int ret;
+ down_write(&sp_stat_sem); + id = mm->sp_stat_id; if (id) { - stat = sp_get_proc_stat(id); /* other threads in the same process may have initialized it */ - if (stat) + stat = sp_get_proc_stat_locked(tgid); + if (stat) { + up_write(&sp_stat_sem); return stat; + } else { + /* if enter this branch, that's our mistake */ + pr_err("share pool: sp_init_proc_stat invalid id %d\n", id); + return ERR_PTR(-EBUSY); + } }
stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (stat == NULL) { + up_write(&sp_stat_sem); if (printk_ratelimit()) pr_err("share pool: alloc proc stat failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); }
+ /* use_count = 2: match with sp_proc_stat_drop */ + atomic_set(&stat->use_count, 2); atomic64_set(&stat->alloc_size, 0); atomic64_set(&stat->k2u_size, 0); + stat->tgid = tgid; stat->mm = mm; get_task_comm(stat->comm, tsk);
- down_write(&sp_stat_sem); ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); - up_write(&sp_stat_sem); if (ret < 0) { + up_write(&sp_stat_sem); if (printk_ratelimit()) pr_err("share pool: proc stat idr alloc failed %d\n", ret); kfree(stat); @@ -135,6 +160,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, }
mm->sp_stat_id = ret; + up_write(&sp_stat_sem); return stat; }
@@ -727,13 +753,10 @@ int sp_group_add_task(int pid, int spg_id) } up_write(&spg->rw_lock);
- if (unlikely(ret)) { - down_write(&sp_stat_sem); - idr_remove(&sp_stat_idr, mm->sp_stat_id); - up_write(&sp_stat_sem); - kfree(stat); - mm->sp_stat_id = 0; - } + /* double drop when fail: ensure release stat */ + if (unlikely(ret)) + sp_proc_stat_drop(stat); + sp_proc_stat_drop(stat); /* match with sp_init_proc_stat */
out_drop_group: if (unlikely(ret)) @@ -780,16 +803,15 @@ void sp_group_post_exit(struct mm_struct *mm) "It applied %ld aligned KB, k2u shared %ld aligned KB\n", stat->comm, mm->sp_stat_id, mm->sp_group->id, byte2kb(alloc_size), byte2kb(k2u_size)); - }
- down_write(&sp_stat_sem); - idr_remove(&sp_stat_idr, mm->sp_stat_id); - up_write(&sp_stat_sem); + /* match with sp_get_proc_stat in THIS function */ + sp_proc_stat_drop(stat); + /* match with sp_init_proc_stat, we expect stat is released after this call */ + sp_proc_stat_drop(stat); + }
/* match with sp_group_add_task -> find_or_alloc_sp_group */ sp_group_drop(spg); - - kfree(stat); }
/* the caller must hold sp_area_lock */ @@ -1240,9 +1262,10 @@ int sp_free(unsigned long addr) atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_sub(spa->real_size, &stat->alloc_size); - else + sp_proc_stat_drop(stat); + } else BUG(); }
@@ -1489,8 +1512,10 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
if (!IS_ERR(p)) { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_add(size_aligned, &stat->alloc_size); + sp_proc_stat_drop(stat); + } }
/* this will free spa if mmap failed */ @@ -1769,13 +1794,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, /* * Process statistics initialization. if the target process has been * added to a sp group, then stat will be returned immediately. - * I believe there is no need to free stat in error handling branches. */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { uva = stat; pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); - goto out; + goto out_put_mm; }
spg = __sp_find_spg(pid, SPG_ID_DEFAULT); @@ -1785,7 +1809,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out; + goto out_drop_proc_stat; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { @@ -1794,7 +1818,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out; + goto out_drop_proc_stat; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { @@ -1815,8 +1839,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (printk_ratelimit()) pr_err("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - sp_group_drop(spg); - goto out; + goto out_drop_spg; }
if (enable_share_k2u_spg) @@ -1831,14 +1854,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - sp_group_drop(spg); - goto out; + goto out_drop_spg; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); - sp_group_drop(spg); goto out_drop_spa; }
@@ -1853,7 +1874,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = ERR_PTR(-ENODEV); } up_read(&spg->rw_lock); - sp_group_drop(spg);
accounting: if (!IS_ERR(uva)) { @@ -1868,7 +1888,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
out_drop_spa: __sp_area_drop(spa); -out: +out_drop_spg: + if (spg) + sp_group_drop(spg); +out_drop_proc_stat: + sp_proc_stat_drop(stat); +out_put_mm: mmput(mm); out_put_task: put_task_struct(tsk); @@ -2298,9 +2323,10 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) + if (stat) { atomic64_sub(spa->real_size, &stat->k2u_size); - else + sp_proc_stat_drop(stat); + } else WARN(1, "share pool: %s: null process stat\n", __func__); }
@@ -2525,18 +2551,33 @@ __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
/*** Statistical and maintenance functions ***/
+/* user must call sp_proc_stat_drop() after use */ struct sp_proc_stat *sp_get_proc_stat(int tgid) { struct sp_proc_stat *stat;
down_read(&sp_stat_sem); - stat = idr_find(&sp_stat_idr, tgid); + stat = sp_get_proc_stat_locked(tgid); up_read(&sp_stat_sem); - - /* maybe NULL or not, we always return it */ return stat; }
+static void free_sp_proc_stat(struct sp_proc_stat *stat) +{ + stat->mm->sp_stat_id = 0; + down_write(&sp_stat_sem); + idr_remove(&sp_stat_idr, stat->tgid); + up_write(&sp_stat_sem); + kfree(stat); +} + +/* the caller make sure stat is not NULL */ +void sp_proc_stat_drop(struct sp_proc_stat *stat) +{ + if (atomic_dec_and_test(&stat->use_count)) + free_sp_proc_stat(stat); +} + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -2569,6 +2610,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, byte2kb(atomic64_read(&stat->alloc_size)), hugepage_failures);
+ sp_proc_stat_drop(stat); sp_group_drop(spg); return 0; }
From: Bixuan Cui cuibixuan@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The share pool feature belongs to the memory subsystem, so its sysctl entries are better placed in vm_table.
Signed-off-by: Bixuan Cui cuibixuan@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sysctl.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97a24290f0750..f27ed383227bb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1268,28 +1268,6 @@ static struct ctl_table kern_table[] = { .extra2 = &three, },
-#endif -#ifdef CONFIG_ASCEND_SHARE_POOL - { - /* 0: disable, 1: enable */ - .procname = "share_pool_hugepage_enable", - .data = &sysctl_share_pool_hugepage_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, - { - /* 0: map_unlock, 1: map_lock */ - .procname = "share_pool_map_lock_enable", - .data = &sysctl_share_pool_map_lock_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif { } }; @@ -1801,6 +1779,26 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + /* 0: disable, 1: enable */ + .procname = "share_pool_hugepage_enable", + .data = &sysctl_share_pool_hugepage_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + /* 0: map_unlock, 1: map_lock */ + .procname = "share_pool_map_lock_enable", + .data = &sysctl_share_pool_map_lock_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } };
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
When the share pool is disabled, the is_vmalloc_sharepool() stub declares its argument with a different type than the enabled version, which triggers a warning like this:
/include/linux/share_pool.h:462:20: note: expected ‘struct vm_struct *’ but argument is of type ‘long unsigned int’
Fix this warning.
Fixes: ad4504322d9e ("ascend: sharepool: don't enable the vmalloc to use hugepage default") Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index d94d48f57798c..859efd3525f35 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -466,12 +466,12 @@ static inline void *buff_vzalloc_hugepage_user(unsigned long size) return NULL; }
-static inline bool is_vmalloc_huge(struct vm_struct *vm) +static inline bool is_vmalloc_huge(unsigned long vm_flags) { return NULL; }
-static inline bool is_vmalloc_sharepool(struct vm_struct *vm) +static inline bool is_vmalloc_sharepool(unsigned long vm_flags) { return NULL; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
sysctl_share_pool_hugepage_enable is no longer used.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sysctl.c | 10 ---------- mm/share_pool.c | 2 -- 2 files changed, 12 deletions(-)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f27ed383227bb..cd2d114f3391c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1779,16 +1779,6 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, - { - /* 0: disable, 1: enable */ - .procname = "share_pool_hugepage_enable", - .data = &sysctl_share_pool_hugepage_enable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, { /* 0: map_unlock, 1: map_lock */ .procname = "share_pool_map_lock_enable", diff --git a/mm/share_pool.c b/mm/share_pool.c index 8727a096107b3..26fbe3d7cced0 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -220,8 +220,6 @@ static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; static bool host_svm_sp_enable = false;
-int sysctl_share_pool_hugepage_enable = 1; - static unsigned long spa_size(struct sp_area *spa) { return spa->real_size;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
sp_group_exit() and sp_group_post_exit() should be placed together, and so should free_sp_group() and sp_group_drop().
This makes the code more readable and improves the cache hit ratio.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 94 ++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 47 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 26fbe3d7cced0..37a5b94726f67 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -331,6 +331,12 @@ static void free_sp_group(struct sp_group *spg) kfree(spg); }
+static void sp_group_drop(struct sp_group *spg) +{ + if (atomic_dec_and_test(&spg->use_count)) + free_sp_group(spg); +} + /* user must call sp_group_drop() after use */ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) { @@ -386,12 +392,6 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) return spg; }
-static void sp_group_drop(struct sp_group *spg) -{ - if (atomic_dec_and_test(&spg->use_count)) - free_sp_group(spg); -} - int sp_group_id_by_pid(int pid) { struct sp_group *spg; @@ -771,47 +771,6 @@ int sp_group_add_task(int pid, int spg_id) } EXPORT_SYMBOL_GPL(sp_group_add_task);
-void sp_group_post_exit(struct mm_struct *mm) -{ - struct sp_proc_stat *stat; - struct sp_group *spg = mm->sp_group; - long alloc_size, k2u_size; - - if (!spg || !enable_ascend_share_pool) - return; - - stat = sp_get_proc_stat(mm->sp_stat_id); - /* - * There are two basic scenarios when a process in the share pool is - * exiting but its share pool memory usage is not 0. - * 1. Process A called sp_alloc(), but it terminates without calling - * sp_free(). Then its share pool memory usage is a positive number. - * 2. Process A never called sp_alloc(), and process B in the same spg - * called sp_alloc() to get an addr u. Then A gets u somehow and - * called sp_free(u). Now A's share pool memory usage is a negative - * number. Notice B's memory usage will be a positive number. - * - * We decide to print a info when seeing both of the scenarios. - */ - if (stat) { - alloc_size = atomic64_read(&stat->alloc_size); - k2u_size = atomic64_read(&stat->k2u_size); - if (alloc_size != 0 || k2u_size != 0) - pr_info("share pool: process %s(%d) of sp group %d exits. " - "It applied %ld aligned KB, k2u shared %ld aligned KB\n", - stat->comm, mm->sp_stat_id, mm->sp_group->id, - byte2kb(alloc_size), byte2kb(k2u_size)); - - /* match with sp_get_proc_stat in THIS function */ - sp_proc_stat_drop(stat); - /* match with sp_init_proc_stat, we expect stat is released after this call */ - sp_proc_stat_drop(stat); - } - - /* match with sp_group_add_task -> find_or_alloc_sp_group */ - sp_group_drop(spg); -} - /* the caller must hold sp_area_lock */ static void __insert_sp_area(struct sp_area *spa) { @@ -3006,6 +2965,47 @@ void sp_group_exit(struct mm_struct *mm) up_write(&spg->rw_lock); }
+void sp_group_post_exit(struct mm_struct *mm) +{ + struct sp_proc_stat *stat; + struct sp_group *spg = mm->sp_group; + long alloc_size, k2u_size; + + if (!spg || !enable_ascend_share_pool) + return; + + stat = sp_get_proc_stat(mm->sp_stat_id); + /* + * There are two basic scenarios when a process in the share pool is + * exiting but its share pool memory usage is not 0. + * 1. Process A called sp_alloc(), but it terminates without calling + * sp_free(). Then its share pool memory usage is a positive number. + * 2. Process A never called sp_alloc(), and process B in the same spg + * called sp_alloc() to get an addr u. Then A gets u somehow and + * called sp_free(u). Now A's share pool memory usage is a negative + * number. Notice B's memory usage will be a positive number. + * + * We decide to print a info when seeing both of the scenarios. + */ + if (stat) { + alloc_size = atomic64_read(&stat->alloc_size); + k2u_size = atomic64_read(&stat->k2u_size); + if (alloc_size != 0 || k2u_size != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB, k2u shared %ld aligned KB\n", + stat->comm, mm->sp_stat_id, mm->sp_group->id, + byte2kb(alloc_size), byte2kb(k2u_size)); + + /* match with sp_get_proc_stat in THIS function */ + sp_proc_stat_drop(stat); + /* match with sp_init_proc_stat, we expect stat is released after this call */ + sp_proc_stat_drop(stat); + } + + /* match with sp_group_add_task -> find_or_alloc_sp_group */ + sp_group_drop(spg); +} + struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node) {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
In the Linux implementation, RSS statistics may deviate by up to 64 pages (256KB), while share pool accounting is always precise. As a result, the calculated values may be negative and confuse users.
We decided to show zero when the result is negative. It is still imprecise, but arguably better.
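A rough worked example of why the clamp is needed (the numbers are hypothetical, chosen only to show the sign flip):

/* hypothetical values in KB: RSS counters may lag by up to 256KB */
long sp_alloc_nsize = 1024;		/* precisely tracked by share pool */
long total_rss_kb   = 1000;		/* reported RSS, lagging behind */
long non_sp_res     = total_rss_kb - sp_alloc_nsize;	/* -24, nonsensical */

non_sp_res = non_sp_res < 0 ? 0 : non_sp_res;		/* show 0 instead */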
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 37a5b94726f67..99e63bb7762ac 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2788,9 +2788,14 @@ static int idr_proc_stat_cb(int id, void *p, void *data) file = get_mm_counter(mm, MM_FILEPAGES); shmem = get_mm_counter(mm, MM_SHMEMPAGES); total_rss = anon + file + shmem; + /* + * Statistics of RSS has a maximum 64 pages deviation (256KB). + * Please check_sync_rss_stat(). + */ non_sp_res = page2kb(total_rss) - sp_alloc_nsize; + non_sp_res = non_sp_res < 0 ? 0 : non_sp_res; non_sp_shm = page2kb(shmem) - sp_alloc_nsize; - non_sp_shm = non_sp_shm < 0 ? 0 : non_sp_shm; /* to be investigated */ + non_sp_shm = non_sp_shm < 0 ? 0 : non_sp_shm;
seq_printf(seq, "%-8d ", id); if (spg_id == 0)
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
According to the comments in the kernel, printk_ratelimited() is not recommended; we should use pr_<level>_ratelimited() instead.
This also helps to reduce cyclomatic complexity.
In addition, %pK is not useful when we want to see the actual virtual address. We decided to use pr_debug() with %lx; only root users have permission to toggle /sys/kernel/debug/dynamic_debug/control, which meets the security requirements.
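For reference, the conversion applied throughout this patch follows two shapes (the lines below are representative of the hunks that follow, not an exhaustive list):

/* before: open-coded ratelimit check, %pK hashes the pointer */
if (printk_ratelimit())
	pr_err("share pool: sp free invalid input addr %pK\n", (void *)addr);

/* after: either the one-line ratelimited helper ... */
pr_err_ratelimited("share pool: allocation failed, invalid group id %d\n", spg_id);

/* ... or pr_debug() with %lx where the raw address matters and
 * dynamic debug (root-only) gates the output
 */
pr_debug("share pool: sp free invalid input addr %lx\n", (unsigned long)addr);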
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 212 ++++++++++++++++-------------------------------- 1 file changed, 72 insertions(+), 140 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 99e63bb7762ac..c674370204b62 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -129,7 +129,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return stat; } else { /* if enter this branch, that's our mistake */ - pr_err("share pool: sp_init_proc_stat invalid id %d\n", id); + pr_err_ratelimited("share pool: proc stat invalid id %d\n", id); return ERR_PTR(-EBUSY); } } @@ -137,8 +137,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, stat = kzalloc(sizeof(*stat), GFP_KERNEL); if (stat == NULL) { up_write(&sp_stat_sem); - if (printk_ratelimit()) - pr_err("share pool: alloc proc stat failed due to lack of memory\n"); + pr_err_ratelimited("share pool: alloc proc stat failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); }
@@ -153,8 +152,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); if (ret < 0) { up_write(&sp_stat_sem); - if (printk_ratelimit()) - pr_err("share pool: proc stat idr alloc failed %d\n", ret); + pr_err_ratelimited("share pool: proc stat idr alloc failed %d\n", ret); kfree(stat); return ERR_PTR(ret); } @@ -428,16 +426,14 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id)
spg = kzalloc(sizeof(*spg), GFP_KERNEL); if (spg == NULL) { - if (printk_ratelimit()) - pr_err("share pool: alloc spg failed due to lack of memory\n"); + pr_err_ratelimited("share pool: alloc spg failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); } ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, GFP_KERNEL); up_write(&sp_group_sem); if (ret < 0) { - if (printk_ratelimit()) - pr_err("share pool: create group idr alloc failed\n"); + pr_err_ratelimited("share pool: create group idr alloc failed\n"); goto out_kfree; }
@@ -461,9 +457,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, VM_NORESERVE); if (IS_ERR(spg->file)) { - if (printk_ratelimit()) - pr_err("share pool: file setup for small page failed %ld\n", - PTR_ERR(spg->file)); + pr_err("share pool: file setup for small page failed %ld\n", PTR_ERR(spg->file)); ret = PTR_ERR(spg->file); goto out_idr; } @@ -472,9 +466,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) VM_NORESERVE, &user, HUGETLB_ANONHUGE_INODE, hsize_log); if (IS_ERR(spg->file_hugetlb)) { - if (printk_ratelimit()) - pr_err("share pool: file setup for hugepage failed %ld\n", - PTR_ERR(spg->file_hugetlb)); + pr_err("share pool: file setup for hugepage failed %ld\n", PTR_ERR(spg->file_hugetlb)); ret = PTR_ERR(spg->file_hugetlb); goto out_fput; } @@ -566,8 +558,7 @@ int sp_group_add_task(int pid, int spg_id)
if ((spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) && spg_id != SPG_ID_DVPP_PASS_THROUGH) { - if (printk_ratelimit()) - pr_err("share pool: task add group failed due to invalid group id %d\n", spg_id); + pr_err_ratelimited("share pool: task add group failed, invalid group id %d\n", spg_id); return -EINVAL; }
@@ -575,17 +566,14 @@ int sp_group_add_task(int pid, int spg_id) spg = __sp_find_spg(pid, spg_id);
if (!spg) { - if (printk_ratelimit()) - pr_err("share pool: spg %d hasn't been created\n", spg_id); + pr_err_ratelimited("share pool: spg %d hasn't been created\n", spg_id); return -EINVAL; }
down_read(&spg->rw_lock); if (!spg_valid(spg)) { up_read(&spg->rw_lock); - if (printk_ratelimit()) - pr_err("share pool: task add group failed because group id %d " - "is dead\n", spg_id); + pr_err_ratelimited("share pool: task add group failed, group id %d is dead\n", spg_id); sp_group_drop(spg); return -EINVAL; } @@ -598,9 +586,7 @@ int sp_group_add_task(int pid, int spg_id) spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX, GFP_ATOMIC); if (spg_id < 0) { - if (printk_ratelimit()) - pr_err("share pool: task add group failed when automatically " - "generate group id failed\n"); + pr_err_ratelimited("share pool: task add group failed, auto generate group id failed\n"); return spg_id; } id_newly_generated = true; @@ -611,9 +597,7 @@ int sp_group_add_task(int pid, int spg_id) SPG_ID_DVPP_PASS_THROUGH_MIN, SPG_ID_DVPP_PASS_THROUGH_MAX, GFP_ATOMIC); if (spg_id < 0) { - if (printk_ratelimit()) - pr_err("share pool: task add group failed when automatically " - "generate group id failed in DVPP pass through\n"); + pr_err_ratelimited("share pool: task add group failed, DVPP auto generate group id failed\n"); return spg_id; } id_newly_generated = true; @@ -677,7 +661,7 @@ int sp_group_add_task(int pid, int spg_id) stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { ret = PTR_ERR(stat); - pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + pr_err_ratelimited("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); goto out_drop_group; }
@@ -726,11 +710,9 @@ int sp_group_add_task(int pid, int spg_id) if (populate) { ret = do_mm_populate(mm, spa->va_start, populate, 0); if (ret) { - if (printk_ratelimit()) { - pr_warn("share pool: task add group failed when mm populate " - "failed (potential no enough memory): %d " - "spa type is %d\n", ret, spa->type); - } + pr_warn_ratelimited("share pool: task add group failed, mm populate failed " + "(potential no enough memory when -12): %d, spa type is %d\n", + ret, spa->type); down_write(&mm->mmap_sem); sp_munmap_task_areas(mm, spa->link.next); up_write(&mm->mmap_sem); @@ -821,8 +803,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; } else { if (!spg) { - if (printk_ratelimit()) - pr_err("share pool: don't allow k2u(task) in host svm multiprocess scene\n"); + pr_err_ratelimited("share pool: don't allow k2u(task) in host svm multiprocess scene\n"); return ERR_PTR(-EINVAL); } vstart = spg->dvpp_va_start; @@ -832,8 +813,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags,
spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); if (unlikely(!spa)) { - if (printk_ratelimit()) - pr_err("share pool: alloc spa failed due to lack of memory\n"); + pr_err_ratelimited("share pool: alloc spa failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); }
@@ -1183,16 +1163,13 @@ int sp_free(unsigned long addr) } } else { /* spa == NULL */ ret = -EINVAL; - if (printk_ratelimit()) - pr_err("share pool: sp free invalid input addr %pK\n", (void *)addr); + pr_debug("share pool: sp free invalid input addr %lx\n", (unsigned long)addr); goto out; }
if (spa->type != SPA_TYPE_ALLOC) { ret = -EINVAL; - if (printk_ratelimit()) - pr_err("share pool: sp free failed, addr %pK is not from sp_alloc\n", - (void *)addr); + pr_debug("share pool: sp free failed, addr %lx is not from sp alloc\n", (unsigned long)addr); goto drop_spa; }
@@ -1296,14 +1273,12 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) spg_id = mdc_default_group_id;
if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { - if (printk_ratelimit()) - pr_err("share pool: allocation failed due to invalid group id %d\n", spg_id); + pr_err_ratelimited("share pool: allocation failed, invalid group id %d\n", spg_id); return ERR_PTR(-EINVAL); }
if (sp_flags & ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE | SP_DVPP)) { - if (printk_ratelimit()) - pr_err("share pool: allocation failed due to invalid flag %lu\n", sp_flags); + pr_err_ratelimited("share pool: allocation failed, invalid flag %lx\n", sp_flags); return ERR_PTR(-EINVAL); }
@@ -1323,8 +1298,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) * The judgment is added to prevent exit in this case. */ if (ret < 0 && (ret != -EEXIST)) { - pr_err("share pool: allocation failed due to add group error %d in DVPP pass through scenario", - ret); + pr_err_ratelimited("share pool: allocation failed, add group error %d in DVPP pass through\n", ret); return ERR_PTR(ret); } spg = current->mm->sp_group; @@ -1350,7 +1324,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (!spg_valid(spg)) { up_read(&spg->rw_lock); sp_group_drop(spg); - pr_err("share pool: sp alloc failed, spg is dead\n"); + pr_err_ratelimited("share pool: sp alloc failed, spg is dead\n"); return ERR_PTR(-ENODEV); }
@@ -1364,10 +1338,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) try_again: spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); if (IS_ERR(spa)) { - if (printk_ratelimit()) - pr_err("share pool: allocation failed due to alloc spa failure " - "(potential no enough virtual memory when -75): %ld\n", - PTR_ERR(spa)); + pr_err_ratelimited("share pool: allocation failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); p = spa; goto out; } @@ -1397,8 +1369,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) vma = find_vma(mm, sp_addr); if (unlikely(!vma)) { up_write(&mm->mmap_sem); - pr_err("share pool: allocation failed due to find %pK vma failure\n", - (void *)sp_addr); + pr_debug("share pool: allocation failed due to find %lx vma failure\n", (unsigned long)sp_addr); p = ERR_PTR(-EINVAL); goto out; } @@ -1435,10 +1406,8 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (ret) { __sp_free(spg, sp_addr, size_aligned, list_next_entry(mm, sp_node)); - - if (printk_ratelimit()) - pr_warn("share pool: allocation failed due to mm populate failed" - "(potential no enough memory when -12): %d\n", ret); + pr_warn_ratelimited("share pool: allocation failed due to mm populate failed" + "(potential no enough memory when -12): %d\n", ret); p = ERR_PTR(ret);
mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; @@ -1496,15 +1465,13 @@ static int is_vmap_hugepage(unsigned long addr) struct vm_struct *area;
if (unlikely(!addr)) { - if (printk_ratelimit()) - pr_err("share pool: null pointer when judge vmap addr\n"); + pr_err_ratelimited("share pool: null pointer when judge vmap addr\n"); return -EINVAL; }
area = find_vm_area((void *)addr); if (unlikely(!area)) { - if (printk_ratelimit()) - pr_err("share pool: failed to find vm area(%lx)\n", addr); + pr_err_ratelimited("share pool: failed to find vm area(%lx)\n", addr); return -EINVAL; }
@@ -1570,8 +1537,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { do_munmap(mm, ret_addr, spa_size(spa), NULL); - pr_err("share pool: remap vmalloc hugepage failed, " - "ret %d, kva is %pK\n", ret, (void *)kva); + pr_debug("share pool: remap vmalloc hugepage failed, " + "ret %d, kva is %lx\n", ret, (unsigned long)kva); ret_addr = ret; goto put_mm; } @@ -1711,8 +1678,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, check_interrupt_context();
if (sp_flags & ~SP_DVPP) { - if (printk_ratelimit()) - pr_err("share pool: k2u sp_flags %lu error\n", sp_flags); + pr_err_ratelimited("share pool: k2u sp_flags %lx error\n", sp_flags); return ERR_PTR(-EINVAL); }
@@ -1723,7 +1689,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else if (is_hugepage == 0) { /* do nothing */ } else { - pr_err("share pool: k2u kva not vmalloc address\n"); + pr_err_ratelimited("share pool: k2u kva not vmalloc address\n"); return ERR_PTR(is_hugepage); }
@@ -1755,7 +1721,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { uva = stat; - pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + pr_err_ratelimited("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); goto out_put_mm; }
@@ -1763,24 +1729,21 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (spg == NULL) { /* k2u to task */ if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { - if (printk_ratelimit()) - pr_err("share pool: k2task invalid spg id %d\n", spg_id); + pr_err_ratelimited("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); goto out_drop_proc_stat; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); if (IS_ERR(spa)) { - if (printk_ratelimit()) - pr_err("share pool: k2u(task) failed due to alloc spa failure " - "(potential no enough virtual memory when -75): %ld\n", - PTR_ERR(spa)); + pr_err_ratelimited("share pool: k2u(task) failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; goto out_drop_proc_stat; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); - pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); + pr_debug("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); goto out_drop_spa; }
@@ -1793,8 +1756,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { up_read(&spg->rw_lock); - if (printk_ratelimit()) - pr_err("share pool: k2spg invalid spg id %d\n", spg_id); + pr_err_ratelimited("share pool: k2spg invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); goto out_drop_spg; } @@ -1806,17 +1768,15 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
if (IS_ERR(spa)) { up_read(&spg->rw_lock); - if (printk_ratelimit()) - pr_err("share pool: k2u(spg) failed due to alloc spa failure " - "(potential no enough virtual memory when -75): %ld\n", - PTR_ERR(spa)); + pr_err_ratelimited("share pool: k2u(spg) failed due to alloc spa failure " + "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; goto out_drop_spg; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); - pr_err("share pool: %s: the kva %pK is not valid\n", __func__, (void *)kva_aligned); + pr_err("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); goto out_drop_spa; }
@@ -1827,7 +1787,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
} else { /* group is dead, return -ENODEV */ - pr_err("share pool: failed to make k2u, sp group is dead\n"); + pr_err_ratelimited("share pool: failed to make k2u, sp group is dead\n"); uva = ERR_PTR(-ENODEV); } up_read(&spg->rw_lock); @@ -1839,8 +1799,8 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) - pr_warn("share pool: %s: the kva %pK is not valid\n", - __func__, (void *)kva_aligned); + pr_warn("share pool: %s: the kva %lx is not valid\n", + __func__, (unsigned long)kva_aligned); }
out_drop_spa: @@ -1867,8 +1827,7 @@ static int sp_pte_entry(pte_t *pte, unsigned long addr, struct sp_walk_data *sp_walk_data;
if (unlikely(!pte_present(*pte))) { - if (printk_ratelimit()) - pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + pr_debug("share pool: the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); return -EFAULT; }
@@ -1894,9 +1853,7 @@ static int sp_test_walk(unsigned long addr, unsigned long next, static int sp_pte_hole(unsigned long start, unsigned long end, struct mm_walk *walk) { - if (printk_ratelimit()) - pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", - (void *)start, (void *)end); + pr_debug("share pool: hole [%lx, %lx) appeared unexpectedly\n", (unsigned long)start, (unsigned long)end); return -EFAULT; }
@@ -1909,9 +1866,7 @@ static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, struct sp_walk_data *sp_walk_data;
if (unlikely(!pte_present(pte))) { - if (printk_ratelimit()) - pr_err("share pool: the page of addr %pK unexpectedly " - "not in RAM\n", (void *)addr); + pr_err_ratelimited("share pool: the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); return -EFAULT; }
@@ -1967,8 +1922,7 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, */ vma = find_vma(mm, uva); if (!vma) { - if (printk_ratelimit()) - pr_err("share pool: u2k input uva %pK is invalid\n", (void *)uva); + pr_debug("share pool: u2k input uva %lx is invalid\n", (unsigned long)uva); return -EINVAL; } if ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma)) @@ -1995,16 +1949,14 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, size_aligned = ALIGN(uva + size, page_size) - uva_aligned;
if (uva_aligned + size_aligned < uva_aligned) { - if (printk_ratelimit()) - pr_err("share pool: overflow happened in walk page range\n"); + pr_err_ratelimited("share pool: overflow happened in walk page range\n"); return -EINVAL; }
page_nr = size_aligned / page_size; pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL); if (!pages) { - if (printk_ratelimit()) - pr_err("share pool: alloc page array failed in walk page range\n"); + pr_err_ratelimited("share pool: alloc page array failed in walk page range\n"); return -ENOMEM; } sp_walk_data->pages = pages; @@ -2076,7 +2028,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) down_write(&mm->mmap_sem); ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data); if (ret) { - pr_err("share pool: walk page range failed, ret %d\n", ret); + pr_err_ratelimited("share pool: walk page range failed, ret %d\n", ret); up_write(&mm->mmap_sem); mmput(mm); p = ERR_PTR(ret); @@ -2093,8 +2045,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) mmput(mm);
if (!p) { - if (printk_ratelimit()) - pr_err("share pool: vmap(huge) in u2k failed\n"); + pr_err("share pool: vmap(huge) in u2k failed\n"); __sp_walk_page_free(&sp_walk_data); p = ERR_PTR(-ENOMEM); goto out_put_task; @@ -2154,15 +2105,13 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); if (!spa) { ret = -EINVAL; - if (printk_ratelimit()) - pr_err("share pool: invalid input uva %pK in unshare uva\n", - (void *)uva); + pr_debug("share pool: invalid input uva %lx in unshare uva\n", (unsigned long)uva); goto out; } }
if (spa->type != SPA_TYPE_K2TASK && spa->type != SPA_TYPE_K2SPG) { - pr_err("share pool: this spa should not be unshare here\n"); + pr_err_ratelimited("share pool: this spa should not be unshare here\n"); ret = -EINVAL; goto out_drop_area; } @@ -2178,25 +2127,19 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
if (size_aligned < ALIGN(size, page_size)) { ret = -EINVAL; - if (printk_ratelimit()) - pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", - size); + pr_err_ratelimited("share pool: unshare uva failed due to invalid parameter size %lu\n", size); goto out_drop_area; }
if (spa->type == SPA_TYPE_K2TASK) { if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to task) failed, " - "invalid spg id %d\n", spg_id); + pr_err_ratelimited("share pool: unshare uva(to task) failed, invalid spg id %d\n", spg_id); ret = -EINVAL; goto out_drop_area; }
if (!spa->mm) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to task) failed, " - "none spa owner\n"); + pr_err_ratelimited("share pool: unshare uva(to task) failed, none spa owner\n"); ret = -EINVAL; goto out_drop_area; } @@ -2210,16 +2153,13 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp */ mm = get_task_mm(current->group_leader); if (!mm) { - if (printk_ratelimit()) - pr_info("share pool: no need to unshare uva(to task), " - "target process mm is exiting\n"); + pr_info_ratelimited("share pool: no need to unshare uva(to task), " + "target process mm is exiting\n"); goto out_clr_flag; }
if (spa->mm != mm) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to task) failed, " - "spa not belong to the task\n"); + pr_err_ratelimited("share pool: unshare uva(to task) failed, spa not belong to the task\n"); ret = -EINVAL; mmput(mm); goto out_drop_area; @@ -2236,9 +2176,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp } } else if (spa->type == SPA_TYPE_K2SPG) { if (spg_id < 0) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to group) failed, " - "invalid spg id %d\n", spg_id); + pr_err_ratelimited("share pool: unshare uva(to group) failed, invalid spg id %d\n", spg_id); ret = -EINVAL; goto out_drop_area; } @@ -2252,18 +2190,16 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp down_read(&spa->spg->rw_lock); if (!spg_valid(spa->spg)) { up_read(&spa->spg->rw_lock); - if (printk_ratelimit()) - pr_info("share pool: no need to unshare uva(to group), " - "sp group of spa is dead\n"); + pr_info_ratelimited("share pool: no need to unshare uva(to group), " + "sp group of spa is dead\n"); goto out_clr_flag; } up_read(&spa->spg->rw_lock);
/* alway allow kthread and dvpp channel destroy procedure */ if (current->mm && current->mm->sp_group != spa->spg) { - if (printk_ratelimit()) - pr_err("share pool: unshare uva(to group) failed, " - "caller process doesn't belong to target group\n"); + pr_err_ratelimited("share pool: unshare uva(to group) failed, " + "caller process doesn't belong to target group\n"); ret = -EINVAL; goto out_drop_area; } @@ -2318,14 +2254,12 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) step = PAGE_SIZE; is_hugepage = false; } else { - if (printk_ratelimit()) - pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); + pr_err_ratelimited("share pool: check vmap hugepage failed, ret %d\n", ret); return -EINVAL; }
if (kva_aligned + size_aligned < kva_aligned) { - if (printk_ratelimit()) - pr_err("share pool: overflow happened in unshare kva\n"); + pr_err_ratelimited("share pool: overflow happened in unshare kva\n"); return -EINVAL; }
@@ -2371,8 +2305,7 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) ret = sp_unshare_kva(va, size); } else { /* regard user and kernel address ranges as bad address */ - if (printk_ratelimit()) - pr_err("share pool: unshare addr %pK is not a user or kernel addr", (void *)va); + pr_debug("share pool: unshare addr %lx is not a user or kernel addr\n", (unsigned long)va); ret = -EFAULT; }
@@ -2393,8 +2326,7 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, check_interrupt_context();
if (unlikely(!sp_walk_data)) { - if (printk_ratelimit()) - pr_err("share pool: null pointer when walk page range\n"); + pr_err_ratelimited("share pool: null pointer when walk page range\n"); return -EINVAL; } if (!tsk || (tsk->flags & PF_EXITING))
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We encountered the following call trace:
[ 243.545984] Call trace:
[ 243.545984] find_vma+0x90/0xa0
[ 243.545985] find_extend_vma+0x2c/0xd0
[ 243.545985] __get_user_pages+0x94/0x378
[ 243.545985] get_dump_page+0x50/0x80
[ 243.545986] elf_core_dump+0x560/0x8d8
[ 243.545986] do_coredump+0x508/0xe40
[ 243.545986] get_signal+0x130/0x788
[ 243.545987] do_signal+0x1d4/0x290
[ 243.545987] do_notify_resume+0x150/0x1c0
[ 243.545988] work_pending+0x8/0x10
elf_core_dump() doesn't hold mmap_sem because the other threads in the same thread group have been killed and are blocked in exit_mm(), waiting to call coredump_finish().
However, share pool operations can modify the mm of any process in the same share group, which leads to concurrency problems when a coredump happens.
Solution: in share pool operations, check whether a coredump is in progress via mm->core_state.
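For illustration only, the guard added at each of the sites below boils down to the following pattern; sp_lock_mm_or_bail() is a hypothetical helper name and not something introduced by this patch:

#include <linux/errno.h>
#include <linux/mm_types.h>
#include <linux/printk.h>
#include <linux/rwsem.h>

/*
 * Hypothetical sketch of the guard: once a coredump has started,
 * mm->core_state is non-NULL, so back off instead of racing with the
 * dumper while holding mmap_sem for writing.
 */
static int sp_lock_mm_or_bail(struct mm_struct *mm)
{
	down_write(&mm->mmap_sem);
	if (unlikely(mm->core_state)) {
		up_write(&mm->mmap_sem);
		pr_info("share pool: encountered coredump, abort\n");
		return -EBUSY;
	}
	return 0;	/* caller proceeds and calls up_write() when done */
}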
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 52 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 5 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index c674370204b62..958e75ae4f7f3 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -696,6 +696,15 @@ int sp_group_add_task(int pid, int spg_id) }
down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + sp_munmap_task_areas(mm, &spa->link); + up_write(&mm->mmap_sem); + ret = -EBUSY; + pr_err("share pool: task add group: encountered coredump, abort\n"); + spin_lock(&sp_area_lock); + break; + } + addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(addr)) { sp_munmap_task_areas(mm, &spa->link); @@ -1110,6 +1119,11 @@ static void sp_munmap(struct mm_struct *mm, unsigned long addr, int err;
down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_sem); + pr_info("share pool: munmap: encountered coredump\n"); + return; + }
err = do_munmap(mm, addr, size, NULL); if (err) { @@ -1351,6 +1365,12 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) struct vm_area_struct *vma;
down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_sem); + pr_info("share pool: allocation encountered coredump\n"); + continue; + } + mmap_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(mmap_addr)) { up_write(&mm->mmap_sem); @@ -1521,6 +1541,11 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, }
down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + pr_err("share pool: k2u mmap: encountered coredump, abort\n"); + ret_addr = -EBUSY; + goto put_mm; + }
ret_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(ret_addr)) { @@ -2002,7 +2027,7 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) int ret = 0; struct task_struct *tsk; struct mm_struct *mm; - void *p = ERR_PTR(-ENODEV); + void *p = ERR_PTR(-ESRCH); struct sp_walk_data sp_walk_data = { .page_count = 0, }; @@ -2017,15 +2042,20 @@ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) else get_task_struct(tsk); rcu_read_unlock(); - if (ret) { - p = ERR_PTR(ret); + if (ret) goto out; - }
mm = get_task_mm(tsk); if (mm == NULL) goto out_put_task; down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + up_write(&mm->mmap_sem); + pr_err("share pool: u2k: encountered coredump, abort\n"); + mmput(mm); + goto out_put_task; + } + ret = __sp_walk_page_range(uva, size, mm, &sp_walk_data); if (ret) { pr_err_ratelimited("share pool: walk page range failed, ret %d\n", ret); @@ -2166,6 +2196,13 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp }
down_write(&mm->mmap_sem); + if (unlikely(mm->core_state)) { + ret = 0; + up_write(&mm->mmap_sem); + mmput(mm); + goto out_drop_area; + } + ret = do_munmap(mm, uva_aligned, size_aligned, NULL); up_write(&mm->mmap_sem); mmput(mm); @@ -2341,7 +2378,12 @@ int sp_walk_page_range(unsigned long uva, unsigned long size,
sp_walk_data->page_count = 0; down_write(&mm->mmap_sem); - ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); + if (likely(!mm->core_state)) + ret = __sp_walk_page_range(uva, size, mm, sp_walk_data); + else { + pr_err("share pool: walk page range: encountered coredump\n"); + ret = -ESRCH; + } up_write(&mm->mmap_sem);
mmput(mm);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
/proc/sharepool/spa_stat can now show the pid of the applier process, which helps with debugging and checking for memory leaks.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 958e75ae4f7f3..e7d8b63138a8b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -213,6 +213,7 @@ struct sp_area { enum spa_type type; /* where spa born from */ struct mm_struct *mm; /* owner of k2u(task) */ unsigned long kva; /* shared kva */ + pid_t applier; /* the original applier process */ }; static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; @@ -797,7 +798,8 @@ static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_r * Return NULL if fail. */ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, - struct sp_group *spg, enum spa_type type) + struct sp_group *spg, enum spa_type type, + pid_t applier) { struct sp_area *spa, *first, *err; struct rb_node *n; @@ -914,6 +916,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->type = type; spa->mm = NULL; spa->kva = 0; /* NULL pointer */ + spa->applier = applier;
if (spa_inc_usage(type, size, (flags & SP_DVPP))) { err = ERR_PTR(-EINVAL); @@ -1350,7 +1353,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) size_aligned = ALIGN(size, PAGE_SIZE); } try_again: - spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC, current->tgid); if (IS_ERR(spa)) { pr_err_ratelimited("share pool: allocation failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); @@ -1758,7 +1761,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, uva = ERR_PTR(-EINVAL); goto out_drop_proc_stat; } - spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK, tsk->tgid); if (IS_ERR(spa)) { pr_err_ratelimited("share pool: k2u(task) failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); @@ -1787,9 +1790,9 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, }
if (enable_share_k2u_spg) - spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG, tsk->tgid); else - spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK, tsk->tgid);
if (IS_ERR(spa)) { up_read(&spg->rw_lock); @@ -2577,7 +2580,7 @@ static void rb_spa_stat_show(struct seq_file *seq) up_read(&spa->spg->rw_lock); }
- seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", + seq_printf(seq, "%2s%-14lx %2s%-14lx %-10ld ", "0x", spa->va_start, "0x", spa->va_end, byte2kb(spa->real_size)); @@ -2602,7 +2605,8 @@ static void rb_spa_stat_show(struct seq_file *seq) else seq_printf(seq, "%-5s ", "N");
- seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count)); + seq_printf(seq, "%-8d ", spa->applier); + seq_printf(seq, "%-8d\n", atomic_read(&spa->use_count));
spin_lock(&sp_area_lock); } @@ -2712,8 +2716,8 @@ static int spa_stat_show(struct seq_file *seq, void *offset) spg_overview_show(seq); spa_overview_show(seq); /* print the file header */ - seq_printf(seq, "%-10s %-16s %-16s %-13s %-7s %-5s %-10s\n", - "Group ID", "va_start", "va_end", "Aligned KB", "Type", "Huge", "Ref"); + seq_printf(seq, "%-10s %-16s %-16s %-10s %-7s %-5s %-8s %-8s\n", + "Group ID", "va_start", "va_end", "Size(KB)", "Type", "Huge", "PID", "Ref"); rb_spa_stat_show(seq); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Make the function names more general: rename buff_vzalloc_user to vzalloc_user_account, and buff_vzalloc_hugepage_user to vzalloc_hugepage_user_account.
To support NUMA configuration, we also introduce a *node* parameter, i.e. the NUMA node id.
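As a usage illustration (not taken from the patch; the caller and sizes here are made up), the renamed helpers can now target a specific node and fall back to any node:

#include <linux/share_pool.h>
#include <linux/sizes.h>
#include <linux/numa.h>

/* Hypothetical caller: a 2 MB user-mappable, __GFP_ACCOUNT-charged
 * hugepage buffer, preferably on NUMA node @nid. */
static void *alloc_node_buffer(int nid)
{
	void *buf = vzalloc_hugepage_user_account(SZ_2M, nid);

	if (!buf)	/* fall back to the old behaviour: any node */
		buf = vzalloc_hugepage_user_account(SZ_2M, NUMA_NO_NODE);
	return buf;
}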
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 8 ++++---- mm/share_pool.c | 18 ++++++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 859efd3525f35..b3041654084d6 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -253,8 +253,8 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
extern void *vmalloc_hugepage(unsigned long size); extern void *vmalloc_hugepage_user(unsigned long size); -extern void *buff_vzalloc_user(unsigned long size); -extern void *buff_vzalloc_hugepage_user(unsigned long size); +extern void *vzalloc_user_account(unsigned long size, int node); +extern void *vzalloc_hugepage_user_account(unsigned long size, int node);
void sp_exit_mm(struct mm_struct *mm);
@@ -456,12 +456,12 @@ static inline void *vmalloc_hugepage_user(unsigned long size) return NULL; }
-static inline void *buff_vzalloc_user(unsigned long size) +static inline void *vzalloc_user_account(unsigned long size, int node) { return NULL; }
-static inline void *buff_vzalloc_hugepage_user(unsigned long size) +static inline void *vzalloc_hugepage_user_account(unsigned long size, int node) { return NULL; } diff --git a/mm/share_pool.c b/mm/share_pool.c index e7d8b63138a8b..bb7afb6c0b7c6 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3043,9 +3043,10 @@ void *vmalloc_hugepage_user(unsigned long size) EXPORT_SYMBOL(vmalloc_hugepage_user);
/** - * buff_vzalloc_user - allocate zeroed virtually contiguous memory + * vzalloc_user_account - allocate zeroed virtually contiguous memory * for userspace * @size: allocation size + * @node: NUMA node id * * The resulting memory area is zeroed so it can be mapped to userspace * without leaking data. @@ -3053,19 +3054,20 @@ EXPORT_SYMBOL(vmalloc_hugepage_user); * Compare to vmalloc_user(), this is a customized function because * __GFP_ACCOUNT is used to limit memory usage. */ -void *buff_vzalloc_user(unsigned long size) +void *vzalloc_user_account(unsigned long size, int node) { return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, - VM_USERMAP, NUMA_NO_NODE, + VM_USERMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(buff_vzalloc_user); +EXPORT_SYMBOL(vzalloc_user_account);
/** - * buff_vzalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * vzalloc_hugepage_user_account - allocate virtually contiguous hugetlb memory * for userspace * @size: allocation size + * @node: NUMA node id * * Allocate enough huge pages to cover @size and map them into * contiguous kernel virtual space. The resulting memory area @@ -3076,17 +3078,17 @@ EXPORT_SYMBOL(buff_vzalloc_user); * Compare to vmalloc_hugepage_user(), this is a customized function because * __GFP_ACCOUNT is used to limit memory usage. */ -void *buff_vzalloc_hugepage_user(unsigned long size) +void *vzalloc_hugepage_user_account(unsigned long size, int node) { /* PMD hugepage aligned */ size = PMD_ALIGN(size);
return __vmalloc_node_range(size, PMD_SIZE, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, - VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, + VM_HUGE_PAGES | VM_USERMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(buff_vzalloc_hugepage_user); +EXPORT_SYMBOL(vzalloc_hugepage_user_account);
int enable_ascend_share_pool;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We decided to remove sp_mutex entirely and use sp_group_sem instead.
Moreover, we use down_read(&spg->rw_lock) instead of down_write(), which increases the parallelism of sp_group_add_task(), especially when there are many normal pages in the target sp_group.
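Condensed, the new locking order in sp_group_add_task() is roughly the following sketch (error paths omitted; all names come from the patch):

	down_write(&sp_group_sem);	/* membership changes are serialized */
	spg = find_or_alloc_sp_group(spg_id);
	mm->sp_group = spg;
	up_write(&sp_group_sem);

	down_read(&spg->rw_lock);	/* readers may replay mappings in parallel */
	/* sp_mmap() + do_mm_populate() for every spa already in the group */
	up_read(&spg->rw_lock);

	down_write(&spg->rw_lock);	/* publish the new member */
	list_add_tail(&mm->sp_node, &spg->procs);
	up_write(&spg->rw_lock);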
Test: first, a process is added to sp_group 1 and calls sp_alloc() to get 3.5G of normal-page memory and 5.5G of hugepage memory. Then 30 processes are created and added to sp_group 1 concurrently. The results are as follows.
without this patch: 15.0s with this patch: 4.5s gain: 70%
Tested-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index bb7afb6c0b7c6..6bcd01efd6bae 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -71,9 +71,6 @@ int sysctl_sp_debug_mode;
int sysctl_share_pool_map_lock_enable;
-/* for inter-group operations */ -static DEFINE_MUTEX(sp_mutex); - /* idr of all sp_groups */ static DEFINE_IDR(sp_group_idr); /* rw semaphore for sp_group_idr */ @@ -412,13 +409,13 @@ int sp_group_id_by_pid(int pid) } EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
+/* the caller must hold sp_group_sem */ static struct sp_group *find_or_alloc_sp_group(int spg_id) { struct sp_group *spg; int ret; char name[20];
- down_write(&sp_group_sem); spg = __sp_find_spg_locked(current->pid, spg_id);
if (!spg) { @@ -432,7 +429,6 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) } ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id + 1, GFP_KERNEL); - up_write(&sp_group_sem); if (ret < 0) { pr_err_ratelimited("share pool: create group idr alloc failed\n"); goto out_kfree; @@ -472,7 +468,6 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) goto out_fput; } } else { - up_write(&sp_group_sem); down_read(&spg->rw_lock); if (!spg_valid(spg)) { up_read(&spg->rw_lock); @@ -488,9 +483,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id) out_fput: fput(spg->file); out_idr: - down_write(&sp_group_sem); idr_remove(&sp_group_idr, spg_id); - up_write(&sp_group_sem); out_kfree: kfree(spg); return ERR_PTR(ret); @@ -604,7 +597,7 @@ int sp_group_add_task(int pid, int spg_id) id_newly_generated = true; }
- mutex_lock(&sp_mutex); + down_write(&sp_group_sem);
rcu_read_lock();
@@ -618,6 +611,7 @@ int sp_group_add_task(int pid, int spg_id) if (ret) { if (id_newly_generated) free_sp_group_id((unsigned int)spg_id); + up_write(&sp_group_sem); goto out_unlock; }
@@ -636,9 +630,11 @@ int sp_group_add_task(int pid, int spg_id) mm = get_task_mm(tsk->group_leader); if (!mm) { ret = -ESRCH; + up_write(&sp_group_sem); goto out_put_task; } else if (mm->sp_group) { ret = -EEXIST; + up_write(&sp_group_sem); goto out_put_mm; }
@@ -647,6 +643,7 @@ int sp_group_add_task(int pid, int spg_id) ret = PTR_ERR(spg); if (id_newly_generated) free_sp_group_id((unsigned int)spg_id); + up_write(&sp_group_sem); goto out_put_mm; }
@@ -654,10 +651,14 @@ int sp_group_add_task(int pid, int spg_id) if (sysctl_ac_mode == AC_SINGLE_OWNER) { if (spg->owner != current->group_leader) { ret = -EPERM; + up_write(&sp_group_sem); goto out_drop_group; } }
+ mm->sp_group = spg; + up_write(&sp_group_sem); + /* per process statistics initialization */ stat = sp_init_proc_stat(tsk, mm); if (IS_ERR(stat)) { @@ -666,10 +667,7 @@ int sp_group_add_task(int pid, int spg_id) goto out_drop_group; }
- mm->sp_group = spg; - - down_write(&spg->rw_lock); - list_add_tail(&mm->sp_node, &spg->procs); + down_read(&spg->rw_lock); /* * create mappings of existing shared memory segments into this * new process' page table. @@ -735,22 +733,25 @@ int sp_group_add_task(int pid, int spg_id) } __sp_area_drop_locked(prev); spin_unlock(&sp_area_lock); + up_read(&spg->rw_lock);
- if (unlikely(ret)) { - /* spg->procs is modified, spg->rw_lock should be put below */ - list_del(&mm->sp_node); - mm->sp_group = NULL; - } - up_write(&spg->rw_lock); + sp_proc_stat_drop(stat); /* match with sp_init_proc_stat */
/* double drop when fail: ensure release stat */ if (unlikely(ret)) sp_proc_stat_drop(stat); - sp_proc_stat_drop(stat); /* match with sp_init_proc_stat */
out_drop_group: - if (unlikely(ret)) + if (unlikely(ret)) { + down_write(&sp_group_sem); + mm->sp_group = NULL; + up_write(&sp_group_sem); sp_group_drop(spg); + } else { + down_write(&spg->rw_lock); + list_add_tail(&mm->sp_node, &spg->procs); + up_write(&spg->rw_lock); + } out_put_mm: /* No need to put the mm if the sp group adds this mm successfully */ if (unlikely(ret)) @@ -758,7 +759,6 @@ int sp_group_add_task(int pid, int spg_id) out_put_task: put_task_struct(tsk); out_unlock: - mutex_unlock(&sp_mutex); return ret == 0 ? spg_id : ret; } EXPORT_SYMBOL_GPL(sp_group_add_task);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We found that two different errnos are returned when do_mm_populate() is called while the current thread is being killed, which may confuse readers of the log.
1. do_mm_populate -> populate_vma_page_range -> __get_user_pages: if fatal_signal_pending() is true, it returns -ERESTARTSYS (-512). 2. __get_user_pages -> follow_hugetlb_page: if fatal_signal_pending() is true, it returns -EFAULT (-14).
So, in share pool, we call fatal_signal_pending() after do_mm_populate() fails and log that the thread is being killed.
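Condensed from the sp_group_add_task() hunk below (comments added), the reporting pattern becomes:

	ret = do_mm_populate(mm, spa->va_start, populate, 0);
	if (ret) {
		/* the errno alone can't distinguish a kill from a genuine OOM */
		if (unlikely(fatal_signal_pending(current)))
			pr_warn_ratelimited("share pool: task add group failed, current thread is killed\n");
		else
			pr_warn_ratelimited("share pool: task add group failed, mm populate failed "
					    "(potential no enough memory when -12): %d, spa type is %d\n",
					    ret, spa->type);
	}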
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6bcd01efd6bae..d8ea53c3ab6ae 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -718,8 +718,11 @@ int sp_group_add_task(int pid, int spg_id) if (populate) { ret = do_mm_populate(mm, spa->va_start, populate, 0); if (ret) { - pr_warn_ratelimited("share pool: task add group failed, mm populate failed " - "(potential no enough memory when -12): %d, spa type is %d\n", + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("share pool: task add group failed, current thread is killed\n"); + else + pr_warn_ratelimited("share pool: task add group failed, mm populate failed " + "(potential no enough memory when -12): %d, spa type is %d\n", ret, spa->type); down_write(&mm->mmap_sem); sp_munmap_task_areas(mm, spa->link.next); @@ -1427,10 +1430,12 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) sp_add_work_compact(); } if (ret) { - __sp_free(spg, sp_addr, size_aligned, - list_next_entry(mm, sp_node)); - pr_warn_ratelimited("share pool: allocation failed due to mm populate failed" - "(potential no enough memory when -12): %d\n", ret); + __sp_free(spg, sp_addr, size_aligned, list_next_entry(mm, sp_node)); + if (unlikely(fatal_signal_pending(current))) + pr_warn_ratelimited("share pool: allocation failed, current thread is killed\n"); + else + pr_warn_ratelimited("share pool: allocation failed due to mm populate failed" + "(potential no enough memory when -12): %d\n", ret); p = ERR_PTR(ret);
mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
In the AI training scenario, the sp area range is specified by the host user, which results in an address offset calculation error; fix it to avoid the offset overflow.
If the user needs to specify the sp area range, the user should reserve the virtual memory space themselves; otherwise the sp area would be used by non-share-pool processes.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index d8ea53c3ab6ae..42028e1498f5d 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -214,7 +214,7 @@ struct sp_area { }; static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; -static bool host_svm_sp_enable = false; +static bool sp_area_customized;
static unsigned long spa_size(struct sp_area *spa) { @@ -409,6 +409,18 @@ int sp_group_id_by_pid(int pid) } EXPORT_SYMBOL_GPL(sp_group_id_by_pid);
+static loff_t addr_to_offset(unsigned long addr, struct sp_area *spa) +{ + if (sp_area_customized == false) + return (loff_t)(addr - MMAP_SHARE_POOL_START); + + if (spa && spa->spg) + return (loff_t)(addr - spa->spg->dvpp_va_start); + + pr_err("share pool: the addr is not belong to share pool range\n"); + return addr; +} + /* the caller must hold sp_group_sem */ static struct sp_group *find_or_alloc_sp_group(int spg_id) { @@ -812,7 +824,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, unsigned long size_align = PMD_ALIGN(size); /* va aligned to 2M */
if ((flags & SP_DVPP)) { - if (host_svm_sp_enable == false) { + if (sp_area_customized == false) { vstart = MMAP_SHARE_POOL_16G_START; vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; } else { @@ -1204,7 +1216,7 @@ int sp_free(unsigned long addr)
/* Free the memory of the backing shmem or hugetlbfs */ mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; - offset = addr - MMAP_SHARE_POOL_START; + offset = addr_to_offset(addr, spa); ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); if (ret) pr_err("share pool: sp free fallocate failed: %d\n", ret); @@ -1241,7 +1253,7 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE | MAP_SHARE_POOL; unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; - unsigned long pgoff = (addr - MMAP_SHARE_POOL_START) >> PAGE_SHIFT; + unsigned long pgoff = addr_to_offset(addr, spa) >> PAGE_SHIFT;
/* Mark the mapped region to be locked. After the MAP_LOCKED is enable, * multiple tasks will preempt resources, causing performance loss. @@ -1439,7 +1451,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) p = ERR_PTR(ret);
mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; - offset = sp_addr - MMAP_SHARE_POOL_START; + offset = addr_to_offset(sp_addr, spa);
ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); if (ret) @@ -2457,7 +2469,7 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) spg->dvpp_multi_spaces = true; up_write(&spg->rw_lock);
- host_svm_sp_enable = true; + sp_area_customized = true;
sp_group_drop(spg); return true; @@ -2467,10 +2479,20 @@ EXPORT_SYMBOL_GPL(sp_config_dvpp_range); /* Check whether the address belongs to the share pool. */ bool is_sharepool_addr(unsigned long addr) { - if (host_svm_sp_enable == false) - return addr >= MMAP_SHARE_POOL_START && addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + struct sp_area *spa; + bool ret = false; + + if (sp_area_customized == false) + return addr >= MMAP_SHARE_POOL_START && + addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + + spa = __find_sp_area(addr); + if (spa && spa->spg) + ret = addr >= spa->spg->dvpp_va_start && + addr < (spa->spg->dvpp_va_start + spa->spg->dvpp_size);
- return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; + __sp_area_drop(spa); + return ret; } EXPORT_SYMBOL_GPL(is_sharepool_addr);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
A task that has not been added to any sp group (e.g. it only calls k2u to task) should still be shown correctly in /proc/sharepool/proc_stat.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 42028e1498f5d..03b8b99dd3e4c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2766,15 +2766,17 @@ static int idr_proc_stat_cb(int id, void *p, void *data) */ long sp_alloc_nsize, non_sp_res, sp_res, non_sp_shm;
+ anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + total_rss = anon + file + shmem; + /* - * a task which is the target of k2u(to task) but without adding to a - * sp group should be handled correctly. - * No longer mmget_not_zero(mm) but a process (k2u to task) may have - * problem + * a task without adding to an sp group should be handled correctly. */ spg = __sp_find_spg(id, SPG_ID_DEFAULT); if (!spg) - goto out; + goto non_spg;
down_read(&spg->rw_lock); if (!spg_valid(spg)) { @@ -2789,10 +2791,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data) up_read(&spg->rw_lock); sp_group_drop(spg);
- anon = get_mm_counter(mm, MM_ANONPAGES); - file = get_mm_counter(mm, MM_FILEPAGES); - shmem = get_mm_counter(mm, MM_SHMEMPAGES); - total_rss = anon + file + shmem; /* * Statistics of RSS has a maximum 64 pages deviation (256KB). * Please check_sync_rss_stat(). @@ -2813,8 +2811,15 @@ static int idr_proc_stat_cb(int id, void *p, void *data) sp_res, non_sp_res, page2kb(mm->total_vm), page2kb(total_rss), page2kb(shmem), non_sp_shm); + return 0;
-out: +non_spg: + seq_printf(seq, "%-8d %-8c %-9d %-9ld %-9d %-10ld %-8ld %-7ld %-7ld %-10ld\n", + id, '-', 0, + byte2kb(atomic64_read(&stat->k2u_size)), + 0, page2kb(total_rss), + page2kb(mm->total_vm), page2kb(total_rss), + page2kb(shmem), page2kb(shmem)); return 0; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Currently, a process that only calls sp_k2u (to task) but is not in any sp group leaks its struct sp_proc_stat after the process exits.
We should decouple the release of struct sp_group from the release of struct sp_proc_stat.
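The lookup that hands out a reference then follows the usual kernel refcount pattern; a minimal sketch mirroring sp_get_proc_stat_ref_locked() from the hunk below (the helper name here is made up, and it assumes it lives in mm/share_pool.c next to sp_stat_idr):

/* Caller holds sp_stat_sem. Only hand out a reference if the stat is
 * still live; atomic_inc_not_zero() refuses to resurrect an object whose
 * count has already dropped to zero in sp_proc_stat_drop(). */
static struct sp_proc_stat *proc_stat_lookup_get(int tgid)
{
	struct sp_proc_stat *stat = idr_find(&sp_stat_idr, tgid);

	if (!stat || !atomic_inc_not_zero(&stat->use_count))
		return NULL;
	return stat;
}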
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 +- mm/oom_kill.c | 2 +- mm/share_pool.c | 110 +++++++++++++++++++++---------------- 3 files changed, 66 insertions(+), 50 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index b3041654084d6..98629ad0c0c8a 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -171,7 +171,7 @@ extern int sp_register_notifier(struct notifier_block *nb); extern int sp_unregister_notifier(struct notifier_block *nb); extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); extern bool is_sharepool_addr(unsigned long addr); -extern struct sp_proc_stat *sp_get_proc_stat(int tgid); +extern struct sp_proc_stat *sp_get_proc_stat_ref(int tgid); extern void sp_proc_stat_drop(struct sp_proc_stat *stat); extern void spa_overview_show(struct seq_file *seq); extern void spg_overview_show(struct seq_file *seq); @@ -371,7 +371,7 @@ static inline bool is_sharepool_addr(unsigned long addr) return false; }
-static inline struct sp_proc_stat *sp_get_proc_stat(int tgid) +static inline struct sp_proc_stat *sp_get_proc_stat_ref(int tgid) { return NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0fc1c15062825..b67676c0d9a10 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -481,7 +481,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) }
if (ascend_sp_oom_show()) { - stat = sp_get_proc_stat(task->tgid); + stat = sp_get_proc_stat_ref(task->tgid);
pr_cont("[%7d] %5d %5d %8lu %8lu ", task->pid, from_kuid(&init_user_ns, task_uid(task)), diff --git a/mm/share_pool.c b/mm/share_pool.c index 03b8b99dd3e4c..1c37ff84b0b16 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -96,8 +96,19 @@ static struct sp_proc_stat *sp_get_proc_stat_locked(int tgid) struct sp_proc_stat *stat;
stat = idr_find(&sp_stat_idr, tgid); - if (stat) - atomic_inc(&stat->use_count); + + /* maybe NULL or not, we always return it */ + return stat; +} + +/* The caller must hold sp_stat_sem */ +static struct sp_proc_stat *sp_get_proc_stat_ref_locked(int tgid) +{ + struct sp_proc_stat *stat; + + stat = idr_find(&sp_stat_idr, tgid); + if (!stat || !atomic_inc_not_zero(&stat->use_count)) + stat = NULL;
/* maybe NULL or not, we always return it */ return stat; @@ -106,8 +117,6 @@ static struct sp_proc_stat *sp_get_proc_stat_locked(int tgid) /* * The caller must ensure no concurrency problem * for task_struct and mm_struct. - * - * The user must call sp_proc_stat_drop() after use. */ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, struct mm_struct *mm) @@ -138,8 +147,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return ERR_PTR(-ENOMEM); }
- /* use_count = 2: match with sp_proc_stat_drop */ - atomic_set(&stat->use_count, 2); + atomic_set(&stat->use_count, 1); atomic64_set(&stat->alloc_size, 0); atomic64_set(&stat->k2u_size, 0); stat->tgid = tgid; @@ -159,6 +167,27 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, return stat; }
+static struct sp_proc_stat *sp_get_proc_stat(int tgid) +{ + struct sp_proc_stat *stat; + + down_read(&sp_stat_sem); + stat = sp_get_proc_stat_locked(tgid); + up_read(&sp_stat_sem); + return stat; +} + +/* user must call sp_proc_stat_drop() after use */ +struct sp_proc_stat *sp_get_proc_stat_ref(int tgid) +{ + struct sp_proc_stat *stat; + + down_read(&sp_stat_sem); + stat = sp_get_proc_stat_ref_locked(tgid); + up_read(&sp_stat_sem); + return stat; +} + /* statistics of all sp area, protected by sp_area_lock */ struct sp_spa_stat { unsigned int total_num; @@ -750,9 +779,6 @@ int sp_group_add_task(int pid, int spg_id) spin_unlock(&sp_area_lock); up_read(&spg->rw_lock);
- sp_proc_stat_drop(stat); /* match with sp_init_proc_stat */ - - /* double drop when fail: ensure release stat */ if (unlikely(ret)) sp_proc_stat_drop(stat);
@@ -1228,11 +1254,10 @@ int sp_free(unsigned long addr) atomic64_sub(spa->real_size, &kthread_stat.alloc_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) { + if (stat) atomic64_sub(spa->real_size, &stat->alloc_size); - sp_proc_stat_drop(stat); - } else - BUG(); + else + WARN(1, "share pool: %s: null process stat\n", __func__); }
drop_spa: @@ -1478,10 +1503,10 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id)
if (!IS_ERR(p)) { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) { + if (stat) atomic64_add(size_aligned, &stat->alloc_size); - sp_proc_stat_drop(stat); - } + else + WARN(1, "share pool: %s: null process stat\n", __func__); }
/* this will free spa if mmap failed */ @@ -1776,14 +1801,14 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT) { pr_err_ratelimited("share pool: k2task invalid spg id %d\n", spg_id); uva = ERR_PTR(-EINVAL); - goto out_drop_proc_stat; + goto out_put_mm; } spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK, tsk->tgid); if (IS_ERR(spa)) { pr_err_ratelimited("share pool: k2u(task) failed due to alloc spa failure " "(potential no enough virtual memory when -75): %ld\n", PTR_ERR(spa)); uva = spa; - goto out_drop_proc_stat; + goto out_put_mm; }
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { @@ -1853,8 +1878,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, out_drop_spg: if (spg) sp_group_drop(spg); -out_drop_proc_stat: - sp_proc_stat_drop(stat); out_put_mm: mmput(mm); out_put_task: @@ -2273,10 +2296,9 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp atomic64_sub(spa->real_size, &kthread_stat.k2u_size); } else { stat = sp_get_proc_stat(current->mm->sp_stat_id); - if (stat) { + if (stat) atomic64_sub(spa->real_size, &stat->k2u_size); - sp_proc_stat_drop(stat); - } else + else WARN(1, "share pool: %s: null process stat\n", __func__); }
@@ -2512,21 +2534,10 @@ __setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group);
/*** Statistical and maintenance functions ***/
-/* user must call sp_proc_stat_drop() after use */ -struct sp_proc_stat *sp_get_proc_stat(int tgid) -{ - struct sp_proc_stat *stat; - - down_read(&sp_stat_sem); - stat = sp_get_proc_stat_locked(tgid); - up_read(&sp_stat_sem); - return stat; -} - static void free_sp_proc_stat(struct sp_proc_stat *stat) { - stat->mm->sp_stat_id = 0; down_write(&sp_stat_sem); + stat->mm->sp_stat_id = 0; idr_remove(&sp_stat_idr, stat->tgid); up_write(&sp_stat_sem); kfree(stat); @@ -2557,7 +2568,7 @@ int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, up_read(&spg->rw_lock);
/* eliminate potential ABBA deadlock */ - stat = sp_get_proc_stat(task->mm->sp_stat_id); + stat = sp_get_proc_stat_ref(task->mm->sp_stat_id); if (unlikely(!stat)) { sp_group_drop(spg); return 0; @@ -2986,10 +2997,16 @@ void sp_group_post_exit(struct mm_struct *mm) struct sp_group *spg = mm->sp_group; long alloc_size, k2u_size;
- if (!spg || !enable_ascend_share_pool) + if (!enable_ascend_share_pool || !mm->sp_stat_id) return;
stat = sp_get_proc_stat(mm->sp_stat_id); + if (stat) { + alloc_size = atomic64_read(&stat->alloc_size); + k2u_size = atomic64_read(&stat->k2u_size); + } else + WARN(1, "share pool: can't find sp proc stat\n"); + /* * There are two basic scenarios when a process in the share pool is * exiting but its share pool memory usage is not 0. @@ -3000,25 +3017,24 @@ void sp_group_post_exit(struct mm_struct *mm) * called sp_free(u). Now A's share pool memory usage is a negative * number. Notice B's memory usage will be a positive number. * - * We decide to print a info when seeing both of the scenarios. + * We decide to print an info when seeing both of the scenarios. + * + * A process not in an sp group doesn't need to print because there + * wont't be any memory which is not freed. */ - if (stat) { - alloc_size = atomic64_read(&stat->alloc_size); - k2u_size = atomic64_read(&stat->k2u_size); + if (spg) { if (alloc_size != 0 || k2u_size != 0) pr_info("share pool: process %s(%d) of sp group %d exits. " "It applied %ld aligned KB, k2u shared %ld aligned KB\n", stat->comm, mm->sp_stat_id, mm->sp_group->id, byte2kb(alloc_size), byte2kb(k2u_size));
- /* match with sp_get_proc_stat in THIS function */ - sp_proc_stat_drop(stat); - /* match with sp_init_proc_stat, we expect stat is released after this call */ - sp_proc_stat_drop(stat); + /* match with sp_group_add_task -> find_or_alloc_sp_group */ + sp_group_drop(spg); }
- /* match with sp_group_add_task -> find_or_alloc_sp_group */ - sp_group_drop(spg); + /* match with sp_init_proc_stat, we expect stat is released after this call */ + sp_proc_stat_drop(stat); }
struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask,
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
1. Add parameter checking in sp_alloc. 2. Add variable initialization in sp_group_exit.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 1c37ff84b0b16..d0189985b2d83 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1329,6 +1329,11 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) if (enable_mdc_default_group) spg_id = mdc_default_group_id;
+ if (unlikely(!size)) { + pr_err_ratelimited("share pool: allocation failed, invalid size %lu\n", size); + return ERR_PTR(-EINVAL); + } + if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { pr_err_ratelimited("share pool: allocation failed, invalid group id %d\n", spg_id); return ERR_PTR(-EINVAL); @@ -2962,7 +2967,7 @@ EXPORT_SYMBOL(sharepool_no_page); void sp_group_exit(struct mm_struct *mm) { struct sp_group *spg = mm->sp_group; - bool is_alive; + bool is_alive = true;
if (!spg || !enable_ascend_share_pool) return;
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
---------------------------------------------
During u2k, the target page may be under migration. Wait until the migration is complete.
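Condensed from the new sp_pte_entry() below (comments added), the retry loop looks like this:

retry:
	pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
	if (unlikely(!pte_present(*pte))) {
		swp_entry_t entry;

		if (pte_none(*pte))
			goto no_page;			/* genuinely unmapped */
		entry = pte_to_swp_entry(*pte);
		if (!is_migration_entry(entry))
			goto no_page;			/* not a migration entry */
		pte_unmap_unlock(pte, ptl);
		migration_entry_wait(walk->mm, pmd, addr);	/* sleep until done */
		goto retry;
	}
	/* present: safe to take a reference on the page as before */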
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 1 + mm/share_pool.c | 41 +++++++++++++++++++++++++++++++++----- 2 files changed, 37 insertions(+), 5 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 98629ad0c0c8a..9d81fc7a94c7c 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -105,6 +105,7 @@ struct sp_walk_data { unsigned long uva_aligned; unsigned long page_size; bool is_hugepage; + pmd_t *pmd; };
/* per process memory usage statistics indexed by tgid */ diff --git a/mm/share_pool.c b/mm/share_pool.c index d0189985b2d83..dd90f5d1742dd 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -46,6 +46,7 @@ #include <linux/hugetlb.h> #include <linux/compaction.h> #include <linux/preempt.h> +#include <linux/swapops.h>
/* access control mode macros */ #define AC_NONE 0 @@ -1893,21 +1894,50 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } EXPORT_SYMBOL_GPL(sp_make_share_k2u);
+static int sp_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct sp_walk_data *sp_walk_data = walk->private; + + sp_walk_data->pmd = pmd; + return 0; +} + static int sp_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - struct page *page = pte_page(*pte); - struct sp_walk_data *sp_walk_data; + struct page *page; + struct sp_walk_data *sp_walk_data = walk->private; + pmd_t *pmd = sp_walk_data->pmd; + spinlock_t *ptl; + +retry: + pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
if (unlikely(!pte_present(*pte))) { - pr_debug("share pool: the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); - return -EFAULT; + swp_entry_t entry; + + if (pte_none(*pte)) + goto no_page; + entry = pte_to_swp_entry(*pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(pte, ptl); + migration_entry_wait(walk->mm, pmd, addr); + goto retry; }
- sp_walk_data = walk->private; + page = pte_page(*pte); get_page(page); + pte_unmap_unlock(pte, ptl); sp_walk_data->pages[sp_walk_data->page_count++] = page; return 0; + +no_page: + pte_unmap_unlock(pte, ptl); + pr_debug("share pool: the page of addr %lx unexpectedly not in RAM\n", + (unsigned long)addr); + return -EFAULT; }
static int sp_test_walk(unsigned long addr, unsigned long next, @@ -2010,6 +2040,7 @@ static int __sp_walk_page_range(unsigned long uva, unsigned long size, } else { sp_walk_data->is_hugepage = false; sp_walk.pte_entry = sp_pte_entry; + sp_walk.pmd_entry = sp_pmd_entry; }
sp_walk_data->page_size = page_size;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Fix warnings: missing braces around initializer
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index dd90f5d1742dd..f186efadd3e6c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -89,7 +89,7 @@ static DEFINE_IDR(sp_stat_idr); static DECLARE_RWSEM(sp_stat_sem);
/* for kthread buff_module_guard_work */ -static struct sp_proc_stat kthread_stat = {0}; +static struct sp_proc_stat kthread_stat;
/* The caller must hold sp_stat_sem */ static struct sp_proc_stat *sp_get_proc_stat_locked(int tgid) @@ -203,7 +203,7 @@ struct sp_spa_stat { unsigned long dvpp_va_size; };
-static struct sp_spa_stat spa_stat = {0}; +static struct sp_spa_stat spa_stat;
/* statistics of all sp group born from sp_alloc and k2u(spg) */ struct sp_spg_stat { @@ -211,7 +211,7 @@ struct sp_spg_stat { atomic64_t spa_total_size; };
-static struct sp_spg_stat spg_stat = {0}; +static struct sp_spg_stat spg_stat;
/*** Global share pool VA allocator ***/
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Fix warnings: symbol 'XX' was not declared. Should it be static?
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f186efadd3e6c..b5ae71d0f551b 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -59,11 +59,11 @@ #define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10))
/* mdc scene hack */ -int enable_mdc_default_group; +static int __read_mostly enable_mdc_default_group; static const int mdc_default_group_id = 1;
/* share the uva to the whole group */ -int enable_share_k2u_spg; +static int __read_mostly enable_share_k2u_spg;
/* access control mode */ int sysctl_ac_mode = AC_NONE;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: doc bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Update function documentation; this also fixes warnings: Function parameter or member 'PARAM_NAME' not described in 'FUNC_NAME'
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 136 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 96 insertions(+), 40 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index b5ae71d0f551b..4c9105722c35e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -418,6 +418,14 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) return spg; }
+/** + * sp_group_id_by_pid() - Get the sp_group ID of a process. + * @pid: pid of target process. + * + * Return: + * >0 - the sp_group ID. + * -ENODEV - target process doesn't belong to any sp_group. + */ int sp_group_id_by_pid(int pid) { struct sp_group *spg; @@ -566,14 +574,16 @@ static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) }
/** - * sp_group_add_task - add a process to an sp_group - * @pid: the pid of the task to be added - * @spg_id: the ID of the sp_group + * sp_group_add_task() - Add a process to an share group (sp_group). + * @pid: the pid of the task to be added. + * @spg_id: the ID of the sp_group. * * A thread group can't be added to more than one sp_group. * - * Return: The manually allocated ID is between [SPG_ID_MIN, SPG_ID_MAX] - * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX] + * Return: A postive group number for success, -errno on failure. + * + * The manually specified ID is between [SPG_ID_MIN, SPG_ID_MAX]. + * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX]. * When negative, the return value is -errno. */ int sp_group_add_task(int pid, int spg_id) @@ -833,11 +843,15 @@ static struct rb_node *free_sp_area_cache; static unsigned long cached_hole_size; static unsigned long cached_vstart; /* affected by SP_DVPP and sp_config_dvpp_range() */
-/* - * Allocate a region of VA from the share pool. - * @size - the size of VA to allocate +/** + * sp_alloc_area() - Allocate a region of VA from the share pool. + * @size: the size of VA to allocate. + * @flags: how to allocate the memory. + * @spg: the share group that the memory is allocated to. + * @type: the type of the region. + * @applier: the pid of the task which allocates the region. * - * Return NULL if fail. + * Return: a valid pointer for success, NULL on failure. */ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, struct sp_group *spg, enum spa_type type, @@ -1192,11 +1206,14 @@ static void __sp_free(struct sp_group *spg, unsigned long addr, } }
-/* - * Free the memory allocated by sp_alloc() - * @addr - the starting VA of the memory +/** + * sp_free() - Free the memory allocated by sp_alloc(). + * @addr: the starting VA of the memory. * - * Return fail if the memory can't be found or was not allocted by share pool. + * Return: + * * 0 - success. + * * -EINVAL - the memory can't be found or was not allocted by share pool. + * * -EPERM - the caller has no permision to free the memory. */ int sp_free(unsigned long addr) { @@ -1301,12 +1318,16 @@ static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, }
/** - * Allocate shared memory for all the processes in the same sp_group - * size - the size of memory to allocate - * sp_flags - how to allocate the memory - * spg_id - the share group that the memory is allocated to. + * sp_alloc() - Allocate shared memory for all the processes in a sp_group. + * @size: the size of memory to allocate. + * @sp_flags: how to allocate the memory. + * @spg_id: the share group that the memory is allocated to. * * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + * + * Return: + * * if succeed, return the starting kernel address of the shared memory. + * * if fail, return the pointer of -errno. */ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) { @@ -1527,9 +1548,14 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) } EXPORT_SYMBOL_GPL(sp_alloc);
-/* - * return value: >0 means this is a hugepage addr - * =0 means a normal addr. <0 means an errno. +/** + * is_vmap_hugepage() - Check if a kernel address belongs to vmalloc family. + * @addr: the kernel space address to be checked. + * + * Return: + * * >0 - a vmalloc hugepage addr. + * * =0 - a normal vmalloc addr. + * * -errno - failure. */ static int is_vmap_hugepage(unsigned long addr) { @@ -1648,12 +1674,14 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, }
/** - * Share kernel memory to a specified task + * sp_make_share_kva_to_task() - Share kernel memory to a specified task. * @kva: the VA of shared kernel memory * @spa: the sp area associated with the shared user address * @mm: mm_struct of target task * - * Return: the shared user address to start at + * Return: + * * on success, return the shared user address to start at. + * * on failure, return an errno-encoded pointer. */ static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, struct mm_struct *mm) @@ -1725,17 +1753,19 @@ static bool vmalloc_area_clr_flag(struct sp_area *spa, unsigned long kva, unsign }
/** - * Share kernel memory to a specified process or sp_group - * @kva: the VA of shared kernel memory - * @size: the size of shared kernel memory + * sp_make_share_k2u() - Share kernel memory to a specified process or sp_group. + * @kva: the VA of shared kernel memory. + * @size: the size of shared kernel memory. * @sp_flags: how to allocate the memory. We only support SP_DVPP. * @pid: the pid of the specified process - * @spg_id: currently, only support default value(SPG_ID_DEFAULT) and other values - * are useless. - * - * Return: the shared target user address to start at + * @spg_id: the share group that the memory is shared to. * * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + * Share kernel memory to a specified task if spg_id == SPG_ID_NONE. + * + * Return: + * * on success, return the shared user address to start at. + * * on failure, return an errno-encoded pointer. */ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long sp_flags, int pid, int spg_id) @@ -1980,6 +2010,12 @@ static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, }
/** + * __sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @mm: mm struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * * the caller must hold mm->mmap_sem * * Notes for parameter alignment: @@ -2093,13 +2129,14 @@ static void __sp_walk_page_free(struct sp_walk_data *data) }
/** - * Share user memory of a specified process to kernel + * sp_make_share_u2k() - Share user memory of a specified process to kernel. * @uva: the VA of shared user memory * @size: the size of shared user memory * @pid: the pid of the specified process * - * Return: if success, return the starting kernel address of the shared memory. - * if failed, return the pointer of -errno. + * Return: + * * on success, return the starting kernel address of the shared memory. + * * on failure, return an errno-encoded pointer. */ void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) { @@ -2396,15 +2433,16 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) }
/** - * Unshare the kernel or user memory which shared by calling sp_make_share_{k2u,u2k}(). + * sp_unshare() - Unshare the kernel or user memory which shared by calling + * sp_make_share_{k2u,u2k}(). * @va: the specified virtual address of memory * @size: the size of unshared memory * @pid: the pid of the specified process if the VA is user address * @spg_id: the ID of the specified sp_group if the VA is user address * - * Return -errno if fail. - * * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + * + * Return: 0 for success, -errno on failure. */ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) { @@ -2429,8 +2467,16 @@ int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) EXPORT_SYMBOL_GPL(sp_unshare);
/** - * Return 0 when success. - * When return value < 0, information in sp_walk_data is useless + * sp_walk_page_range() - Walk page table with caller specific callbacks. + * @uva: the start VA of user memory. + * @size: the size of user memory. + * @tsk: task struct of the target task. + * @sp_walk_data: a structure of a page pointer array. + * + * Return: 0 for success, -errno on failure. + * + * When return 0, sp_walk_data describing [uva, uva+size) can be used. + * When return -errno, information in sp_walk_data is useless. */ int sp_walk_page_range(unsigned long uva, unsigned long size, struct task_struct *tsk, struct sp_walk_data *sp_walk_data) @@ -2471,6 +2517,10 @@ int sp_walk_page_range(unsigned long uva, unsigned long size, } EXPORT_SYMBOL_GPL(sp_walk_page_range);
+/** + * sp_walk_page_free() - Free the sp_walk_data structure. + * @sp_walk_data: a structure of a page pointer array to be freed. + */ void sp_walk_page_free(struct sp_walk_data *sp_walk_data) { check_interrupt_context(); @@ -2495,13 +2545,14 @@ int sp_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(sp_unregister_notifier);
/** - * user can config the share pool start addrese of each Da-vinci device + * sp_config_dvpp_range() - Configure the share pool start address + * of each Da-vinci device. * @start: the value of share pool start * @size: the value of share pool * @device_id: the num of Da-vinci device * @pid: the pid of device process * - * Return false if parameter invalid of has been set up. + * Return true for success, false if a parameter is invalid or the range has already been set up. */ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) { @@ -2534,7 +2585,12 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) } EXPORT_SYMBOL_GPL(sp_config_dvpp_range);
-/* Check whether the address belongs to the share pool. */ +/** + * is_sharepool_addr() - Check if a user memory address belongs to share pool. + * @addr: the userspace address to be checked. + * + * Return true if addr belongs to the share pool, false otherwise. + */ bool is_sharepool_addr(unsigned long addr) { struct sp_area *spa;
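For illustration, a minimal in-kernel usage sketch of the interfaces documented above. The group id and allocation size are made-up values and error handling is abbreviated; this is not part of the patch.

#include <linux/share_pool.h>
#include <linux/sched.h>
#include <linux/err.h>

/* Hypothetical example: join share group 100, allocate 2 MB of shared
 * memory for the group, then free it again. */
static int sp_usage_example(void)
{
	void *va;
	int spg_id;

	/* manually specified id, must lie in [SPG_ID_MIN, SPG_ID_MAX] */
	spg_id = sp_group_add_task(current->tgid, 100);
	if (spg_id < 0)
		return spg_id;

	va = sp_alloc(2 * 1024 * 1024, SP_HUGEPAGE, spg_id);
	if (IS_ERR(va))
		return PTR_ERR(va);

	return sp_free((unsigned long)va);
}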
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------------
In the share pool scenario, when shared memory is requested, do_mm_populate() is performed at the same time, that is, the corresponding pages are allocated immediately. In the current share pool implementation, this memory is charged to the memcg of the first task added to the share pool group.
This is unreasonable and may drive the memcg of the first task into OOM. Instead, charge the pages to the memcg of the current task, as sketched below.
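A condensed restatement of the change (the actual one-line fix is in the hunk below); the helper name here is made up.

/* Pick which mm gets charged for a newly allocated shmem page. */
static struct mm_struct *shmem_charge_mm(struct vm_area_struct *vma)
{
	/* ordinary shmem fault: charge the owner of the mapping */
	if (vma && !sp_check_vm_share_pool(vma->vm_flags))
		return vma->vm_mm;

	/* share pool VMA (or no vma): charge the current task's memcg */
	return current->mm;
}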
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/shmem.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 54c5cc0610e6d..90e75a0fa5bc6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -36,6 +36,7 @@ #include <linux/uio.h> #include <linux/khugepaged.h> #include <linux/hugetlb.h> +#include <linux/share_pool.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -1769,7 +1770,8 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, * bring it back from swap or allocate. */ sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; + charge_mm = vma && !sp_check_vm_share_pool(vma->vm_flags) ? + vma->vm_mm : current->mm;
if (swap.val) { /* Look it up and read it in.. */
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We did a compaction experiment on Hi1951: first create external fragmentation, then echo 1 > /proc/sys/vm/compact_memory.
Fragmentation size | 0G   | 2G   | 4G  | 8G
Compaction time(s) | 0.07 | 0.85 | 1.7 | 3.4
Obviously, sysctl_compaction_handler has a big performance impact.
We optimize the share pool compact procedure as follows (a sketch of the gating logic follows this list): 1. At most one compact daemon is allowed. 2. A new compact daemon may only be created when the previous one finished at least *sysctl_sp_compact_interval* seconds ago.
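A condensed sketch of the gating described above, mirroring the cmpxchg/time_after code in the diff below; the function name is illustrative.

static void sp_try_start_compact(void)
{
	if (!sysctl_sp_compact_enable)
		return;

	/* rule 2: the previous run must have finished at least
	 * sysctl_sp_compact_interval seconds ago */
	if (!time_after(jiffies,
			compact_last_jiffies + sysctl_sp_compact_interval * HZ))
		return;

	/* rule 1: at most one daemon; only the winner of the cmpxchg
	 * is allowed to queue the compaction work */
	if (cmpxchg(&compact_daemon_status, COMPACT_STOP, COMPACT_START) ==
	    COMPACT_START)
		return;

	/* allocate a work_struct, INIT_WORK(), schedule_work(); the worker
	 * records compact_last_jiffies and resets the status when done */
}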
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 4 ++++ kernel/sysctl.c | 18 ++++++++++++++++++ mm/share_pool.c | 24 +++++++++++++++++++++--- 3 files changed, 43 insertions(+), 3 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 9d81fc7a94c7c..164efeb81889d 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -40,6 +40,10 @@ extern int enable_ascend_share_pool;
extern int sysctl_share_pool_map_lock_enable;
+extern int sysctl_sp_compact_enable; +extern unsigned long sysctl_sp_compact_interval; +extern unsigned long sysctl_sp_compact_interval_max; + #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC extern bool vmap_allow_huge; #endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index cd2d114f3391c..c5d4395efd430 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1789,6 +1789,24 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "sharepool_compact_enable", + .data = &sysctl_sp_compact_enable, + .maxlen = sizeof(sysctl_sp_compact_enable), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "sharepool_compact_interval", + .data = &sysctl_sp_compact_interval, + .maxlen = sizeof(sysctl_sp_compact_interval), + .mode = 0600, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &sysctl_sp_compact_interval_max, + }, #endif { } }; diff --git a/mm/share_pool.c b/mm/share_pool.c index 4c9105722c35e..ad842151f1c71 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1127,26 +1127,44 @@ void sp_area_drop(struct vm_area_struct *vma) spin_unlock(&sp_area_lock); }
-static unsigned long last_jiffies; +int sysctl_sp_compact_enable; +unsigned long sysctl_sp_compact_interval = 30UL; +unsigned long sysctl_sp_compact_interval_max = 1000UL; +static unsigned long compact_last_jiffies; +static unsigned long compact_daemon_status; +#define COMPACT_START 1 +#define COMPACT_STOP 0 + static void sp_compact_nodes(struct work_struct *work) { sysctl_compaction_handler(NULL, 1, NULL, NULL, NULL);
kfree(work); + + compact_last_jiffies = jiffies; + cmpxchg(&compact_daemon_status, COMPACT_START, COMPACT_STOP); }
static void sp_add_work_compact(void) { struct work_struct *compact_work;
- if (!time_after(jiffies, last_jiffies + 10 * HZ)) + if (!sysctl_sp_compact_enable) + return; + + /* experimental compaction time: 4GB->1.7s, 8GB->3.4s */ + if (!time_after(jiffies, + compact_last_jiffies + sysctl_sp_compact_interval * HZ)) + return; + + if (cmpxchg(&compact_daemon_status, COMPACT_STOP, COMPACT_START) == + COMPACT_START) return;
compact_work = kzalloc(sizeof(*compact_work), GFP_KERNEL); if (!compact_work) return;
- last_jiffies = jiffies; INIT_WORK(compact_work, sp_compact_nodes); schedule_work(compact_work); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Address ranges [MMAP_SHARE_POOL_START, MMAP_SHARE_POOL_16G_START) and [MMAP_SHARE_POOL_16G_START, MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE) are always valid. Since the two ranges are adjacent, they can be checked with a single comparison against [MMAP_SHARE_POOL_START, MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE).
Signed-off-by: Peng Wu wupeng58@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index ad842151f1c71..995db20a1d3b9 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2603,6 +2603,12 @@ bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) } EXPORT_SYMBOL_GPL(sp_config_dvpp_range);
+static bool is_sp_normal_addr(unsigned long addr) +{ + return addr >= MMAP_SHARE_POOL_START && + addr < MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; +} + /** * is_sharepool_addr() - Check if a user memory address belongs to share pool. * @addr: the userspace address to be checked. @@ -2615,13 +2621,13 @@ bool is_sharepool_addr(unsigned long addr) bool ret = false;
if (sp_area_customized == false) - return addr >= MMAP_SHARE_POOL_START && - addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + return is_sp_normal_addr(addr);
spa = __find_sp_area(addr); if (spa && spa->spg) - ret = addr >= spa->spg->dvpp_va_start && - addr < (spa->spg->dvpp_va_start + spa->spg->dvpp_size); + ret = (addr >= spa->spg->dvpp_va_start && + addr < spa->spg->dvpp_va_start + spa->spg->dvpp_size) || + is_sp_normal_addr(addr);
__sp_area_drop(spa); return ret;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: perf bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We encounter a problem as follows:
[ 3057. 75094] share pool: task add group failed, current thread is killed [ 3057. 75152] [ascend] [drv_buff] [buff_mv_pid_node_to_recycle_list 872] rosnode:12273,12273 release empty list node pid 12273, group_id 1 [ 3057. 76380] [ascend] [ERROR] [drv_buff] [buff_req_ioctl_pid_add_group 443] rosnode:12297,12297 pid add group failed, pid:12297, grp_id:1, ret -512 [ 3057. 76382] [ascend] [drv_buff] [buff_ioctl 841] rosnode:12297,12297 buff_req_ioctl_handlers failed. ret:-512 [ 3057. 76452] Unable to handle kernel paging request at virtual address dead000000000108 [ 3057. 76454] Mem abort info: [ 3057. 76456] ESR = 0x96000044 [ 3057. 76457] Exception class = DABT (current EL), IL = 32 bits [ 3057. 76458] SET = 0, FnV = 0 [ 3057. 76459] EA = 0, S1PTW = 0 [ 3057. 76460] Data abort info: [ 3057. 76461] ISV = 0, ISS = 0x00000044 [ 3057. 76462] CM = 0, WnR = 1 [ 3057. 76463] [dead000000000108] address between user and kernel address ranges [ 3057. 76466] Internal error: Oops: 96000044 [#1] SMP [ 3057. 76469] Process rosnode (pid: 12308, stack limit = 0x0000000012aa85df) [ 3057. 76473] CPU: 10 PID: 12308 Comm: rosnode Tainted: P C O 4.19.95-1.h1.AOS2.0.aarch64 #1 [ 3057. 76474] Hardware name: evb (DT) [ 3057. 76476] pstate: 20400009 (nzCv daif +PAN -UAO) [ 3057. 76483] pc : sp_group_exit+0x94/0x130 [ 3057. 76486] lr : sp_group_exit+0x48/0x130 [ 3057. 76486] sp : ffff00001a163c10 [ 3057. 76487] pmr_save: 000000e0 [ 3057. 76489] x29: ffff00001a163c10 x28: ffff800887e2a940 [ 3057. 76491] x27: 0000000000000000 x26: ffff800d8098ca40 [ 3057. 76492] x25: ffff80089a879168 x24: ffff00001a163dd0 [ 3057. 76494] x23: 0000000000000000 x22: 0000000000000002 [ 3057. 76495] x21: ffff800896e73088 x20: ffff80089a879100 [ 3057. 76496] x19: ffff800896e73000 x18: ffff7e002ca9a4f4 [ 3057. 76498] x17: 0000000000000001 x16: 0000000000000001 [ 3057. 76499] x15: 0400000000000000 x14: ffff800bd5d0d050 [ 3057. 76500] x13: 0000000000000001 x12: 0000000000000000 [ 3057. 76502] x11: 0000000000000000 x10: 00000000000009e0 [ 3057. 76503] x9 : ffff00001a163a90 x8 : ffff800887e2b380 [ 3057. 76505] x7 : 00000000000000b4 x6 : 0000001b5b9081bb [ 3057. 76506] x5 : dead000000000100 x4 : dead000000000200 [ 3057. 76507] x3 : dead000000000100 x2 : dead000000000200 [ 3057. 76508] x1 : ffff800d81365400 x0 : ffff800896e73088 [ 3057. 76510] Call trace: [ 3057. 76513] sp_group_exit+0x94/0x130 [ 3057. 76517] mmput+0x20/0x170 [ 3057. 76519] do_exit+0x338/0xb38 [ 3057. 76520] do_group_exit+0x3c/0xe8 [ 3057. 76522] get_signal+0x14c/0x7d8 [ 3057. 76524] do_signal+0x88/0x290 [ 3057. 76525] do_notify_resume+0x150/0x3c8 [ 3057. 76528] work_pending+0x8/0x10 [ 3057. 76530] Code: d2804004 f2fbd5a5 f2fbd5a4 aa1503e0 (f9000462) [ 3057. 76534] [kbox] unable to set sctrl register, maybe the domain is not SD, continue [ 3057. 76535] [kbox] catch die event on cpu 10 [ 3057. 76537] [kbox] catch die event, start logging [ 3057. 76540] [kbox] die info:Oops:0044 [ 3057. 76540] [kbox] start to collect
If process A adds process B into an sp_group and B is killed in the meantime, the call to sp_group_add_task() for B fails and
list_del(&mm->sp_node);
is executed. Notice that the same code is also executed in sp_group_exit() for B, so mm->sp_node is removed from the list twice.
The addr of sp_node->next is LIST_POISON1, which is dead000000000108 in arm64.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 995db20a1d3b9..b44af9a7c233e 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3089,22 +3089,22 @@ void sp_group_exit(struct mm_struct *mm) * because the last owner of this mm is in exiting procedure: * do_exit() -> exit_mm() -> mmput() -> THIS function. */ - down_write(&spg->rw_lock); - if (spg_valid(spg) && atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) { + down_write(&spg->rw_lock); /* a dead group should NOT be reactive again */ - if (list_is_singular(&spg->procs)) + if (spg_valid(spg) && list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; - list_del(&mm->sp_node); /* affect spg->procs */ + if (mm->sp_group) /* concurrency handle of sp_group_add_task */ + list_del(&mm->sp_node); /* affect spg->procs */ up_write(&spg->rw_lock);
if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group); + /* match with get_task_mm() in sp_group_add_task() */ atomic_dec(&mm->mm_users); - return; } - up_write(&spg->rw_lock); }
void sp_group_post_exit(struct mm_struct *mm)
From: Peng Wu wupeng58@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------
Add a function for getting the node id, which can be used to allocate share pool memory on a specified memory node. The target node is carried in the DEVICE_ID bit field of the sp_flags argument; see the sketch below.
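An illustrative helper (not part of the patch) showing how a caller could encode the target device/node into sp_flags, matching the DEVICE_ID_SHIFT/DEVICE_ID_MASK definitions introduced below; sp_alloc_area() extracts the value again with (flags >> DEVICE_ID_SHIFT) & DEVICE_ID_MASK.

static unsigned long sp_flags_with_node(unsigned long sp_flags, int node_id)
{
	return sp_flags |
	       (((unsigned long)node_id & DEVICE_ID_MASK) << DEVICE_ID_SHIFT);
}

/* e.g. 2 MB of hugepage-backed share pool memory on node 1:
 *	sp_alloc(2 * 1024 * 1024, sp_flags_with_node(SP_HUGEPAGE, 1), spg_id);
 */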
Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: chenweilong chenweilong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 15 +++++++++--- mm/share_pool.c | 47 +++++++++++++++++++++++++++++++++----- 2 files changed, 53 insertions(+), 9 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 164efeb81889d..b0b2750e7bbe1 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -10,6 +10,10 @@ #define SP_HUGEPAGE (1 << 0) #define SP_HUGEPAGE_ONLY (1 << 1) #define SP_DVPP (1 << 2) +#define DEVICE_ID_MASK 0x3ff +#define DEVICE_ID_SHIFT 32 +#define SP_FLAG_MASK (SP_HUGEPAGE | SP_HUGEPAGE_ONLY | SP_DVPP | \ + (_AC(DEVICE_ID_MASK, UL) << DEVICE_ID_SHIFT))
#define SPG_ID_NONE -1 /* not associated with sp_group, only for specified thread */ #define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ @@ -22,7 +26,7 @@ #define SPG_ID_DVPP_PASS_THROUGH_MAX 899999 #define SPG_ID_DVPP_PASS_THROUGH 900000
-#define MAX_DEVID 1 /* the max num of Da-vinci devices */ +#define MAX_DEVID 2 /* the max num of Da-vinci devices */
/* to align the pointer to the (next) PMD boundary */ #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) @@ -54,9 +58,9 @@ extern bool vmap_allow_huge; * |-------------------- 8T -------------------|---|------ 8T ------------| * | Device 0 | Device 1 |...| | * |----------------------------------------------------------------------| - * |- 16G -|- 16G -|- 16G -|- 16G -| | | | | + * |------------- 16G -------------| 16G | | | * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | - * | svm | sp | svm | sp | | | | | + * | sp | sp | | | | | * |----------------------------------------------------------------------| * * The host SVM feature reserves 8T virtual memory by mmap, and due to the @@ -181,6 +185,7 @@ extern void sp_proc_stat_drop(struct sp_proc_stat *stat); extern void spa_overview_show(struct seq_file *seq); extern void spg_overview_show(struct seq_file *seq); extern void proc_sharepool_init(void); +extern int sp_node_id(struct vm_area_struct *vma);
static inline struct task_struct *sp_get_task(struct mm_struct *mm) { @@ -485,6 +490,10 @@ static inline void sp_free_pages(struct page *page, struct vm_struct *area) { }
+static inline int sp_node_id(struct vm_area_struct *vma) +{ + return numa_node_id(); +} #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/mm/share_pool.c b/mm/share_pool.c index b44af9a7c233e..90930e4a8dfe4 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -241,6 +241,7 @@ struct sp_area { struct mm_struct *mm; /* owner of k2u(task) */ unsigned long kva; /* shared kva */ pid_t applier; /* the original applier process */ + int node_id; /* memory node */ }; static DEFINE_SPINLOCK(sp_area_lock); static struct rb_root sp_area_root = RB_ROOT; @@ -863,11 +864,13 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, unsigned long vend = MMAP_SHARE_POOL_16G_START; unsigned long addr; unsigned long size_align = PMD_ALIGN(size); /* va aligned to 2M */ + int node_id = (flags >> DEVICE_ID_SHIFT) & DEVICE_ID_MASK;
if ((flags & SP_DVPP)) { if (sp_area_customized == false) { - vstart = MMAP_SHARE_POOL_16G_START; - vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; + vstart = MMAP_SHARE_POOL_16G_START + + node_id * MMAP_SHARE_POOL_16G_SIZE; + vend = vstart + MMAP_SHARE_POOL_16G_SIZE; } else { if (!spg) { pr_err_ratelimited("share pool: don't allow k2u(task) in host svm multiprocess scene\n"); @@ -878,7 +881,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, } }
- spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); + spa = __kmalloc_node(sizeof(struct sp_area), GFP_KERNEL, node_id); if (unlikely(!spa)) { pr_err_ratelimited("share pool: alloc spa failed due to lack of memory\n"); return ERR_PTR(-ENOMEM); @@ -973,6 +976,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->mm = NULL; spa->kva = 0; /* NULL pointer */ spa->applier = applier; + spa->node_id = node_id;
if (spa_inc_usage(type, size, (flags & SP_DVPP))) { err = ERR_PTR(-EINVAL); @@ -1379,7 +1383,7 @@ void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) return ERR_PTR(-EINVAL); }
- if (sp_flags & ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE | SP_DVPP)) { + if (sp_flags & (~SP_FLAG_MASK)) { pr_err_ratelimited("share pool: allocation failed, invalid flag %lx\n", sp_flags); return ERR_PTR(-EINVAL); } @@ -2606,7 +2610,8 @@ EXPORT_SYMBOL_GPL(sp_config_dvpp_range); static bool is_sp_normal_addr(unsigned long addr) { return addr >= MMAP_SHARE_POOL_START && - addr < MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; + addr < MMAP_SHARE_POOL_16G_START + + MAX_DEVID * MMAP_SHARE_POOL_16G_SIZE; }
/** @@ -2634,6 +2639,26 @@ bool is_sharepool_addr(unsigned long addr) } EXPORT_SYMBOL_GPL(is_sharepool_addr);
+int sp_node_id(struct vm_area_struct *vma) +{ + struct sp_area *spa; + int node_id = numa_node_id(); + + if (!enable_ascend_share_pool) + return node_id; + + if (vma) { + spa = __find_sp_area(vma->vm_start); + if (spa) { + node_id = spa->node_id; + __sp_area_drop(spa); + } + } + + return node_id; +} +EXPORT_SYMBOL_GPL(sp_node_id); + static int __init mdc_default_group(char *s) { enable_mdc_default_group = 1; @@ -2999,6 +3024,16 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, unsigned long haddr = address & huge_page_mask(h); bool new_page = false; int err; + int node_id; + struct sp_area *spa; + + spa = __find_sp_area(vma->vm_start); + if (!spa) { + pr_err("share pool: vma is invalid, not from sp mmap\n"); + return ret; + } + node_id = spa->node_id; + __sp_area_drop(spa);
retry: page = find_lock_page(mapping, idx); @@ -3010,7 +3045,7 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { page = alloc_huge_page_node(hstate_file(vma->vm_file), - numa_mem_id()); + node_id); if (!page) page = ERR_PTR(-ENOMEM); }
From: Peng Wu wupeng58@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
---------------------------------------------
In some scenarios, users need to specify the memory node on which shared memory is allocated.
Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: chenweilong chenweilong@huawei.com Reviewed-by: Tang Yizhou tangyizhou@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/shmem.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 90e75a0fa5bc6..90303da1061e5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -84,6 +84,7 @@ static struct vfsmount *shm_mnt; #include <asm/pgtable.h>
#include "internal.h" +#include <linux/share_pool.h>
#define BLOCKS_PER_PAGE (PAGE_SIZE/512) #define VM_ACCT(size) (PAGE_ALIGN(size) >> PAGE_SHIFT) @@ -1542,8 +1543,13 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, return page; }
+static int shmem_node_id(struct vm_area_struct *vma) +{ + return sp_node_id(vma); +} + static struct page *shmem_alloc_hugepage(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, int node_id) { struct vm_area_struct pvma; struct inode *inode = &info->vfs_inode; @@ -1566,7 +1572,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
shmem_pseudo_vma_init(&pvma, info, hindex); page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + HPAGE_PMD_ORDER, &pvma, 0, node_id, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); @@ -1574,13 +1580,14 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, }
static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) + struct shmem_inode_info *info, pgoff_t index, + int node_id) { struct vm_area_struct pvma; struct page *page;
shmem_pseudo_vma_init(&pvma, info, index); - page = alloc_page_vma(gfp, &pvma, 0); + page = alloc_page_vma_node(gfp, &pvma, 0, node_id); shmem_pseudo_vma_destroy(&pvma);
return page; @@ -1588,7 +1595,7 @@ static struct page *shmem_alloc_page(gfp_t gfp,
static struct page *shmem_alloc_and_acct_page(gfp_t gfp, struct inode *inode, - pgoff_t index, bool huge) + pgoff_t index, bool huge, int node_id) { struct shmem_inode_info *info = SHMEM_I(inode); struct page *page; @@ -1603,9 +1610,9 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, goto failed;
if (huge) - page = shmem_alloc_hugepage(gfp, info, index); + page = shmem_alloc_hugepage(gfp, info, index, node_id); else - page = shmem_alloc_page(gfp, info, index); + page = shmem_alloc_page(gfp, info, index, node_id); if (page) { __SetPageLocked(page); __SetPageSwapBacked(page); @@ -1654,7 +1661,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); + newpage = shmem_alloc_page(gfp, info, index, numa_node_id()); if (!newpage) return -ENOMEM;
@@ -1730,6 +1737,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, int error; int once = 0; int alloced = 0; + int node_id;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; @@ -1881,11 +1889,15 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, goto alloc_nohuge; }
+ node_id = shmem_node_id(vma); + alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); + page = shmem_alloc_and_acct_page(gfp, inode, index, true, + node_id); if (IS_ERR(page)) { -alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode, - index, false); +alloc_nohuge: + page = shmem_alloc_and_acct_page(gfp, inode, + index, false, node_id); } if (IS_ERR(page)) { int retry = 5; @@ -2377,7 +2389,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, }
if (!*pagep) { - page = shmem_alloc_page(gfp, info, pgoff); + page = shmem_alloc_page(gfp, info, pgoff, numa_node_id()); if (!page) goto out_unacct_blocks;
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
---------------------------------------------------
When mmput() is called concurrently, the check of "mm_users == 2" in sp_group_exit() is not atomic with the atomic_dec_and_test() in mmput(). The check may therefore never be true, and the mm is leaked.
For example, in a typical scenario a process has two threads, and an extra mmget() is performed in sp_group_add_task(), so mm_users is 3. When the two threads exit at the same time, the "mm_users == 2" check can fail in both exits.
Therefore, the check and the atomic_dec_and_test() are both placed under the spg rw_lock to serialize the whole sequence, as sketched below.
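A condensed sketch of the serialized exit path (the real change is in the diff below); MM_WOULD_FREE and the return-value convention for mmput() come from the patch, the function name here is illustrative.

static int sp_group_exit_sketch(struct mm_struct *mm, struct sp_group *spg)
{
	down_write(&spg->rw_lock);

	if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) {
		/* drop the reference taken in sp_group_add_task() while still
		 * holding the lock, so another mmput() cannot slip in between
		 * the check and the decrement */
		atomic_dec(&mm->mm_users);
		up_write(&spg->rw_lock);
		return 0;	/* mmput() continues and may free the mm */
	}

	if (atomic_dec_and_test(&mm->mm_users)) {
		up_write(&spg->rw_lock);
		WARN(1, "Invalid user counting\n");
		return 0;
	}

	up_write(&spg->rw_lock);
	return 1;	/* reference already dropped; mmput() returns early */
}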
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 5 +++-- kernel/fork.c | 3 ++- mm/share_pool.c | 25 ++++++++++++++++++++----- 3 files changed, 25 insertions(+), 8 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index b0b2750e7bbe1..c03b83beaf63c 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -156,7 +156,7 @@ static inline void sp_init_mm(struct mm_struct *mm) }
extern int sp_group_add_task(int pid, int spg_id); -extern void sp_group_exit(struct mm_struct *mm); +extern int sp_group_exit(struct mm_struct *mm); extern void sp_group_post_exit(struct mm_struct *mm); extern int sp_group_id_by_pid(int pid); extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); @@ -299,8 +299,9 @@ static inline int sp_group_add_task(int pid, int spg_id) return -EPERM; }
-static inline void sp_group_exit(struct mm_struct *mm) +static inline int sp_group_exit(struct mm_struct *mm) { + return 0; }
static inline void sp_group_post_exit(struct mm_struct *mm) diff --git a/kernel/fork.c b/kernel/fork.c index 22ed43ed527de..d091b680e5e62 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,7 +1086,8 @@ void mmput(struct mm_struct *mm) { might_sleep();
- sp_group_exit(mm); + if (sp_group_exit(mm)) + return;
if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); diff --git a/mm/share_pool.c b/mm/share_pool.c index 90930e4a8dfe4..61bbbd772c847 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3110,14 +3110,20 @@ EXPORT_SYMBOL(sharepool_no_page);
#define MM_WOULD_FREE 2
-void sp_group_exit(struct mm_struct *mm) +int sp_group_exit(struct mm_struct *mm) { struct sp_group *spg = mm->sp_group; bool is_alive = true;
if (!spg || !enable_ascend_share_pool) - return; + return 0;
+ /* + * The judgment of mm->mm_users == MM_WOULD_FREE and atomic_dec_and_test + * must be atomic. Otherwise, mm->mm_users == MM_WOULD_FREE may never be + * true due to the gap in the middle. + */ + down_write(&spg->rw_lock); /* * Recall we add mm->users by 1 deliberately in sp_group_add_task(). * If the mm_users is 2, it means that the mm is ready to be freed @@ -3125,21 +3131,30 @@ void sp_group_exit(struct mm_struct *mm) * do_exit() -> exit_mm() -> mmput() -> THIS function. */ if (atomic_read(&mm->mm_users) == MM_WOULD_FREE) { - down_write(&spg->rw_lock); /* a dead group should NOT be reactive again */ if (spg_valid(spg) && list_is_singular(&spg->procs)) is_alive = spg->is_alive = false; if (mm->sp_group) /* concurrency handle of sp_group_add_task */ list_del(&mm->sp_node); /* affect spg->procs */ + /* match with get_task_mm() in sp_group_add_task() */ + atomic_dec(&mm->mm_users); up_write(&spg->rw_lock);
if (!is_alive) blocking_notifier_call_chain(&sp_notifier_chain, 0, mm->sp_group);
- /* match with get_task_mm() in sp_group_add_task() */ - atomic_dec(&mm->mm_users); + return 0; } + + if (atomic_dec_and_test(&mm->mm_users)) { + up_write(&spg->rw_lock); + WARN(1, "Invalid user counting\n"); + return 0; + } + + up_write(&spg->rw_lock); + return 1; }
void sp_group_post_exit(struct mm_struct *mm)
From: Peng Wu wupeng58@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------
Set an initial value for the variable node_id in shmem_getpage_gfp(). Otherwise, an Oops is triggered in some scenarios, e.g. paths that jump to alloc_nohuge before node_id has been assigned.
[20987.530901] Internal error: Oops: 96000007 [#1] SMP [20987.541162] Modules linked in: cfg80211 rfkill ib_isert iscsi_target_mod rpcrdma ib_srpt target_core_mod dm_mirror dm_region_hash ib_srp scsi_transport_srp dm_log sunrpc dm_mod ib_ipoib rdma_ucm ib_uverbs ib_iser ib_umad rdma_cm ib_cm iw_cm aes_ce_blk crypto_simd cryptd hns_roce_hw_v2 aes_ce_cipher ghash_ce hns_roce sha1_ce ib_core sg ipmi_ssif hi_sfc sbsa_gwdt mtd sch_fq_codel ip_tables realtek hclge hinic sha2_ce sha256_arm64 hns3 ipmi_si hisi_sas_v3_hw hibmc_drm host_edma_drv hnae3 hisi_sas_main ipmi_devintf ipmi_msghandler [20987.639396] Process move_pages03 (pid: 40173, stack limit = 0x00000000804b9d00) [20987.654773] CPU: 50 PID: 40173 Comm: move_pages03 Kdump: loaded Not tainted 4.19.195+ #1 [20987.671794] Hardware name: Huawei TaiShan 2280 V2/BC82AMDD, BIOS 1.08 12/14/2019 [20987.687355] pstate: 80400009 (Nzcv daif +PAN -UAO) [20987.697433] pc : __alloc_pages_nodemask+0x7c/0xdc0 [20987.707510] lr : alloc_pages_vma+0xac/0x318 [20987.716304] sp : ffff0001537cb690 [20987.723268] x29: ffff0001537cb690 x28: 00000000006200ca [20987.734439] x27: 0000000000000000 x26: ffff802fd24439c8 [20987.745610] x25: 0000000000000000 x24: 00000000ffff0000 [20987.756782] x23: 0000000000000000 x22: 0000000000000000 [20987.767952] x21: 00000000ffff0000 x20: ffff000009b69000 [20987.779123] x19: ffff802fd24439c8 x18: 0000000000000000 [20987.790294] x17: 0000000000000000 x16: 0000000000000000 [20987.801466] x15: 0000000000000000 x14: 0000000000000000 [20987.812637] x13: 0000000000000000 x12: 0000000000000000 [20987.823808] x11: ffff000009b69748 x10: 0000000000000040 [20987.834978] x9 : 0000000000000000 x8 : ffff0001537cb978 [20987.846149] x7 : 0000000000000000 x6 : 000000000000003f [20987.857320] x5 : 0000000000000000 x4 : 00000000007fffff [20987.868491] x3 : ffff000009b6c998 x2 : 0000000000000000 [20987.879662] x1 : 0000000000250015 x0 : ffff000009b69788 [20987.890833] Call trace: [20987.895970] __alloc_pages_nodemask+0x7c/0xdc0 [20987.905312] alloc_pages_vma+0xac/0x318 [20987.913374] shmem_alloc_page+0x6c/0xc0 [20987.921436] shmem_alloc_and_acct_page+0x124/0x1f8 [20987.931510] shmem_getpage_gfp+0x16c/0x1028 [20987.940305] shmem_fault+0x94/0x2a0 [20987.947636] __do_fault+0x50/0x220 [20987.954784] do_shared_fault+0x28/0x228 [20987.962846] __handle_mm_fault+0x610/0x8f0 [20987.971457] handle_mm_fault+0xe4/0x1d8 [20987.979520] do_page_fault+0x210/0x4f8 [20987.987398] do_translation_fault+0xa8/0xbc [20987.996192] do_mem_abort+0x68/0x118 [20988.003706] el0_da+0x24/0x28 [20988.009941] Code: b9404c64 72a004a1 b9401062 0a04039c (f875d800)
Fixes: d3edfd4f60bae ("share_pool: Alloc shared memory on a specified memory node") Signed-off-by: Peng Wu wupeng58@huawei.com Reviewed-by: 为珑 陈 chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 90303da1061e5..8d32d49a4d7ba 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1737,7 +1737,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, int error; int once = 0; int alloced = 0; - int node_id; + int node_id = shmem_node_id(vma);
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; @@ -1889,7 +1889,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, goto alloc_nohuge; }
- node_id = shmem_node_id(vma);
alloc_huge: page = shmem_alloc_and_acct_page(gfp, inode, index, true,
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We found a concurrency problem between sp_group_add_task() and sp_free() which leads to a memory leak.
If, after process A calls __sp_free() and vfs_fallocate() but before it calls __sp_area_drop(), process B is added to the same group by a manager process, the *dead* spa freed by sp_free() may be mapped into process B again, and then do_mm_populate() is called.
When sp_group_add_task() finishes, this spa is dropped and can no longer be seen in /proc/sharepool/spa_stat, but the memory backing the spa still resides in the group. It can only be freed when the group dies.
To fix the problem, we add a member is_dead to spa, which may only be accessed while spg->rw_lock is held. This may sound a little strange until one realizes that the life cycle of a spa is directly tied to its sp group; the protocol is sketched below.
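A sketch of the resulting protocol; spa, spg and the is_dead field come from the diff below, the function names here are made up.

/* freeing side (sp_free() / sp_unshare_uva()): mark the spa dead under the
 * group lock before unmapping its memory */
static void sp_mark_spa_dead(struct sp_area *spa)
{
	down_write(&spa->spg->rw_lock);
	spa->is_dead = true;
	up_write(&spa->spg->rw_lock);
}

/* sp_group_add_task() side: while remapping existing spas into the newly
 * added process, skip any spa that has already been marked dead */
static bool sp_spa_usable(const struct sp_area *spa)
{
	return !spa->is_dead;
}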
Suggested-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huwei.com Reviewed-by: 为珑 陈 chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 61bbbd772c847..3e4e05f20c5ce 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -233,6 +233,7 @@ struct sp_area { unsigned long region_vstart; /* belong to normal region or DVPP region */ unsigned long flags; bool is_hugepage; + bool is_dead; atomic_t use_count; /* How many vmas use this VA region */ struct rb_node rb_node; /* address sorted rbtree */ struct list_head link; /* link to the spg->head */ @@ -736,6 +737,10 @@ int sp_group_add_task(int pid, int spg_id) prev = spa;
atomic_inc(&spa->use_count); + + if (spa->is_dead == true) + continue; + spin_unlock(&sp_area_lock);
if (spa->type == SPA_TYPE_K2SPG && spa->kva) { @@ -970,6 +975,7 @@ static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, spa->region_vstart = vstart; spa->flags = flags; spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->is_dead = false; spa->spg = spg; atomic_set(&spa->use_count, 1); spa->type = type; @@ -1271,10 +1277,14 @@ int sp_free(unsigned long addr) goto drop_spa; }
- if (!spg_valid(spa->spg)) + down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + up_write(&spa->spg->rw_lock); goto drop_spa; - - sp_dump_stack(); + } + /* the life cycle of spa has a direct relation with sp group */ + spa->is_dead = true; + up_write(&spa->spg->rw_lock);
down_read(&spa->spg->rw_lock);
@@ -1303,6 +1313,7 @@ int sp_free(unsigned long addr) drop_spa: __sp_area_drop(spa); out: + sp_dump_stack(); sp_try_to_compact(); return ret; } @@ -2362,15 +2373,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
- down_read(&spa->spg->rw_lock); - if (!spg_valid(spa->spg)) { - up_read(&spa->spg->rw_lock); - pr_info_ratelimited("share pool: no need to unshare uva(to group), " - "sp group of spa is dead\n"); - goto out_clr_flag; - } - up_read(&spa->spg->rw_lock); - /* alway allow kthread and dvpp channel destroy procedure */ if (current->mm && current->mm->sp_group != spa->spg) { pr_err_ratelimited("share pool: unshare uva(to group) failed, " @@ -2379,6 +2381,17 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp goto out_drop_area; }
+ down_write(&spa->spg->rw_lock); + if (!spg_valid(spa->spg)) { + up_write(&spa->spg->rw_lock); + pr_info_ratelimited("share pool: no need to unshare uva(to group), " + "sp group of spa is dead\n"); + goto out_clr_flag; + } + /* the life cycle of spa has a direct relation with sp group */ + spa->is_dead = true; + up_write(&spa->spg->rw_lock); + down_read(&spa->spg->rw_lock); __sp_free(spa->spg, uva_aligned, size_aligned, NULL); up_read(&spa->spg->rw_lock);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
atomic64_t is backed by long on arm64 and by long long on arm32, so cast the value to long long and print it with %lld.
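A minimal example of the portable form (illustrative only):

#include <linux/atomic.h>
#include <linux/printk.h>

static void print_alloc_size_kb(atomic64_t *alloc_size)
{
	/* the cast makes %lld correct on both arm64 and arm32 */
	pr_info("alloc size: %lld KB\n",
		(long long)atomic64_read(alloc_size) >> 10);
}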
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/oom_kill.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b67676c0d9a10..19b0b266437c4 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -489,9 +489,9 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) if (!stat) pr_cont("%-9c %-9c ", '-', '-'); else { - pr_cont("%-9ld %-9ld ", /* byte to KB */ - atomic64_read(&stat->alloc_size) >> 10, - atomic64_read(&stat->k2u_size) >> 10); + pr_cont("%-9lld %-9lld ", /* byte to KB */ + (long long)atomic64_read(&stat->alloc_size) >> 10, + (long long)atomic64_read(&stat->k2u_size) >> 10); sp_proc_stat_drop(stat); } pr_cont("%8ld %8lu %5hd %s\n",
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
We found a hung task problem when doing direct compaction in __alloc_pages_nodemask():
vmalloc_hugepage_user -> __vmalloc_node_range -> __vmalloc_area_node -> sp_alloc_pages -> alloc_huge_page_node -> alloc_fresh_huge_page -> __alloc_pages_nodemask.
Set PF_MEMALLOC so that direct reclaim and direct compaction won't be entered; the pattern is sketched below.
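A sketch of the pattern used by the fix; hugetlb_alloc_hugepage() and HUGETLB_ALLOC_NONE are the ascend-specific helpers already used in this file, the wrapper name is illustrative.

#include <linux/sched/mm.h>

static struct page *sp_alloc_hugepage_noreclaim(void)
{
	unsigned int noreclaim_flag;
	struct page *page;

	/* sets PF_MEMALLOC: the page allocator will skip direct reclaim
	 * and direct compaction, so the allocation cannot hang there */
	noreclaim_flag = memalloc_noreclaim_save();
	page = hugetlb_alloc_hugepage(NUMA_NO_NODE, HUGETLB_ALLOC_NONE);
	memalloc_noreclaim_restore(noreclaim_flag);

	return page;
}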
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 3e4e05f20c5ce..e5a71e25964c1 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3219,9 +3219,15 @@ void sp_group_post_exit(struct mm_struct *mm) struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, unsigned int page_order, int node) { - if (area->flags & VM_HUGE_PAGES) - return hugetlb_alloc_hugepage(NUMA_NO_NODE, HUGETLB_ALLOC_NONE); - else + struct page *page; + unsigned int noreclaim_flag = 0; + + if (area->flags & VM_HUGE_PAGES) { + noreclaim_flag = memalloc_noreclaim_save(); + page = hugetlb_alloc_hugepage(NUMA_NO_NODE, HUGETLB_ALLOC_NONE); + memalloc_noreclaim_restore(noreclaim_flag); + return page; + } else return alloc_pages_node(node, mask, page_order); }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Make memory compaction configurable: trigger it through sp_try_to_compact(), which honors the sharepool_compact_enable sysctl.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index e5a71e25964c1..ea2ea37111b9c 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -3226,6 +3226,7 @@ struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, noreclaim_flag = memalloc_noreclaim_save(); page = hugetlb_alloc_hugepage(NUMA_NO_NODE, HUGETLB_ALLOC_NONE); memalloc_noreclaim_restore(noreclaim_flag); + sp_try_to_compact(); return page; } else return alloc_pages_node(node, mask, page_order);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Convert these messages to pr_debug() so that only root can enable them via dynamic debug.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index ea2ea37111b9c..ebc2d04c01145 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -1601,7 +1601,7 @@ static int is_vmap_hugepage(unsigned long addr)
area = find_vm_area((void *)addr); if (unlikely(!area)) { - pr_err_ratelimited("share pool: failed to find vm area(%lx)\n", addr); + pr_debug("share pool: failed to find vm area(%lx)\n", addr); return -EINVAL; }
@@ -1659,7 +1659,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
ret_addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(ret_addr)) { - pr_err("share pool: k2u mmap failed %lx\n", ret_addr); + pr_debug("share pool: k2u mmap failed %lx\n", ret_addr); goto put_mm; } BUG_ON(ret_addr != spa->va_start); @@ -1915,7 +1915,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size,
if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { up_read(&spg->rw_lock); - pr_err("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); + pr_debug("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); goto out_drop_spa; }
@@ -1938,7 +1938,7 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, } else { /* associate vma and spa */ if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) - pr_warn("share pool: %s: the kva %lx is not valid\n", + pr_debug("share pool: %s: the kva %lx is not valid\n", __func__, (unsigned long)kva_aligned); }
@@ -2032,7 +2032,7 @@ static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, struct sp_walk_data *sp_walk_data;
if (unlikely(!pte_present(pte))) { - pr_err_ratelimited("share pool: the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); + pr_debug("share pool: the page of addr %lx unexpectedly not in RAM\n", (unsigned long)addr); return -EFAULT; }
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Release sp_stat_sem (up_write) in the failure branch of sp_init_proc_stat(); otherwise the semaphore is never released on that path.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/mm/share_pool.c b/mm/share_pool.c index ebc2d04c01145..607a5f0097a41 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -135,6 +135,7 @@ static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk, up_write(&sp_stat_sem); return stat; } else { + up_write(&sp_stat_sem); /* if enter this branch, that's our mistake */ pr_err_ratelimited("share pool: proc stat invalid id %d\n", id); return ERR_PTR(-EBUSY);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Free the id at the end of sp_group_add_task() on failure. Benefits: 1. Locks are held for a shorter time. 2. There is no risk of forgetting to free the id in some error-handling branch.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 607a5f0097a41..6a4da9ac83e14 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -664,10 +664,8 @@ int sp_group_add_task(int pid, int spg_id)
rcu_read_unlock(); if (ret) { - if (id_newly_generated) - free_sp_group_id((unsigned int)spg_id); up_write(&sp_group_sem); - goto out_unlock; + goto out_free_id; }
/* @@ -695,10 +693,8 @@ int sp_group_add_task(int pid, int spg_id)
spg = find_or_alloc_sp_group(spg_id); if (IS_ERR(spg)) { - ret = PTR_ERR(spg); - if (id_newly_generated) - free_sp_group_id((unsigned int)spg_id); up_write(&sp_group_sem); + ret = PTR_ERR(spg); goto out_put_mm; }
@@ -817,7 +813,9 @@ int sp_group_add_task(int pid, int spg_id) mmput(mm); out_put_task: put_task_struct(tsk); -out_unlock: +out_free_id: + if (unlikely(ret) && id_newly_generated) + free_sp_group_id((unsigned int)spg_id); return ret == 0 ? spg_id : ret; } EXPORT_SYMBOL_GPL(sp_group_add_task);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
e80000600000-e80000603000 rw-s 00600000 00:05 1025 /sp_group_1 (deleted) Size: 12 kB KernelPageSize: 4 kB MMUPageSize: 4 kB Rss: 0 kB Pss: 0 kB Shared_Clean: 0 kB Shared_Dirty: 0 kB Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 0 kB Anonymous: 0 kB LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB Private_Hugetlb: 0 kB Swap: 0 kB SwapPss: 0 kB Locked: 0 kB THPeligible: 0 VmFlags: rd wr sh mr mw me ms pf io dc de nr dd sp ~~
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/task_mmu.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 0417343481cd4..8b8129d658e04 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -667,6 +667,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) #endif /* CONFIG_ARCH_HAS_PKEYS */ #ifdef CONFIG_USERSWAP [ilog2(VM_USWAP)] = "us", +#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + [ilog2(VM_SHARE_POOL)] = "sp", #endif }; size_t i;
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
Once the sp group has been created, the generated id will be freed in sp_group_drop(). Before that point, we should call free_sp_group_id() ourselves when an error occurs.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 6a4da9ac83e14..2d9c0a8916211 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -349,6 +349,12 @@ static void free_sp_group_id(unsigned int spg_id) ida_free(&sp_group_id_ida, spg_id); }
+static void free_new_spg_id(bool new, int spg_id) +{ + if (new) + free_sp_group_id(spg_id); +} + static void free_sp_group(struct sp_group *spg) { fput(spg->file); @@ -665,7 +671,8 @@ int sp_group_add_task(int pid, int spg_id) rcu_read_unlock(); if (ret) { up_write(&sp_group_sem); - goto out_free_id; + free_new_spg_id(id_newly_generated, spg_id); + goto out; }
/* @@ -682,12 +689,14 @@ int sp_group_add_task(int pid, int spg_id) */ mm = get_task_mm(tsk->group_leader); if (!mm) { - ret = -ESRCH; up_write(&sp_group_sem); + ret = -ESRCH; + free_new_spg_id(id_newly_generated, spg_id); goto out_put_task; } else if (mm->sp_group) { - ret = -EEXIST; up_write(&sp_group_sem); + ret = -EEXIST; + free_new_spg_id(id_newly_generated, spg_id); goto out_put_mm; }
@@ -695,6 +704,7 @@ int sp_group_add_task(int pid, int spg_id) if (IS_ERR(spg)) { up_write(&sp_group_sem); ret = PTR_ERR(spg); + free_new_spg_id(id_newly_generated, spg_id); goto out_put_mm; }
@@ -813,9 +823,7 @@ int sp_group_add_task(int pid, int spg_id) mmput(mm); out_put_task: put_task_struct(tsk); -out_free_id: - if (unlikely(ret) && id_newly_generated) - free_sp_group_id((unsigned int)spg_id); +out: return ret == 0 ? spg_id : ret; } EXPORT_SYMBOL_GPL(sp_group_add_task);
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The situation below is not allowed:
int *result = mmap(ADDR, sizeof(int), PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);
As the share pool uses an independent UVA allocation algorithm, it may produce an address that conflicts with a user-specified address. The kernel-side guard is sketched below.
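A condensed form of the guard added in the hunks below; enable_ascend_share_pool, is_sharepool_addr(), check_aoscore_process() and MAP_SHARE_POOL all appear in the patch, while the helper name here is illustrative.

static bool sp_mmap_addr_conflicts(unsigned long addr, unsigned long flags)
{
	/* reject hinted or fixed mappings that fall inside the share pool
	 * range, unless the caller is an AOS core process or explicitly
	 * asked for a share pool mapping */
	return enable_ascend_share_pool && is_sharepool_addr(addr) &&
	       !check_aoscore_process(current) && !(flags & MAP_SHARE_POOL);
}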
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/share_pool.h | 14 ++++++++++++++ mm/mmap.c | 12 ++++++++++++ mm/mremap.c | 4 ++++ mm/share_pool.c | 38 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index c03b83beaf63c..9650f257b3ad7 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -292,6 +292,9 @@ static inline void sp_free_pages(struct page *page, struct vm_struct *area) __free_pages(page, is_vmalloc_huge(area->flags) ? PMD_SHIFT - PAGE_SHIFT : 0); }
+extern bool sp_check_addr(unsigned long addr); +extern bool sp_check_mmap_addr(unsigned long addr, unsigned long flags); + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -495,6 +498,17 @@ static inline int sp_node_id(struct vm_area_struct *vma) { return numa_node_id(); } + +static inline bool sp_check_addr(unsigned long addr) +{ + return false; +} + +static inline bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) +{ + return false; +} + #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 9c9a4a98abb21..cb4268f2cf5ab 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2354,6 +2354,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (flags & MAP_FIXED) return addr;
+ if (sp_check_mmap_addr(addr, flags)) + return -EINVAL; + if (addr) { addr = PAGE_ALIGN(addr);
@@ -2405,6 +2408,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (flags & MAP_FIXED) return addr;
+ if (sp_check_mmap_addr(addr, flags)) + return -EINVAL; + /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr); @@ -3111,6 +3117,9 @@ int vm_munmap(unsigned long start, size_t len) struct mm_struct *mm = current->mm; LIST_HEAD(uf);
+ if (sp_check_addr(start)) + return -EINVAL; + if (down_write_killable(&mm->mmap_sem)) return -EINTR;
@@ -3129,6 +3138,9 @@ int do_vm_munmap(struct mm_struct *mm, unsigned long start, size_t len) if (mm == NULL) return -EINVAL;
+ if (sp_check_addr(start)) + return -EINVAL; + if (down_write_killable(&mm->mmap_sem)) return -EINTR;
diff --git a/mm/mremap.c b/mm/mremap.c index 2ac9eaa041d95..238c169dc9694 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,7 @@ #include <linux/uaccess.h> #include <linux/mm-arch-hooks.h> #include <linux/userfaultfd_k.h> +#include <linux/share_pool.h>
#include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -534,6 +535,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, if (offset_in_page(addr)) return ret;
+ if (sp_check_addr(addr) || sp_check_addr(new_addr)) + return ret; + old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len);
diff --git a/mm/share_pool.c b/mm/share_pool.c index 2d9c0a8916211..eb5eaa3e0d05a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -58,6 +58,8 @@ #define byte2mb(size) ((size) >> 20) #define page2kb(page_num) ((page_num) << (PAGE_SHIFT - 10))
+#define PF_DOMAIN_CORE 0x10000000 /* AOS CORE processes in sched.h */ + /* mdc scene hack */ static int __read_mostly enable_mdc_default_group; static const int mdc_default_group_id = 1; @@ -334,6 +336,14 @@ static inline void check_interrupt_context(void) panic("share_pool: can't be used in interrupt context\n"); }
+static inline bool check_aoscore_process(struct task_struct *tsk) +{ + if (tsk->flags & PF_DOMAIN_CORE) + return true; + else + return false; +} + static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, struct sp_area *spa, unsigned long *populate); static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); @@ -675,6 +685,14 @@ int sp_group_add_task(int pid, int spg_id) goto out; }
+ if (check_aoscore_process(tsk)) { + up_write(&sp_group_sem); + ret = -EACCES; + free_new_spg_id(id_newly_generated, spg_id); + sp_dump_stack(); + goto out_put_task; + } + /* * group_leader: current thread may be exiting in a multithread process * @@ -3030,6 +3048,26 @@ void __init proc_sharepool_init(void)
/*** End of tatistical and maintenance functions ***/
+bool sp_check_addr(unsigned long addr) +{ + if (enable_ascend_share_pool && is_sharepool_addr(addr) && + !check_aoscore_process(current)) { + sp_dump_stack(); + return true; + } else + return false; +} + +bool sp_check_mmap_addr(unsigned long addr, unsigned long flags) +{ + if (enable_ascend_share_pool && is_sharepool_addr(addr) && + !check_aoscore_process(current) && !(flags & MAP_SHARE_POOL)) { + sp_dump_stack(); + return true; + } else + return false; +} + vm_fault_t sharepool_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx,
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4EUVI CVE: NA
-------------------------------------------------
The share pool address check in the do_mmap() path should be placed before the MAP_FIXED early return; otherwise MAP_FIXED requests bypass the check entirely.
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Weilong Chen chenweilong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/mmap.c b/mm/mmap.c index cb4268f2cf5ab..4c8092b2f26fd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2351,12 +2351,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM;
- if (flags & MAP_FIXED) - return addr; - if (sp_check_mmap_addr(addr, flags)) return -EINVAL;
+ if (flags & MAP_FIXED) + return addr; + if (addr) { addr = PAGE_ALIGN(addr);
@@ -2405,12 +2405,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM;
- if (flags & MAP_FIXED) - return addr; - if (sp_check_mmap_addr(addr, flags)) return -EINVAL;
+ if (flags & MAP_FIXED) + return addr; + /* requesting a specific address */ if (addr) { addr = PAGE_ALIGN(addr);