From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
mm->owner is currently only used by MEMCG, but the ascend share pool feature will use it later, so make it a general feature (CONFIG_MM_OWNER) and select it from CONFIG_MEMCG.
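For illustration, a minimal sketch (not part of this patch) of how a feature that selects MM_OWNER could look up the owning task. The helper name example_mm_owner() is hypothetical, and the access pattern is modeled on how memcg uses mm->owner today:

#include <linux/mm_types.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/rcupdate.h>

/* Hypothetical helper: resolve the canonical owner task of an mm. */
static struct task_struct *example_mm_owner(struct mm_struct *mm)
{
#ifdef CONFIG_MM_OWNER
    struct task_struct *owner;

    rcu_read_lock();
    /* may be cleared concurrently by mm_clear_owner()/mm_update_next_owner() */
    owner = READ_ONCE(mm->owner);
    if (owner)
        get_task_struct(owner);
    rcu_read_unlock();
    return owner;        /* caller must put_task_struct() when done */
#else
    return NULL;         /* owner tracking not compiled in */
#endif
}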
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mm_types.h | 2 +-
 init/Kconfig             | 1 +
 kernel/exit.c            | 4 ++--
 kernel/fork.c            | 4 ++--
 mm/Kconfig               | 4 ++++
 mm/debug.c               | 2 +-
 6 files changed, 11 insertions(+), 6 deletions(-)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 178e9dee217a..fcfa9a75c18e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -453,7 +453,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. All of the following must be true in diff --git a/init/Kconfig b/init/Kconfig index d1427ae4de9e..6880b55901bb 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -705,6 +705,7 @@ config MEMCG bool "Memory controller" select PAGE_COUNTER select EVENTFD + select MM_OWNER help Provides control over the memory footprint of tasks in a cgroup.
diff --git a/kernel/exit.c b/kernel/exit.c index 891d65e3ffd5..4d6f941712b6 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -392,7 +392,7 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) } }
-#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -478,7 +478,7 @@ void mm_update_next_owner(struct mm_struct *mm) task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MEMCG */ +#endif /* CONFIG_MM_OWNER */
/* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 768fe41a7ee3..1ac49d1852cf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -958,7 +958,7 @@ static void mm_init_aio(struct mm_struct *mm) static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif @@ -966,7 +966,7 @@ static __always_inline void mm_clear_owner(struct mm_struct *mm,
static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER mm->owner = p; #endif } diff --git a/mm/Kconfig b/mm/Kconfig index dddeb30d645e..76c2197a3f99 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -299,6 +299,10 @@ config VIRT_TO_BUS deprecated interface virt_to_bus(). All new architectures should probably not select this.
+config MM_OWNER
+	bool "Enable tracking of the mm owner"
+	help
+	  This option enables each mm_struct to have an owner.
config MMU_NOTIFIER bool diff --git a/mm/debug.c b/mm/debug.c index 362ce581671e..2da184b16bce 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -129,7 +129,7 @@ void dump_mm(const struct mm_struct *mm) #ifdef CONFIG_AIO "ioctx_table %px\n" #endif -#ifdef CONFIG_MEMCG +#ifdef CONFIG_MM_OWNER "owner %px " #endif "exe_file %px\n"
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
This is a preparation patch for the share pool: export new functions to vmalloc huge pages and to vmap huge pages into virtually contiguous space.

The new header file share_pool.h is mainly used by the share pool feature; it exports the sp_xxx functions when the ASCEND_SHARE_POOL config is enabled and does nothing by default.
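As a usage illustration only (the mmap handler and the names below are hypothetical, not part of the patch), the new exports could back a device mmap handler roughly like this, assuming the vma start is PMD aligned as required by remap_vmalloc_hugepage_range_partial():

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Hypothetical mmap handler built on the new hugepage vmalloc helpers. */
static int sp_example_mmap(struct file *file, struct vm_area_struct *vma)
{
    unsigned long size = vma->vm_end - vma->vm_start;
    void *kbuf;
    int ret;

    /* zeroed, PMD-aligned hugepage-backed memory; VM_USERMAP is set for us */
    kbuf = vmalloc_hugepage_user(size);
    if (!kbuf)
        return -ENOMEM;

    /* map the whole hugepage area into the calling process */
    ret = remap_vmalloc_hugepage_range(vma, kbuf, 0);
    if (ret)
        vfree(kbuf);    /* real code would also track kbuf for a later vfree() */
    return ret;
}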
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/share_pool.h | 354 +++++++++++++++++++++++++++++++++++++
 include/linux/vmalloc.h    |   9 +
 mm/vmalloc.c               | 237 ++++++++++++++++++++++++-
 3 files changed, 596 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/share_pool.h
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h new file mode 100644 index 000000000000..09afbae33d41 --- /dev/null +++ b/include/linux/share_pool.h @@ -0,0 +1,354 @@ +#ifndef LINUX_SHARE_POOL_H +#define LINUX_SHARE_POOL_H + +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> + +#define SP_HUGEPAGE (1 << 0) +#define SP_HUGEPAGE_ONLY (1 << 1) +#define SP_DVPP (1 << 2) + +#define SPG_ID_NONE -1 /* not associated with sp_group, only for specified thread */ +#define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ +#define SPG_ID_MIN 1 /* valid id should be >= 1 */ +#define SPG_ID_MAX 99999 +#define SPG_ID_AUTO_MIN 100000 +#define SPG_ID_AUTO_MAX 199999 +#define SPG_ID_AUTO 200000 /* generate group id automatically */ +#define SPG_ID_DVPP_PASS_THROUGH_MIN 800000 +#define SPG_ID_DVPP_PASS_THROUGH_MAX 899999 +#define SPG_ID_DVPP_PASS_THROUGH 900000 + +#define MAX_DEVID 1 /* the max num of Da-vinci devices */ + +#define VM_HUGE_PAGES 0x00001000 /* use for huge pages */ + +/* to align the pointer to the (next) PMD boundary */ +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) + +/* test whether an address (unsigned long or pointer) is aligned to PMD_SIZE */ +#define PMD_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PMD_SIZE) + +extern int sysctl_share_pool_hugepage_enable; + +extern int sysctl_ac_mode; + +extern int enable_ascend_share_pool; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |- 16G -|- 16G -|- 16G -|- 16G -| | | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | svm | sp | svm | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default memory setting for DVPP. 
+ */ +struct sp_group { + int id; + struct file *file; + struct file *file_hugetlb; + /* list head of processes */ + struct list_head procs; + /* list of sp_area */ + struct list_head spa_list; + /* number of sp_area */ + atomic_t spa_num; + /* total size of all sp_area from sp_alloc and k2u(spg) */ + atomic_t size; + /* record the number of hugepage allocation failures */ + int hugepage_failures; + /* is_alive == false means it's being destroyed */ + bool is_alive; + /* we define the creator process of a sp_group as owner */ + struct task_struct *owner; + /* dvpp_multi_spaces == true means multiple dvpp 16G spaces are set */ + bool dvpp_multi_spaces; + unsigned long dvpp_va_start; + unsigned long dvpp_size; + atomic_t use_count; +}; + +struct sp_walk_data { + struct page **pages; + unsigned int page_count; + unsigned long uva_aligned; + unsigned long page_size; + bool is_hugepage; +}; + +#ifdef CONFIG_ASCEND_SHARE_POOL + +#define MAP_SHARE_POOL 0x100000 + +#define MMAP_TOP_4G_SIZE 0x100000000UL + +/* 8T size */ +#define MMAP_SHARE_POOL_NORMAL_SIZE 0x80000000000UL +/* 8T size*/ +#define MMAP_SHARE_POOL_DVPP_SIZE 0x80000000000UL +/* 16G size */ +#define MMAP_SHARE_POOL_16G_SIZE 0x400000000UL +#define MMAP_SHARE_POOL_SIZE (MMAP_SHARE_POOL_NORMAL_SIZE + MMAP_SHARE_POOL_DVPP_SIZE) +/* align to 2M hugepage size, and MMAP_SHARE_POOL_TOP_16G_START should be align to 16G */ +#define MMAP_SHARE_POOL_END ((TASK_SIZE - MMAP_SHARE_POOL_DVPP_SIZE) & ~((1 << 21) - 1)) +#define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_SIZE) +#define MMAP_SHARE_POOL_16G_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_DVPP_SIZE) + +static inline void sp_init_mm(struct mm_struct *mm) +{ + mm->sp_group = NULL; + INIT_LIST_HEAD(&mm->sp_node); + mm->sp_stat_id = 0; +} + +extern int sp_group_add_task(int pid, int spg_id); +extern void sp_group_exit(struct mm_struct *mm); +extern void sp_group_post_exit(struct mm_struct *mm); +extern int sp_group_id_by_pid(int pid); +extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); +extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); + +extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id); +extern int sp_free(unsigned long addr); +extern void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id); +extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); +extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id); + +extern void sp_area_drop(struct vm_area_struct *vma); + +extern int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); +extern void sp_walk_page_free(struct sp_walk_data *sp_walk_data); + +extern int sp_register_notifier(struct notifier_block *nb); +extern int sp_unregister_notifier(struct notifier_block *nb); +extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); +extern bool is_sharepool_addr(unsigned long addr); +extern void proc_sharepool_init(void); + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + if (enable_ascend_share_pool) + return mm->owner; + else + return current; +} + +static inline bool sp_check_hugepage(struct page *p) +{ + if (enable_ascend_share_pool && PageHuge(p)) + return true; + + return false; +} + +static inline bool sp_is_enabled(void) +{ + return enable_ascend_share_pool ? 
true : false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & VM_HUGE_PAGES)) + return true; + + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ + if (enable_ascend_share_pool) + info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); +} + +extern struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node); + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ + if (PageHuge(page)) + put_page(page); + else + __free_pages(page, area->page_order); +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_SHARE_POOL)) + return true; + + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return !!(enable_ascend_share_pool && (vma->vm_flags & VM_HUGE_SPECIAL)); +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & MAP_SHARE_POOL)) + return true; + + return false; +} + +#else + +static inline int sp_group_add_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_group_exit(struct mm_struct *mm) +{ +} + +static inline void sp_group_post_exit(struct mm_struct *mm) +{ +} + +static inline int sp_group_id_by_pid(int pid) +{ + return -EPERM; +} + +static inline int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return -EPERM; +} + +static inline void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id) +{ + return NULL; +} + +static inline int sp_free(unsigned long addr) +{ + return -EPERM; +} + +static inline void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} + +static inline void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} +static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_init_mm(struct mm_struct *mm) +{ +} + +static inline void sp_area_drop(struct vm_area_struct *vma) +{ +} + +static inline int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} +static inline int sp_register_notifier(struct notifier_block *nb) +{ + return -EPERM; +} + +static inline int sp_unregister_notifier(struct notifier_block *nb) +{ + return -EPERM; +} +static inline bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return false; +} + +static inline bool is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline void proc_sharepool_init(void) +{ +} + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + return current; +} +static inline bool sp_check_hugepage(struct page *p) +{ + return false; +} + +static inline bool sp_is_enabled(void) +{ + return false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ +} + +static inline struct page *sp_alloc_pages(void *area, gfp_t mask, + unsigned int page_order, int node) +{ + return NULL; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ +} + 
+static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + return false; +} +#endif + +#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 07b4b1141ed8..244eedb7591a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -95,6 +95,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, @@ -123,6 +125,13 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, void vmalloc_sync_mappings(void); void vmalloc_sync_unmappings(void);
+extern void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); +extern int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long size); +extern int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, + void *addr, unsigned long pgoff); /* * Lowlevel-APIs (not for driver use!) */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2fec803edc90..78f56e719e1d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -33,6 +33,7 @@ #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -478,6 +479,37 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; }
+static int vmap_hugepages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> page_shift; + + for (i = 0; i < nr; i++) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_hugepages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + int err; + + err = vmap_hugepages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -589,6 +621,22 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_page);
+/* + * Walk a hugepage vmap address to the struct page it maps. + * return the head page that corresponds to the base page address. + */ +struct page *vmalloc_to_hugepage(const void *vmalloc_addr) +{ + struct page *huge; + + huge = vmalloc_to_page(vmalloc_addr); + if (huge && PageHuge(huge)) + return huge; + else + return NULL; +} +EXPORT_SYMBOL(vmalloc_to_hugepage); + /* * Map a vmalloc()-space virtual address to the physical page frame number. */ @@ -2243,7 +2291,12 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + unsigned long align = 1; + + if (sp_check_vm_huge_page(flags)) + align = PMD_SIZE; + + return __get_vm_area_node(size, align, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); }
@@ -2327,7 +2380,10 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i];
BUG_ON(!page); - __free_pages(page, area->page_order); + if (sp_is_enabled()) + sp_free_pages(page, area); + else + __free_pages(page, area->page_order); }
kvfree(area->pages); @@ -2452,6 +2508,43 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap);
+/** + * vmap_hugepag - map an array of huge pages into virtually contiguous space + * @pages: array of huge page pointers + * @count: number of pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + unsigned long size; /* In bytes */ + + might_sleep(); + + if (count > totalram_pages) + return NULL; + + size = (unsigned long)count << PMD_SHIFT; + area = get_vm_area_caller(size, flags, __builtin_return_address(0)); + if (!area) + return NULL; + + if (vmap_hugepages_range((unsigned long)area->addr, + (unsigned long)area->addr + size, prot, + pages, PMD_SHIFT) < 0) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap_hugepage); + static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); @@ -2494,7 +2587,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; int p;
- page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order); + if (sp_is_enabled()) + page = sp_alloc_pages(area, alloc_mask|highmem_mask, + page_order, node); + else + page = alloc_pages_node(node, alloc_mask|highmem_mask, + page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; @@ -2562,7 +2660,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */
size_per_node = size; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE && !sp_is_enabled()) size_per_node /= num_online_nodes(); if (size_per_node >= PMD_SIZE) { shift = PMD_SHIFT; @@ -2825,6 +2923,55 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + /* 2M hugepa aligned */ + size = PMD_ALIGN(size); + + ret = __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + + /* * small helper routine , copy contents to buf from addr. * If the page is not present, fill zero. @@ -3150,6 +3297,85 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range);
+/** + * remap_vmalloc_hugepage_range_partial - map vmalloc hugepages + * to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc hugepage kernel memory + * @size: size of map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, unsigned long size) +{ + struct vm_struct *area; + + size = PMD_ALIGN(size); + + if (!PMD_ALIGNED(uaddr) || !PMD_ALIGNED(kaddr)) + return -EINVAL; + + area = find_vm_area(kaddr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (kaddr + size > area->addr + get_vm_area_size(area)) + return -EINVAL; + + do { + struct page *page = vmalloc_to_hugepage(kaddr); + int ret; + + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PMD_SIZE; + kaddr += PMD_SIZE; + size -= PMD_SIZE; + } while (size > 0); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range_partial); + +/** + * remap_vmalloc_hugepage_range - map vmalloc hugepages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of hugepages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_hugepage_range_partial(vma, vma->vm_start, + addr + (pgoff << PMD_SHIFT), + vma->vm_end - vma->vm_start); +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range); + /* * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose * not to have one. @@ -3611,6 +3837,9 @@ static int s_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages");
+ if (sp_is_enabled()) + seq_printf(m, " order=%d", v->page_order); + show_numa_info(m, v); seq_putc(m, '\n'); return 0;
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
mm->sp_group is mainly used to find the group which owns the mm, the group uses mm->sp_node to link and look up its member mm_structs, and mm->sp_stat_id is used for collecting memory statistics.

These changes affect and break the kABI, but only when the ASCEND_SHARE_POOL config is enabled.
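Purely illustrative (the helper name is hypothetical): with the new fields, share pool code can walk every member mm of a group through sp_group->procs and mm->sp_node, for example:

#include <linux/list.h>
#include <linux/mm_types.h>
#include <linux/printk.h>
#include <linux/share_pool.h>

#ifdef CONFIG_ASCEND_SHARE_POOL
/* Hypothetical helper: visit each mm that has joined the given group. */
static void sp_example_for_each_mm(struct sp_group *spg)
{
    struct mm_struct *mm;

    /* assumed to run under the share pool group lock */
    list_for_each_entry(mm, &spg->procs, sp_node)
        pr_info("sp_group %d: member with stat id %d\n",
                spg->id, mm->sp_stat_id);
}
#endif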
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/mm_types.h | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fcfa9a75c18e..886ceb0f91e3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -468,6 +468,11 @@ struct mm_struct { #endif struct user_namespace *user_ns;
+#ifdef CONFIG_ASCEND_SHARE_POOL + struct sp_group *sp_group; + struct list_head sp_node; /* link to sp_group->procs */ + int sp_stat_id; +#endif /* store ref to file /proc/<pid>/exe symlink points to */ struct file __rcu *exe_file; #ifdef CONFIG_MMU_NOTIFIER
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
do_mmap()/mmap_region()/__mm_populate() can only operate on the current process, but the share pool now needs to create memory mappings for other processes as well, so export new functions that take the target mm/process explicitly. This does not break the current logic and is only used by the share pool.

The share pool also needs to remap vmalloc pages to user space, so introduce hugetlb_insert_hugepage() to support hugepage remapping.
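A sketch of the calling pattern this enables (only __do_mmap() and do_mm_populate() come from this patch; the wrapper below is hypothetical): creating and populating a mapping in an explicit mm instead of current->mm.

#include <linux/err.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/mman.h>

/* Hypothetical wrapper: map and populate a region in another task's mm. */
static unsigned long sp_example_map_other(struct mm_struct *mm, struct file *file,
                                          unsigned long va, unsigned long size)
{
    unsigned long populate = 0;
    unsigned long addr;
    LIST_HEAD(uf);

    if (down_write_killable(&mm->mmap_sem))
        return -EINTR;
    /* like do_mmap(), but against an explicit mm instead of current->mm */
    addr = __do_mmap(mm, file, va, size, PROT_READ | PROT_WRITE,
                     MAP_SHARED | MAP_POPULATE, 0, 0, &populate, &uf);
    up_write(&mm->mmap_sem);

    if (!IS_ERR_VALUE(addr) && populate)
        do_mm_populate(mm, addr, populate, 0);   /* fault pages in for that mm */
    return addr;
}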
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/hugetlb.h |  4 ++-
 include/linux/mm.h      | 25 +++++++++++++-
 mm/gup.c                | 28 ++++++++++-----
 mm/hugetlb.c            | 42 +++++++++++++++++++++++
 mm/memory.c             |  7 +++-
 mm/mmap.c               | 76 +++++++++++++++++++++++++++++++++++------
 6 files changed, 160 insertions(+), 22 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 870c29d8b8e4..bd658f44e133 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -382,8 +382,10 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping, const struct hstate *hugetlb_get_hstate(void); struct page *hugetlb_alloc_hugepage(int nid); int hugetlb_insert_hugepage_pte(struct mm_struct *mm, unsigned long addr, - pgprot_t prot, struct page *hpage); + pgprot_t prot, struct page *hpage); #endif +int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, + struct page *hpage, pgprot_t prot);
/* arch callback */ int __init __alloc_bootmem_huge_page(struct hstate *h); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0ff3de89f897..35c2225c6e57 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -230,6 +230,10 @@ extern unsigned int kobjsize(const void *objp); #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #define VM_PA32BIT 0x400000000 /* Physical address is within 4G */
+#ifdef CONFIG_ASCEND_SHARE_POOL +#define VM_HUGE_SPECIAL 0x800000000 /* Special hugepage flag used by share pool */ +#endif + #ifdef CONFIG_COHERENT_DEVICE #define VM_CDM 0x100000000 /* Contains coherent device memory */ #endif @@ -240,11 +244,13 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS @@ -260,6 +266,12 @@ extern unsigned int kobjsize(const void *objp); #endif #endif /* CONFIG_ARCH_HAS_PKEYS */
+#if defined(CONFIG_ASCEND_SHARE_POOL) +# define VM_SHARE_POOL VM_HIGH_ARCH_5 +#else +# define VM_SHARE_POOL VM_NONE +#endif + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) @@ -559,7 +571,7 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags, /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); - +struct page *vmalloc_to_hugepage(const void *addr); /* * Determine if an address is within the vmalloc range * @@ -2344,6 +2356,10 @@ extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); +extern unsigned long __do_mmap(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, unsigned long prot, + unsigned long flags, vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf);
@@ -2364,14 +2380,21 @@ static inline void mm_populate(unsigned long addr, unsigned long len) /* Ignore errors */ (void) __mm_populate(addr, len, 1); } +extern int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors); #else static inline void mm_populate(unsigned long addr, unsigned long len) {} +int do_mm_populate(struct mm_struct *mm, unsigned long addr, unsigned long len, + int ignore_errors) +{ +} #endif
/* These take the mm semaphore themselves */ extern int __must_check vm_brk(unsigned long, unsigned long); extern int __must_check vm_brk_flags(unsigned long, unsigned long, unsigned long); extern int vm_munmap(unsigned long, size_t); +extern int do_vm_munmap(struct task_struct *tsk, unsigned long start, size_t len); extern unsigned long __must_check vm_mmap(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); diff --git a/mm/gup.c b/mm/gup.c index 3fc585282f24..707c374d4f6b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -16,6 +16,7 @@ #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> +#include <linux/share_pool.h>
#include <asm/mmu_context.h> #include <asm/pgtable.h> @@ -1355,6 +1356,7 @@ long populate_vma_page_range(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; unsigned long nr_pages = (end - start) / PAGE_SIZE; int gup_flags; + struct task_struct *tsk;
VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -1380,24 +1382,22 @@ long populate_vma_page_range(struct vm_area_struct *vma, if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) gup_flags |= FOLL_FORCE;
+ tsk = sp_get_task(mm); /* * We made sure addr is within a VMA, so the following will * not result in a stack expansion that recurses back here. */ - return __get_user_pages(current, mm, start, nr_pages, gup_flags, + return __get_user_pages(tsk, mm, start, nr_pages, gup_flags, NULL, NULL, nonblocking); }
/* - * __mm_populate - populate and/or mlock pages within a range of address space. - * - * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap - * flags. VMAs must be already marked with the desired vm_flags, and - * mmap_sem must not be held. + * do_mm_populate - populate and/or mlock pages within a range of + * address space for the specified mm_struct. */ -int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, + int ignore_errors) { - struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; int locked = 0; @@ -1448,6 +1448,18 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) return ret; /* 0 or negative error code */ }
+/* + * __mm_populate - populate and/or mlock pages within a range of address space. + * + * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap + * flags. VMAs must be already marked with the desired vm_flags, and + * mmap_sem must not be held. + */ +int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) +{ + return do_mm_populate(current->mm, start, len, ignore_errors); +} + /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 225dcf7536ae..9d2035632aed 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -25,6 +25,7 @@ #include <linux/swap.h> #include <linux/swapops.h> #include <linux/jhash.h> +#include <linux/share_pool.h>
#include <asm/page.h> #include <asm/pgtable.h> @@ -3961,6 +3962,12 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, }
page = alloc_huge_page(vma, haddr, 0); + if (IS_ERR(page) && sp_check_vm_share_pool(vma->vm_flags)) { + page = alloc_huge_page_node(hstate_file(vma->vm_file), + numa_mem_id()); + if (!page) + page = ERR_PTR(-ENOMEM); + } if (IS_ERR(page)) { /* * Returning error will result in faulting task being @@ -5265,6 +5272,41 @@ int hugetlb_insert_hugepage_pte_by_pa(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(hugetlb_insert_hugepage_pte_by_pa);
+int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, + struct page *hpage, pgprot_t prot) +{ + struct hstate *h = hstate_vma(vma); + int anon_rmap = 0; + spinlock_t *ptl; + pte_t *ptep; + pte_t pte; + struct mm_struct *mm = vma->vm_mm; + + ptep = hugetlb_huge_pte_alloc(mm, addr, huge_page_size(h)); + if (!ptep) + return -ENXIO; + + get_page(hpage); + + ptl = huge_pte_lock(h, mm, ptep); + if (anon_rmap) { + ClearPagePrivate(hpage); + hugepage_add_new_anon_rmap(hpage, vma, addr); + } else { + page_dup_rmap(hpage, true); + } + + pte = make_huge_pte(vma, hpage, ((vma->vm_flags & VM_WRITE) + && (vma->vm_flags & VM_SHARED))); + set_huge_pte_at(mm, addr, ptep, pte); + + hugetlb_count_add(pages_per_huge_page(h), mm); + + spin_unlock(ptl); + + return 0; +} + #ifdef CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES
static int __init ascend_enable_charge_migrate_hugepages(char *s) diff --git a/mm/memory.c b/mm/memory.c index 7503203c8436..e369f3961ad2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -70,6 +70,7 @@ #include <linux/dax.h> #include <linux/oom.h> #include <linux/ktask.h> +#include <linux/share_pool.h>
#include <asm/io.h> #include <asm/mmu_context.h> @@ -1529,7 +1530,11 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } - return insert_page(vma, addr, page, vma->vm_page_prot); + + if (sp_check_hugepage(page)) + return hugetlb_insert_hugepage(vma, addr, page, vma->vm_page_prot); + else + return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page);
diff --git a/mm/mmap.c b/mm/mmap.c index 00702331afc1..e1069c42ec8e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -46,6 +46,7 @@ #include <linux/pkeys.h> #include <linux/oom.h> #include <linux/sched/mm.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/cacheflush.h> @@ -178,6 +179,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); + sp_area_drop(vma); vm_area_free(vma); return next; } @@ -1119,6 +1121,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL;
+ /* don't merge this kind of vma as sp_area couldn't be merged */ + if (sp_check_vm_share_pool(vm_flags)) + return NULL; + if (prev) next = prev->vm_next; else @@ -1373,16 +1379,20 @@ int unregister_mmap_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(unregister_mmap_notifier); #endif
+static unsigned long __mmap_region(struct mm_struct *mm, + struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf); + /* * The caller must hold down_write(¤t->mm->mmap_sem). */ -unsigned long do_mmap(struct file *file, unsigned long addr, - unsigned long len, unsigned long prot, - unsigned long flags, vm_flags_t vm_flags, - unsigned long pgoff, unsigned long *populate, - struct list_head *uf) +unsigned long __do_mmap(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + vm_flags_t vm_flags, unsigned long pgoff, + unsigned long *populate, struct list_head *uf) { - struct mm_struct *mm = current->mm; int pkey = 0;
*populate = 0; @@ -1407,6 +1417,10 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr);
+ /* the MAP_DVPP couldn't work with MAP_SHARE_POOL */ + if ((flags & MAP_DVPP) && sp_mmap_check(flags)) + return -EINVAL; + /* Careful about overflows.. */ len = PAGE_ALIGN(len); if (!len) @@ -1568,7 +1582,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, if (is_set_cdmmask()) vm_flags |= ((numanode << CHECKNODE_BITS) & CHECKNODE_MASK);
- addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); + addr = __mmap_region(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) @@ -1576,6 +1590,17 @@ unsigned long do_mmap(struct file *file, unsigned long addr, return addr; }
+/* + * The caller must hold down_write(¤t->mm->mmap_sem). + */ +unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, vm_flags_t vm_flags, + unsigned long pgoff, unsigned long *populate, struct list_head *uf) +{ + return __do_mmap(current->mm, file, addr, len, prot, flags, vm_flags, pgoff, populate, uf); +} + + unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) @@ -1716,11 +1741,11 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; }
-unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) +static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, + unsigned long addr, unsigned long len, + vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) { - struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; int error; struct rb_node **rb_link, *rb_parent; @@ -1882,6 +1907,13 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return error; }
+unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, + unsigned long pgoff, struct list_head *uf) +{ + return __mmap_region(current->mm, file, addr, len, vm_flags, pgoff, uf); +} + unsigned long unmapped_area(struct vm_unmapped_area_info *info) { /* @@ -2133,6 +2165,8 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + return vm_unmapped_area(&info); } #endif @@ -2183,6 +2217,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + addr = vm_unmapped_area(&info);
/* @@ -2200,6 +2236,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
+ sp_area_work_around(&info); + addr = vm_unmapped_area(&info); }
@@ -2871,6 +2909,22 @@ int vm_munmap(unsigned long start, size_t len) } EXPORT_SYMBOL(vm_munmap);
+int do_vm_munmap(struct task_struct *tsk, unsigned long start, size_t len) +{ + int ret; + struct mm_struct *mm = tsk->mm; + LIST_HEAD(uf); + + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + + ret = do_munmap(mm, start, len, &uf); + up_write(&mm->mmap_sem); + userfaultfd_unmap_complete(mm, &uf); + return ret; +} +EXPORT_SYMBOL(do_vm_munmap); + SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr);
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
Expose the per-task sp_group state via /proc/<pid>/sp_group so users can determine the status of the task's sp_group.
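For example, user space could read the new file like this (a trivial sketch; the output format is whatever proc_sp_group_state() prints and is not specified here):

#include <stdio.h>

/* Print the share pool group state of a task (or of "self" by default). */
int main(int argc, char **argv)
{
    char path[64], line[256];
    FILE *fp;

    snprintf(path, sizeof(path), "/proc/%s/sp_group",
             argc > 1 ? argv[1] : "self");
    fp = fopen(path, "r");    /* only exists with CONFIG_ASCEND_SHARE_POOL */
    if (!fp) {
        perror(path);
        return 1;
    }
    while (fgets(line, sizeof(line), fp))
        fputs(line, stdout);
    fclose(fp);
    return 0;
}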
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/proc/base.c | 7 +++++++
 1 file changed, 7 insertions(+)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 5e705fa9a913..b78875fa78f4 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -94,6 +94,7 @@ #include <linux/sched/stat.h> #include <linux/flex_array.h> #include <linux/posix-timers.h> +#include <linux/share_pool.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3040,6 +3041,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", S_IRUGO, proc_sp_group_state), +#endif };
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3418,6 +3422,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_LIVEPATCH ONE("patch_state", S_IRUSR, proc_pid_patch_state), #endif +#ifdef CONFIG_ASCEND_SHARE_POOL + ONE("sp_group", S_IRUGO, proc_sp_group_state), +#endif };
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
Add new sysctl nodes: /proc/sys/kernel/share_pool_hugepage_enable and /proc/sys/vm/sharepool_ac_mode (the latter is added to vm_table).
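A minimal user-space sketch of toggling the hugepage switch (the accepted values are 0 and 1 per the sysctl table entry; the writer program itself is illustrative):

#include <stdio.h>

/* Enable share pool hugepages by writing "1" to the new sysctl node. */
int main(void)
{
    FILE *fp = fopen("/proc/sys/kernel/share_pool_hugepage_enable", "w");

    if (!fp) {
        perror("share_pool_hugepage_enable");
        return 1;
    }
    fputs("1\n", fp);    /* 0: disable, 1: enable */
    fclose(fp);
    return 0;
}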
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/sysctl.c | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 665c9e2a8802..61e62f1ccee4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,7 @@ #include <linux/bpf.h> #include <linux/mount.h> #include <linux/pipe_fs_i.h> +#include <linux/share_pool.h>
#include "../lib/kstrtox.h"
@@ -1242,6 +1243,18 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#ifdef CONFIG_ASCEND_SHARE_POOL + { + /* 0: disable, 1: enable */ + .procname = "share_pool_hugepage_enable", + .data = &sysctl_share_pool_hugepage_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif { } };
@@ -1713,6 +1726,17 @@ static struct ctl_table vm_table[] = { .extra1 = (void *)&mmap_rnd_compat_bits_min, .extra2 = (void *)&mmap_rnd_compat_bits_max, }, +#endif +#ifdef CONFIG_ASCEND_SHARE_POOL + { + .procname = "sharepool_ac_mode", + .data = &sysctl_ac_mode, + .maxlen = sizeof(sysctl_ac_mode), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } };
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
proc_sharepool_init() should be called from proc_root_init() when the share pool feature is enabled.

is_vm_huge_special() is used to identify VMAs that carry the VM_HUGE_SPECIAL flag so the page walker can handle them; it is not used by default.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/proc/root.c | 2 ++
 mm/pagewalk.c  | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/proc/root.c b/fs/proc/root.c index f4b1a9d2eca6..33f9e1a627cc 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -23,6 +23,7 @@ #include <linux/pid_namespace.h> #include <linux/parser.h> #include <linux/cred.h> +#include <linux/share_pool.h>
#include "internal.h"
@@ -140,6 +141,7 @@ void __init proc_root_init(void) proc_tty_init(); proc_mkdir("bus", NULL); proc_sys_init(); + proc_sharepool_init();
register_filesystem(&proc_fs_type); } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 3c0930d94a29..9dd747151f03 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,6 +3,7 @@ #include <linux/highmem.h> #include <linux/sched.h> #include <linux/hugetlb.h> +#include <linux/share_pool.h>
static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -178,7 +179,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; - struct hstate *h = hstate_vma(vma); + struct hstate *h = is_vm_huge_special(vma) ? &default_hstate : hstate_vma(vma); unsigned long next; unsigned long hmask = huge_page_mask(h); unsigned long sz = huge_page_size(h); @@ -247,7 +248,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, int err = 0; struct vm_area_struct *vma = walk->vma;
- if (vma && is_vm_hugetlb_page(vma)) { + if (vma && ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma))) { if (walk->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
fork() creates a new mm for the new process; the new mm must not inherit any share pool information from the parent process, so clean it up.

exit() calls mmput() on the mm and frees the memory; if the mm has already been added to an sp_group, the group must be cleaned up first.
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/fork.c | 9 +++++++++
 1 file changed, 9 insertions(+)
diff --git a/kernel/fork.c b/kernel/fork.c index 1ac49d1852cf..b5f9a36fa4eb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -91,6 +91,7 @@ #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> +#include <linux/share_pool.h>
#include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -1023,6 +1024,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, goto fail_nocontext;
mm->user_ns = get_user_ns(user_ns); + + sp_init_mm(mm); + return mm;
fail_nocontext: @@ -1051,11 +1055,16 @@ static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users));
+ sp_group_exit(mm); + uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); + + sp_group_post_exit(mm); + mm_put_huge_zero_page(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) {
From: Ding Tianhong <dingtianhong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
The share pool is a large feature, mainly used to share user virtual memory between different processes in the same group. It is used in the following steps (see the usage sketch after this description):

1. Process A creates a new group, which is owned by process A.
2. Process A adds process B to the group.
3. Process A adds process C to the same group.
4. Process B allocates new memory at a VA and writes something into it.
5. The VA is sent to process C by IPC, and process C receives it.
6. Process C accesses the VA and gets the data directly.
7. Process A can add more processes to the group to share the memory.
8. The memory is freed by using the free function or by exiting the group.

The new feature is enabled by both CONFIG_ASCEND_SHARE_POOL and the enable_ascend_share_pool flag; it does not affect anything when disabled.
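An illustrative kernel-side sketch of the workflow above, using the interfaces declared in share_pool.h earlier in this series (the wrapper function, the error handling, and the 2MB size are assumptions for the example):

#include <linux/err.h>
#include <linux/errno.h>
#include <linux/share_pool.h>

/* Hypothetical driver helper: share one hugepage buffer among three tasks. */
static int sp_example_share(int pid_a, int pid_b, int pid_c)
{
    int ret, spg_id;
    void *va;

    /* steps 1-3: create a group owned by A, then add B and C to it */
    ret = sp_group_add_task(pid_a, SPG_ID_AUTO);
    if (ret < 0)
        return ret;
    spg_id = sp_group_id_by_pid(pid_a);    /* look up the auto-generated id */
    if (spg_id < 0)
        return spg_id;
    ret = sp_group_add_task(pid_b, spg_id);
    if (ret < 0)
        return ret;
    ret = sp_group_add_task(pid_c, spg_id);
    if (ret < 0)
        return ret;

    /* step 4: allocate shared, hugepage-backed memory for the group */
    va = sp_alloc(2 * 1024 * 1024, SP_HUGEPAGE, spg_id);
    if (IS_ERR_OR_NULL(va))
        return -ENOMEM;

    /*
     * steps 5-7: the VA is handed to the other member processes by IPC
     * and can be accessed by each of them directly.
     */

    /* step 8: release the memory when done */
    return sp_free((unsigned long)va);
}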
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Wu Peng <wupeng58@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig |    9 +
 mm/Makefile        |    1 +
 mm/share_pool.c    | 2278 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2288 insertions(+)
 create mode 100644 mm/share_pool.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e379fca3c8ad..be2f23c8fdf9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1541,6 +1541,15 @@ config ASCEND_AUTO_TUNING_HUGEPAGE help The hugepage auto-tuning means the kernel dynamically manages the number of huage pages. To achieve this purpose, custom interfaces are required. + +config ASCEND_SHARE_POOL + bool "Enable support for the Share Pool Memory" + default n + select ARCH_USES_HIGH_VMA_FLAGS + select MM_OWNER + help + This feature allows multiple processes to share virtual memory both + in kernel and user level, which is only enabled for ascend platform. endif
endmenu diff --git a/mm/Makefile b/mm/Makefile index d71892c0bbf1..eb9545fbb20d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -106,3 +106,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o +obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o diff --git a/mm/share_pool.c b/mm/share_pool.c new file mode 100644 index 000000000000..fcbc831f7f8c --- /dev/null +++ b/mm/share_pool.c @@ -0,0 +1,2278 @@ +/* + * Huawei Ascend Share Pool Memory + * + * Copyright (C) 2020 Huawei Limited + * Author: Tang Yizhou tangyizhou@huawei.com + * Zefan Li lizefan@huawei.com + * Wu Peng wupeng58@huawei.com + * Ding Tianhong dingtgianhong@huawei.com + * Zhou Guanghui zhouguanghui1@huawei.com + * Li Ming limingming.li@huawei.com + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/share_pool.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> +#include <linux/mm_types.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/shmem_fs.h> +#include <linux/file.h> +#include <linux/printk.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/atomic.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/idr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* access control mode macros */ +#define AC_NONE 0 +#define AC_SINGLE_OWNER 1 + +#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) +#define ESPGMMEXIT 4000 + +#define byte2kb(size) ((size) / 1024) + +/* mdc scene hack */ +int enable_mdc_default_group; +static const int mdc_default_group_id = 1; + +/* access control mode */ +int sysctl_ac_mode = AC_NONE; + +/* idr of all sp_groups */ +static DEFINE_IDR(sp_group_idr); + +static DEFINE_MUTEX(sp_mutex); + +static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain); + +static DEFINE_IDA(sp_group_id_ida); + +/*** Statistical and maintenance tools ***/ + +/* idr of all sp_proc_stats */ +static DEFINE_IDR(sp_stat_idr); + +/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + long amount; +}; + +/* for kthread buff_module_guard_work */ +static struct sp_proc_stat kthread_stat = {0}; + +/* The caller must hold sp_mutex. 
*/ +static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) +{ + struct sp_proc_stat *stat; + int id = tsk->mm->sp_stat_id; + int tgid = tsk->tgid; + int ret; + + if (id) { + stat = idr_find(&sp_stat_idr, id); + /* other threads in the same process may have initialized it */ + if (stat) + return stat; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (stat == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc proc stat failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + + stat->amount = 0; + get_task_comm(stat->comm, tsk); + ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: proc stat idr alloc failed %d\n", ret); + kfree(stat); + return ERR_PTR(ret); + } + + tsk->mm->sp_stat_id = ret; + return stat; +} + +/* statistics of all sp area, protected by sp_area_lock */ +struct sp_spa_stat { + unsigned int total_num; + unsigned int alloc_num; + unsigned int k2u_task_num; + unsigned int k2u_spg_num; + unsigned long total_size; + unsigned long alloc_size; + unsigned long k2u_task_size; + unsigned long k2u_spg_size; +}; + +static struct sp_spa_stat spa_stat = {0}; + +/* statistics of all sp group born from sp_alloc and k2u(spg) */ +struct sp_spg_stat { + atomic_t spa_total_num; + atomic_t spa_total_size; +}; + +static struct sp_spg_stat spg_stat = {0}; + +/*** Global share pool VA allocator ***/ + +enum spa_type { + SPA_TYPE_ALLOC = 1, + SPA_TYPE_K2TASK, + SPA_TYPE_K2SPG, +}; + +/* + * We bump the reference when each mmap succeeds, and it will be dropped + * when vma is about to release, so sp_area object will be automatically + * freed when all tasks in the sp group has exited. + */ +struct sp_area { + unsigned long va_start; + unsigned long va_end; /* va_end always align to hugepage */ + unsigned long real_size; /* real size with alignment */ + bool is_hugepage; + atomic_t use_count; /* How many vmas use this VA region */ + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head link; /* link to the spg->head */ + struct sp_group *spg; + enum spa_type type; /* where spa born from */ +}; +static DEFINE_SPINLOCK(sp_area_lock); +static struct rb_root sp_area_root = RB_ROOT; +bool host_svm_sp_enable = false; + +int sysctl_share_pool_hugepage_enable = 1; + +static unsigned long spa_size(struct sp_area *spa) +{ + return spa->real_size; +} + +static struct file *spa_file(struct sp_area *spa) +{ + if (spa->is_hugepage) + return spa->spg->file_hugetlb; + else + return spa->spg->file; +} + +/* the caller should hold sp_area_lock */ +static int spa_inc_usage(enum spa_type type, unsigned long size) +{ + /* + * all the calculations won't overflow due to system limitation and + * parameter checking in sp_alloc_area() + */ + spa_stat.total_num += 1; + spa_stat.total_size += size; + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num += 1; + spa_stat.alloc_size += size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num += 1; + spa_stat.k2u_task_size += size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num += 1; + spa_stat.k2u_spg_size += size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + return -EINVAL; + } + return 0; +} + +/* the caller should hold sp_area_lock */ +static int spa_dec_usage(enum spa_type type, unsigned long size) +{ + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num -= 1; + spa_stat.alloc_size -= size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num -= 1; + spa_stat.k2u_task_size 
-= size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num -= 1; + spa_stat.k2u_spg_size -= size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + spin_unlock(&sp_area_lock); + return -EINVAL; + } + spa_stat.total_num -= 1; + spa_stat.total_size -= size; + return 0; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate); + +static void free_sp_group(struct sp_group *spg) +{ + fput(spg->file); + fput(spg->file_hugetlb); + idr_remove(&sp_group_idr, spg->id); + if ((spg->id >= SPG_ID_AUTO_MIN && spg->id <= SPG_ID_AUTO_MAX) || + (spg->id >= SPG_ID_DVPP_PASS_THROUGH_MIN && + spg->id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) + ida_free(&sp_group_id_ida, (unsigned int)spg->id); + kfree(spg); +} + +/* The caller must hold sp_mutex. */ +static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + int ret = 0; + + if (spg_id == SPG_ID_DEFAULT) { + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) + return NULL; + + spg = tsk->mm->sp_group; + put_task_struct(tsk); + } else { + spg = idr_find(&sp_group_idr, spg_id); + } + + return spg; +} + +int sp_group_id_by_pid(int pid) +{ + struct sp_group *spg; + int spg_id = -ENODEV; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) + spg_id = spg->id; + + mutex_unlock(&sp_mutex); + return spg_id; +} +EXPORT_SYMBOL_GPL(sp_group_id_by_pid); + +/* The caller must hold sp_mutex. */ +static struct sp_group *find_or_alloc_sp_group(int spg_id) +{ + struct sp_group *spg; + int ret; + char name[20]; + + spg = idr_find(&sp_group_idr, spg_id); + if (!spg) { + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + spg = kzalloc(sizeof(*spg), GFP_KERNEL); + if (spg == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc spg failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + spg->id = spg_id; + atomic_set(&spg->spa_num, 0); + atomic_set(&spg->size, 0); + spg->is_alive = true; + spg->hugepage_failures = 0; + spg->dvpp_multi_spaces = false; + spg->owner = current->group_leader; + atomic_set(&spg->use_count, 0); + INIT_LIST_HEAD(&spg->procs); + INIT_LIST_HEAD(&spg->spa_list); + + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, + GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: create group idr alloc failed\n"); + goto out_kfree; + } + + sprintf(name, "sp_group_%d", spg_id); + spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE); + if (IS_ERR(spg->file)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for small page failed %ld\n", + PTR_ERR(spg->file)); + ret = PTR_ERR(spg->file); + goto out_idr; + } + + spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(spg->file_hugetlb)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for hugepage failed %ld\n", + PTR_ERR(spg->file_hugetlb)); + ret = PTR_ERR(spg->file_hugetlb); + goto out_fput; + } + } + + return spg; + +out_fput: + fput(spg->file); +out_idr: + idr_remove(&sp_group_idr, spg_id); +out_kfree: + kfree(spg); + return ERR_PTR(ret); +} + +static void __sp_area_drop_locked(struct sp_area *spa); + +/* The caller must hold sp_mutex. 
*/ +static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) +{ + struct sp_area *spa, *prev = NULL; + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &mm->sp_group->spa_list, link) { + if (&spa->link == stop) + break; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + err = do_munmap(mm, spa->va_start, spa_size(spa), NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap task areas\n", + (void *)spa->va_start); + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + + spin_unlock(&sp_area_lock); + up_write(&mm->mmap_sem); + mmput(mm); +} + +/** + * sp_group_add_task - add a process to an sp_group + * @pid: the pid of the task to be added + * @spg_id: the ID of the sp_group + * + * A thread group can't be added to more than one sp_group. + * + * Return: The manually allocated ID is between [SPG_ID_MIN, SPG_ID_MAX] + * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX] + * When negative, the return value is -errno. + */ +int sp_group_add_task(int pid, int spg_id) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + int ret = 0; + struct sp_area *spa, *prev = NULL; + struct sp_proc_stat *stat; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if ((spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) + && spg_id != SPG_ID_DVPP_PASS_THROUGH) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed due to invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (!spg_valid(spg)) { + mutex_unlock(&sp_mutex); + pr_err("share pool: task add group failed because group id %d hasn't been create or dead\n", + spg_id); + return -EINVAL; + } + mutex_unlock(&sp_mutex); + } + + if (spg_id == SPG_ID_AUTO) { + spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, + SPG_ID_AUTO_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed\n"); + return spg_id; + } + } + + if (spg_id == SPG_ID_DVPP_PASS_THROUGH) { + spg_id = ida_alloc_range(&sp_group_id_ida, + SPG_ID_DVPP_PASS_THROUGH_MIN, + SPG_ID_DVPP_PASS_THROUGH_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed" + "in DVPP pass through\n"); + return spg_id; + } + } + + mutex_lock(&sp_mutex); + + rcu_read_lock(); + + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else if (tsk->mm->sp_group) /* if it's already in a sp_group */ + ret = -EEXIST; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_unlock; + + spg = find_or_alloc_sp_group(spg_id); + if (IS_ERR(spg) || !spg_valid(spg)) { + ret = PTR_ERR(spg); + goto out_put_task; + } + /* access control permission check */ + if (sysctl_ac_mode == AC_SINGLE_OWNER) { + if (spg->owner != current->group_leader) { + ret = -EPERM; + goto out_put_task; + } + } + + /* per process statistics initialization */ + stat = sp_init_proc_stat(tsk); + if (IS_ERR(stat)) { + ret = PTR_ERR(stat); + pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + goto out_put_task; + } + + mm = tsk->mm; + mm->sp_group = 
spg; + atomic_inc(&spg->use_count); + list_add_tail(&tsk->mm->sp_node, &spg->procs); + /* + * create mappings of existing shared memory segments into this + * new process' page table. + */ + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &spg->spa_list, link) { + unsigned long populate = 0; + struct file *file = spa_file(spa); + void *p; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + p = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p) && (PTR_ERR(p) != -ESPGMMEXIT)) { + sp_munmap_task_areas(mm, &spa->link); + ret = PTR_ERR(p); + pr_err("share pool: task add group sp mmap failed, ret %d\n", ret); + spin_lock(&sp_area_lock); + break; + } + + if (PTR_ERR(p) == -ESPGMMEXIT) { + pr_err("share pool: task add group sp mmap failed, ret -ESPGMEXIT\n"); + spin_lock(&sp_area_lock); + ret = -ESPGMMEXIT; + break; + } + + if (populate) { + ret = do_mm_populate(mm, spa->va_start, populate, 0); + if (ret) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed when mm populate failed: %d\n", + ret); + sp_munmap_task_areas(mm, spa->link.next); + } + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + spin_unlock(&sp_area_lock); + + if (unlikely(ret)) { + idr_remove(&sp_stat_idr, mm->sp_stat_id); + kfree(stat); + } + +out_put_task: + put_task_struct(tsk); +out_unlock: + mutex_unlock(&sp_mutex); + return ret == 0 ? spg_id : ret; +} +EXPORT_SYMBOL_GPL(sp_group_add_task); + +static void spg_exit_lock(bool *unlock) +{ + switch (mutex_trylock_recursive(&sp_mutex)) { + case MUTEX_TRYLOCK_RECURSIVE: + *unlock = false; + break; + case MUTEX_TRYLOCK_FAILED: + mutex_lock(&sp_mutex); + *unlock = true; + break; + case MUTEX_TRYLOCK_SUCCESS: + *unlock = true; + break; + default: + BUG(); + } +} + +static void spg_exit_unlock(bool unlock) +{ + if (unlock) + mutex_unlock(&sp_mutex); +} + +/* + * Do cleanup when a process exits. + */ +void sp_group_exit(struct mm_struct *mm) +{ + bool is_alive = true; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + /* + * Nothing to do if this thread group doesn't belong to any sp_group. + * No need to protect this check with lock because we can add a task + * to a group if !PF_EXITING. + */ + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + if (list_is_singular(&mm->sp_group->procs)) + is_alive = mm->sp_group->is_alive = false; + list_del(&mm->sp_node); + spg_exit_unlock(unlock); + + /* + * To avoid calling this with sp_mutex held, we first mark the + * sp_group as dead and then send the notification and then do + * the real cleanup in sp_group_post_exit(). + */ + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, + mm->sp_group); +} + +void sp_group_post_exit(struct mm_struct *mm) +{ + bool is_alive; + struct sp_proc_stat *stat; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + is_alive = mm->sp_group->is_alive; + + /* pointer stat must be valid, we don't need to check sanity */ + stat = idr_find(&sp_stat_idr, mm->sp_stat_id); + /* + * There are two basic scenarios when a process in the share pool is + * exiting but its share pool memory usage is not 0. + * 1. Process A called sp_alloc(), but it terminates without calling + * sp_free(). Then its share pool memory usage is a positive number. + * 2. Process A never called sp_alloc(), and process B in the same spg + * called sp_alloc() to get an addr u. 
Then A gets u somehow and + * called sp_free(u). Now A's share pool memory usage is a negative + * number. Notice B's memory usage will be a positive number. + * + * We decide to print a info when seeing both of the scenarios. + */ + if (stat && stat->amount != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB\n", + stat->comm, mm->sp_stat_id, + mm->sp_group->id, byte2kb(stat->amount)); + + idr_remove(&sp_stat_idr, mm->sp_stat_id); + + if (atomic_dec_and_test(&mm->sp_group->use_count)) { + BUG_ON(is_alive); + free_sp_group(mm->sp_group); + } + spg_exit_unlock(unlock); + + kfree(stat); +} + +/* the caller must hold sp_area_lock */ +static void __insert_sp_area(struct sp_area *spa) +{ + struct rb_node **p = &sp_area_root.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct sp_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct sp_area, rb_node); + if (spa->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (spa->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&spa->rb_node, parent, p); + rb_insert_color(&spa->rb_node, &sp_area_root); +} + +/* + * Allocate a region of VA from the share pool. + * @size - the size of VA to allocate + * + * The caller must hold must sp_mutex when input parameter spg is not NULL + * + * Return NULL if fail. + */ +static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, + struct sp_group *spg, enum spa_type type) +{ + struct sp_area *spa; + struct rb_node *n; + unsigned long vstart = MMAP_SHARE_POOL_START; + unsigned long vend = MMAP_SHARE_POOL_16G_START; + unsigned long addr; + unsigned long size_align = ALIGN(size, 1 << 21); /* align to 2M */ + + if ((flags & SP_DVPP)) { + if (host_svm_sp_enable == false) { + vstart = MMAP_SHARE_POOL_16G_START; + vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; + } else { + vstart = spg->dvpp_va_start; + vend = spg->dvpp_va_start + spg->dvpp_size; + } + } + + addr = vstart; + + if (!sysctl_share_pool_hugepage_enable) + flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); + if (unlikely(!spa)) { + if (printk_ratelimit()) + pr_err("share pool: alloc spa failed due to lack of memory\n"); + return NULL; + } + + spin_lock(&sp_area_lock); + + n = sp_area_root.rb_node; + if (n) { + struct sp_area *first = NULL; + + do { + struct sp_area *tmp; + tmp = rb_entry(n, struct sp_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size_align) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + + while (addr + size_align >= first->va_start && + addr + size_align <= vend) { + addr = first->va_end; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + } +found: + if (addr + size_align > vend) { + goto error; + } + + spa->va_start = addr; + spa->va_end = addr + size_align; + spa->real_size = size; + spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->spg = spg; + atomic_set(&spa->use_count, 1); + spa->type = type; + + if (spa_inc_usage(type, size)) + goto error; + + __insert_sp_area(spa); + if (spa->spg) { + atomic_inc(&spg->spa_num); + atomic_add(size, &spg->size); + atomic_inc(&spg_stat.spa_total_num); + atomic_add(size, &spg_stat.spa_total_size); + 
list_add_tail(&spa->link, &spg->spa_list); + } + spin_unlock(&sp_area_lock); + + return spa; + +error: + spin_unlock(&sp_area_lock); + kfree(spa); + return NULL; +} + +/* the caller should hold sp_area_lock */ +static struct sp_area *__find_sp_area_locked(unsigned long addr) +{ + struct rb_node *n = sp_area_root.rb_node; + + while (n) { + struct sp_area *spa; + + spa = rb_entry(n, struct sp_area, rb_node); + if (addr < spa->va_start) { + n = n->rb_left; + } else if (addr > spa->va_start) { + n = n->rb_right; + } else { + return spa; + } + } + + return NULL; +} + +static struct sp_area *__find_sp_area(unsigned long addr) +{ + struct sp_area *n; + spin_lock(&sp_area_lock); + n = __find_sp_area_locked(addr); + if (n) + atomic_inc(&n->use_count); + spin_unlock(&sp_area_lock); + return n; +} + +/* + * Free the VA region starting from addr to the share pool + */ +static void sp_free_area(struct sp_area *spa) +{ + lockdep_assert_held(&sp_area_lock); + + spa_dec_usage(spa->type, spa->real_size); /* won't fail */ + if (spa->spg) { + atomic_dec(&spa->spg->spa_num); + atomic_sub(spa->real_size, &spa->spg->size); + atomic_dec(&spg_stat.spa_total_num); + atomic_sub(spa->real_size, &spg_stat.spa_total_size); + list_del(&spa->link); + } + rb_erase(&spa->rb_node, &sp_area_root); + RB_CLEAR_NODE(&spa->rb_node); + kfree(spa); +} + +static void __sp_area_drop_locked(struct sp_area *spa) +{ + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma(). Before A calls this func, + * B calls sp_free() to free the same spa. So spa maybe NULL when A + * calls this func later. + */ + if (!spa) + return; + + if (atomic_dec_and_test(&spa->use_count)) + sp_free_area(spa); +} + +static void __sp_area_drop(struct sp_area *spa) +{ + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +void sp_area_drop(struct vm_area_struct *vma) +{ + struct sp_area *spa; + + if (!sp_check_vm_share_pool(vma->vm_flags)) + return; + + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma() -> ... -> sp_area_drop(). + * Concurrently, B is calling sp_free() to free the same spa. + * __find_sp_area_locked() and __sp_area_drop_locked() should be + * an atomic operation. + */ + spin_lock(&sp_area_lock); + spa = __find_sp_area_locked(vma->vm_start); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +/* The caller must hold sp_mutex. */ +static void sp_munmap(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + + err = do_munmap(mm, addr, size, NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when sp munmap\n", (void *)addr); + } + + up_write(&mm->mmap_sem); + mmput(mm); +} + +/* The caller must hold sp_mutex. */ +static void __sp_free(struct sp_group *spg, unsigned long addr, + unsigned long size, struct mm_struct *stop) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (mm == stop) + break; + sp_munmap(mm, addr, size); + } +} + +/* + * Free the memory allocated by sp_alloc() + * @addr - the starting VA of the memory + * + * Return fail if the memory can't be found or was not allocted by share pool. 
+ */ +int sp_free(unsigned long addr) +{ + struct sp_area *spa; + struct sp_proc_stat *stat; + int mode; + loff_t offset; + int ret = 0; + + mutex_lock(&sp_mutex); + + /* + * Access control: a share pool addr can only be freed by another task + * in the same spg or a kthread (such as buff_module_guard_work) + */ + spa = __find_sp_area(addr); + if (spa) { + if (current->mm != NULL) { + if (current->mm->sp_group != spa->spg) { + ret = -EPERM; + goto drop_spa; + } + } + } else { /* spa == NULL */ + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: sp_free invalid input addr %pK\n", (void *)addr); + goto out; + } + + if (!spg_valid(spa->spg)) + goto drop_spa; + + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); + + /* Free the memory of the backing shmem or hugetlbfs */ + mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + offset = addr - MMAP_SHARE_POOL_START; + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + pr_err("share pool: fallocate failed: %d\n", ret); + + /* pointer stat may be invalid because of kthread buff_module_guard_work */ + if (current->mm == NULL) { + kthread_stat.amount -= spa->real_size; + } else { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount -= spa->real_size; + else + BUG(); + } + +drop_spa: + __sp_area_drop(spa); +out: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_free); + +/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_sem). */ +static unsigned long __sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr = spa->va_start; + unsigned long size = spa_size(spa); + unsigned long prot = PROT_READ | PROT_WRITE; + unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_LOCKED | + MAP_POPULATE | MAP_SHARE_POOL; + unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; + unsigned long pgoff = (addr - MMAP_SHARE_POOL_START) >> PAGE_SHIFT; + + atomic_inc(&spa->use_count); + addr = __do_mmap(mm, file, addr, size, prot, flags, vm_flags, pgoff, + populate, NULL); + if (IS_ERR_VALUE(addr)) { + atomic_dec(&spa->use_count); + pr_err("share pool: do_mmap fails %ld\n", addr); + } + + return addr; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr; + + if (!mmget_not_zero(mm)) + return ERR_PTR(-ESPGMMEXIT); + down_write(&mm->mmap_sem); + addr = __sp_mmap(mm, file, spa, populate); + up_write(&mm->mmap_sem); + mmput(mm); + + if (IS_ERR_VALUE(addr)) + return ERR_PTR(addr); + + BUG_ON(addr != spa->va_start); + return (void *)addr; +} + +/** + * Allocate shared memory for all the processes in the same sp_group + * size - the size of memory to allocate + * sp_flags - how to allocate the memory + * spg_id - the share group that the memory is allocated to. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + struct sp_group *spg = NULL; + struct sp_area *spa = NULL; + struct sp_proc_stat *stat; + unsigned long sp_addr; + void *p_mmap, *p = ERR_PTR(-ENODEV); + struct mm_struct *mm; + struct file *file; + unsigned long size_aligned; + int ret = 0; + struct mm_struct *tmp; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid group id %d\n", spg_id); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE | SP_DVPP)) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid flag %lu\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & SP_HUGEPAGE_ONLY) + sp_flags |= SP_HUGEPAGE; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); + mutex_unlock(&sp_mutex); + if (!spg) { /* DVPP pass through scene: first call sp_alloc() */ + /* mdc scene hack */ + if (enable_mdc_default_group) + ret = sp_group_add_task(current->tgid, spg_id); + else + ret = sp_group_add_task(current->tgid, + SPG_ID_DVPP_PASS_THROUGH); + /* + * The multi-thread contention may cause repeated joins to the group. + * The judgment is added to prevent exit in this case. + */ + if (ret < 0 && (ret != -EEXIST)) { + pr_err("share pool: allocation failed due to add group error %d in DVPP pass through scenario", + ret); + p = ERR_PTR(ret); + goto out; + } + mutex_lock(&sp_mutex); + spg = current->mm->sp_group; + } else { /* other scenes */ + mutex_lock(&sp_mutex); + if (spg_id != SPG_ID_DEFAULT) { + /* the caller should be a member of the sp group */ + if (spg != idr_find(&sp_group_idr, spg_id)) + goto out; + } + } + + if (!spg_valid(spg)) { + pr_err("share pool: sp alloc failed, spg is invalid\n"); + goto out; + } + + if (!sysctl_share_pool_hugepage_enable) + sp_flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + if (sp_flags & SP_HUGEPAGE) { + file = spg->file_hugetlb; + size_aligned = ALIGN(size, PMD_SIZE); + } else { + file = spg->file; + size_aligned = ALIGN(size, PAGE_SIZE); + } +try_again: + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); + if (!spa) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to alloc spa failure\n"); + p = ERR_PTR(-ENOMEM); + goto out; + } + sp_addr = spa->va_start; + + /* create mapping for each process in the group */ + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + unsigned long populate = 0; + struct vm_area_struct *vma; + + p_mmap = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p_mmap) && (PTR_ERR(p_mmap) != -ESPGMMEXIT)) { + p = p_mmap; + __sp_free(spg, sp_addr, size_aligned, mm); + pr_err("share pool: allocation sp mmap failed, ret %ld\n", PTR_ERR(p_mmap)); + break; + } + + if (PTR_ERR(p_mmap) == -ESPGMMEXIT) { + pr_info("share pool: allocation sp mmap failed, ret -ESPGMMEXIT\n"); + continue; + } + + p = p_mmap; /* success */ + if (populate == 0) + continue; + + if (!mmget_not_zero(mm)) + continue; + down_write(&mm->mmap_sem); + vma = find_vma(mm, sp_addr); + if (unlikely(!vma)) { + pr_err("share pool: allocation failed due to find %pK vma failure\n", + (void *)sp_addr); + p = ERR_PTR(-EINVAL); + up_write(&mm->mmap_sem); + mmput(mm); + goto out; + } + /* clean PTE_RDONLY flags or trigger SMMU event */ + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + 
up_write(&mm->mmap_sem); + /* + * We are not ignoring errors, so if we fail to allocate + * physical memory we just return failure, so we won't encounter + * page fault later on, and more importantly sp_make_share_u2k() + * depends on this feature (and MAP_LOCKED) to work correctly. + */ + ret = do_mm_populate(mm, sp_addr, populate, 0); + if (ret) { + __sp_free(spg, sp_addr, size_aligned, + list_next_entry(mm, sp_node)); + + if (file == spg->file_hugetlb) { + spg->hugepage_failures++; + + /* fallback to small pages */ + if (!(sp_flags & SP_HUGEPAGE_ONLY)) { + file = spg->file; + spa->is_hugepage = false; + size_aligned = ALIGN(size, PAGE_SIZE); + __sp_area_drop(spa); + mmput(mm); + goto try_again; + } + } + + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to mm populate failed: %d\n", + ret); + p = ERR_PTR(ret); + mmput(mm); + break; + } + mmput(mm); + } + + if (!IS_ERR(p)) { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount += size_aligned; + } + +out: + mutex_unlock(&sp_mutex); + + /* this will free spa if mmap failed */ + if (spa) + __sp_area_drop(spa); + + return p; +} +EXPORT_SYMBOL_GPL(sp_alloc); + +static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn; + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + else + pfn = virt_to_pfn(kva); + + return pfn; +} + +/* + * return value: >0 means this is a hugepage addr + * =0 means a normal addr. <0 means an errno. + */ +static int is_vmap_hugepage(unsigned long addr) +{ + struct vm_struct *area; + + if (unlikely(!addr)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when judge vmap addr\n"); + return -EINVAL; + } + + area = find_vm_area((void *)addr); + if (unlikely(!area)) { + if (printk_ratelimit()) + pr_err("share pool: failed to find vm area(%lx)\n", addr); + return -EINVAL; + } + + if (area->flags & VM_HUGE_PAGES) + return 1; + else + return 0; +} + +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long ret_addr; + unsigned long populate = 0; + unsigned long addr, buf, offset; + struct file *file = NULL; + int ret = 0; + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + if (spa->is_hugepage) { + file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(file)) { + pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + } + + if (!mmget_not_zero(mm)) { + ret_addr = -ESPGMMEXIT; + goto put_file; + } + down_write(&mm->mmap_sem); + + ret_addr = __sp_mmap(mm, file, spa, &populate); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: k2u mmap failed %lx\n", ret_addr); + goto out; + } + BUG_ON(ret_addr != spa->va_start); + + vma = find_vma(mm, ret_addr); + BUG_ON(vma == NULL); + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + + if (is_vm_hugetlb_page(vma)) { + ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); + if (ret) { + pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); + ret_addr = ret; + goto out; + } + } else { + buf = ret_addr; + addr = kva; + offset = 0; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + ret_addr = ret; + goto out; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while 
(offset < spa_size(spa)); + } + +out: + up_write(&mm->mmap_sem); + mmput(mm); +put_file: + if (file) + fput(file); + + return ret_addr; +} + +static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, + int pid) +{ + struct task_struct *tsk; + unsigned long ret_addr; + void *p = ERR_PTR(-ENODEV); + int ret = 0; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + return ERR_PTR(ret); + + ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr); + sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); + p = ERR_PTR(ret_addr); + goto out; + } + + p = (void *)ret_addr; +out: + put_task_struct(tsk); + return p; +} + +static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, + struct sp_group *spg) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + unsigned long ret_addr = -ENODEV; + unsigned long uva = -ENODEV; + void *p = ERR_PTR(-ENODEV); + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + ret_addr = sp_remap_kva_to_vma(kva, spa, mm); + if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { + pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); + __sp_free(spg, spa->va_start, spa_size(spa), + list_next_entry(mm, sp_node)); + p = ERR_PTR(ret_addr); + goto out; + } + + if (ret_addr == -ESPGMMEXIT) { + pr_info("share pool: remap k2u, ret is -ESPGMMEXIT\n"); + continue; + } + + uva = ret_addr; + } + p = (void *)uva; +out: + return p; +} + +/** + * Share kernel memory to a specified process or sp_group + * @kva: the VA of shared kernel memory + * @size: the size of shared kernel memory + * @sp_flags: how to allocate the memory. We only support SP_DVPP. + * @pid: the pid of the specified process + * @spg_id: currently, only support default value(SPG_ID_DEFAULT) and other values + * are useless. + * + * Return: the shared target user address to start at + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + void *uva = ERR_PTR(-ENODEV); + struct sp_group *spg; + struct sp_area *spa; + unsigned long kva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + int ret; + + if (sp_flags & ~SP_DVPP) { + if (printk_ratelimit()) + pr_err("share pool: k2u sp_flags %lu error\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + sp_flags |= SP_HUGEPAGE; + page_size = PMD_SIZE; + } else if (ret == 0) { + /* do nothing */ + } else { + return ERR_PTR(ret); + } + /* aligned down kva is convenient for caller to start with any valid kva */ + kva_aligned = ALIGN_DOWN(kva, page_size); + size_aligned = ALIGN(kva + size, page_size) - kva_aligned; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, spg_id); + if (spg == NULL) { + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); + mutex_unlock(&sp_mutex); + } else if (spg_valid(spg)) { + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + mutex_unlock(&sp_mutex); + } else { + mutex_unlock(&sp_mutex); + pr_err("share pool: failed to make k2u\n"); + return NULL; + } + + if (!IS_ERR(uva)) + uva = uva + (kva - kva_aligned); + + __sp_area_drop(spa); + return uva; +} +EXPORT_SYMBOL_GPL(sp_make_share_k2u); + +static int sp_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page = pte_page(*pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(*pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +static int sp_test_walk(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* + * FIXME: The devmm driver uses remap_pfn_range() but actually there + * are associated struct pages, so they should use vm_map_pages() or + * similar APIs. Before the driver has been converted to correct APIs + * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as + * normal VMAs. 
+ */ + return 0; +} + +static int sp_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + if (printk_ratelimit()) + pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", (void *)start, (void *)end); + return -EFAULT; +} + +static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = huge_ptep_get(ptep); + struct page *page = pte_page(pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +/** + * the caller must hold mm->mmap_sem + * + * Notes for parameter alignment: + * When size == 0, let it be page_size, so that at least one page is walked. + * + * When size > 0, for convenience, usually the parameters of uva and + * size are not page aligned. There are four different alignment scenarios and + * we must handler all of them correctly. + * + * The basic idea is to align down uva and align up size so all the pages + * in range [uva, uva + size) are walked. However, there are special cases. + * + * Considering a 2M-hugepage addr scenario. Assuming the caller wants to + * traverse range [1001M, 1004.5M), so uva and size is 1001M and 3.5M + * accordingly. The aligned-down uva is 1000M and the aligned-up size is 4M. + * The traverse range will be [1000M, 1004M). Obviously, the final page for + * [1004M, 1004.5M) is not covered. + * + * To fix this problem, we need to walk an additional page, size should be + * ALIGN(uva+size) - uva_aligned + */ +static int __sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + struct vm_area_struct *vma; + unsigned long page_nr; + struct page **pages = NULL; + bool is_hugepage = false; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + struct mm_walk sp_walk = {}; + + /* + * Here we also support non share pool memory in this interface + * because the caller can't distinguish whether a uva is from the + * share pool or not. It is not the best idea to do so, but currently + * it simplifies overall design. + * + * In this situation, the correctness of the parameters is mainly + * guaranteed by the caller. 
+ */ + vma = find_vma(tsk->mm, uva); + if (!vma) { + if (printk_ratelimit()) + pr_err("share pool: u2k input uva %pK is invalid\n", (void *)uva); + return -EINVAL; + } + if ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma)) + is_hugepage = true; + + sp_walk.pte_hole = sp_pte_hole; + sp_walk.test_walk = sp_test_walk; + if (is_hugepage) { + sp_walk_data->is_hugepage = true; + sp_walk.hugetlb_entry = sp_hugetlb_entry; + page_size = PMD_SIZE; + } else { + sp_walk_data->is_hugepage = false; + sp_walk.pte_entry = sp_pte_entry; + } + + sp_walk_data->page_size = page_size; + uva_aligned = ALIGN_DOWN(uva, page_size); + sp_walk_data->uva_aligned = uva_aligned; + if (size == 0) + size_aligned = page_size; + else + /* special alignment handling */ + size_aligned = ALIGN(uva + size, page_size) - uva_aligned; + + if (uva_aligned + size_aligned < uva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in walk page range\n"); + return -EINVAL; + } + + page_nr = size_aligned / page_size; + pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + if (printk_ratelimit()) + pr_err("share pool: alloc page array failed in walk page range\n"); + return -ENOMEM; + } + sp_walk_data->pages = pages; + + sp_walk.mm = tsk->mm; + sp_walk.private = sp_walk_data; + + ret = walk_page_range(uva_aligned, uva_aligned + size_aligned, + &sp_walk); + if (ret) + kvfree(pages); + + return ret; +} + +/** + * Share user memory of a specified process to kernel + * @uva: the VA of shared user memory + * @size: the size of shared user memory + * @pid: the pid of the specified process + * + * Return: if success, return the starting kernel address of the shared memory. + * if failed, return the pointer of -errno. + */ +void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + int ret = 0; + struct task_struct *tsk; + void *p = ERR_PTR(-ENODEV); + struct sp_walk_data sp_walk_data = { + .page_count = 0, + }; + struct vm_struct *area; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) { + p = ERR_PTR(ret); + goto out; + } + + if (!mmget_not_zero(tsk->mm)) + goto out_put_task; + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, &sp_walk_data); + if (ret) { + pr_err("share pool: walk page range failed, ret %d\n", ret); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + p = ERR_PTR(ret); + goto out_put_task; + } + + if (sp_walk_data.is_hugepage) + p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count, + VM_MAP | VM_HUGE_PAGES, PAGE_KERNEL); + else + p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, + PAGE_KERNEL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + + if (!p) { + if (printk_ratelimit()) + pr_err("share pool: vmap(huge) in u2k failed\n"); + p = ERR_PTR(-ENOMEM); + goto out_free_pages; + } else { + p = p + (uva - sp_walk_data.uva_aligned); + } + + /* + * kva p may be used later in k2u. Since p comes from uva originally, + * it's reasonable to add flag VM_USERMAP so that p can be remapped + * into userspace again. 
+ */ + area = find_vm_area(p); + area->flags |= VM_USERMAP; + +out_free_pages: + kvfree(sp_walk_data.pages); +out_put_task: + put_task_struct(tsk); +out: + return p; +} +EXPORT_SYMBOL_GPL(sp_make_share_u2k); + +static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + struct task_struct *tsk; + struct sp_group *spg; + struct sp_area *spa; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size; + + mutex_lock(&sp_mutex); + /* + * at first we guess it's a hugepage addr + * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u + */ + spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE)); + if (!spa) { + spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); + if (!spa) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: invalid input uva %pK in unshare uva\n", (void *)uva); + goto out_unlock; + } + } + + /* + * 1. overflow actually won't happen due to an spa must be valid. + * 2. we must unshare [spa->va_start, spa->va_start + spa->real_size) completely + * because an spa is one-to-one correspondence with an vma. + * Thus input paramter size is not necessarily needed. + */ + page_size = (spa->is_hugepage ? PMD_SIZE : PAGE_SIZE); + uva_aligned = spa->va_start; + size_aligned = spa->real_size; + + if (size_aligned < ALIGN(size, page_size)) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", size); + goto out_drop_area; + } + + if (spg_id == SPG_ID_NONE) { + if (spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, SPG_ID_NONE is invalid\n"); + goto out_drop_area; + } + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_drop_area; + + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + goto out_drop_area; + } + down_write(&tsk->mm->mmap_sem); + ret = do_munmap(tsk->mm, uva_aligned, size_aligned, NULL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + if (ret) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap in unshare uva\n", + (void *)uva_aligned); + } + put_task_struct(tsk); + } else { + /* + * k2u to task, then unshare_uva(..., spg_id) is invalid due to potential + * spa memory leak. 
+ */ + if (!spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, sp group id %d is invalid\n", spg_id); + goto out_drop_area; + } + + spg = __sp_find_spg(pid, spg_id); + if (spg_valid(spg)) { + __sp_free(spg, uva_aligned, size_aligned, NULL); + } else { + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, doesn't belong to group %d\n", + spg_id); + ret = -EINVAL; + goto out_drop_area; + } else { + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + } + } + } + +out_drop_area: + __sp_area_drop(spa); +out_unlock: + mutex_unlock(&sp_mutex); + return ret; +} + +static int sp_unshare_kva(unsigned long kva, unsigned long size) +{ + unsigned long addr, kva_aligned; + struct page *page; + unsigned long size_aligned; + unsigned long step; + bool is_hugepage = true; + int ret; + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + kva_aligned = ALIGN_DOWN(kva, PMD_SIZE); + size_aligned = ALIGN(kva + size, PMD_SIZE) - kva_aligned; + step = PMD_SIZE; + } else if (ret == 0) { + kva_aligned = ALIGN_DOWN(kva, PAGE_SIZE); + size_aligned = ALIGN(kva + size, PAGE_SIZE) - kva_aligned; + step = PAGE_SIZE; + is_hugepage = false; + } else { + pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); + return -EINVAL; + } + + if (kva_aligned + size_aligned < kva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in unshare kva\n"); + return -EINVAL; + } + + for (addr = kva_aligned; addr < (kva_aligned + size_aligned); addr += step) { + if (is_hugepage) + page = vmalloc_to_hugepage((void *)addr); + else + page = vmalloc_to_page((void *)addr); + if (page) + put_page(page); + else + pr_err("share pool: vmalloc to hugepage failed\n"); + } + + vunmap((void *)kva_aligned); + + return 0; +} + +/** + * Unshare the kernel or user memory which shared by calling sp_make_share_{k2u,u2k}(). + * @va: the specified virtual address of memory + * @size: the size of unshared memory + * @pid: the pid of the specified process if the VA is user address + * @spg_id: the ID of the specified sp_group if the VA is user address + * + * Return -errno if fail. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + */ +int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + + if (va < TASK_SIZE) { + /* user address */ + ret = sp_unshare_uva(va, size, pid, spg_id); + } else if (va >= VA_START) { + /* kernel address */ + ret = sp_unshare_kva(va, size); + } else { + /* regard user and kernel address ranges as bad address */ + if (printk_ratelimit()) + pr_err("share pool: unshare addr %pK is not a user or kernel addr", (void *)va); + ret = -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sp_unshare); + +/** + * Return 0 when success. 
+ * When return value < 0, information in sp_walk_data is useless + */ +int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + + if (unlikely(!sp_walk_data)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when walk page range\n"); + return -EINVAL; + } + if (!tsk || (tsk->flags & PF_EXITING)) + return -ESRCH; + + sp_walk_data->page_count = 0; + + get_task_struct(tsk); + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + return -EINVAL; + } + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, sp_walk_data); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + put_task_struct(tsk); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_walk_page_range); + +void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + struct page *page; + unsigned int i = 0; + + if (!sp_walk_data) + return; + + while (i < sp_walk_data->page_count) { + page = sp_walk_data->pages[i++]; + put_page(page); + } + + kvfree(sp_walk_data->pages); +} +EXPORT_SYMBOL_GPL(sp_walk_page_free); + +/** + * Walk the mm_struct of processes in the specified sp_group + * and call CALLBACK once for each mm_struct. + * @spg_id: the ID of the specified sp_group + * @data: the param for callback function + * @func: caller specific callback function + * + * Return -errno if fail. + */ +int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)) +{ + struct sp_group *spg; + int ret = -ESRCH; + + if (!func) { + if (printk_ratelimit()) + pr_err("share pool: null func pointer\n"); + return -EINVAL; + } + + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (spg_valid(spg)) { + struct mm_struct *mm; + struct mm_struct *tmp; + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (func) { + ret = func(mm, data); + if (ret) + goto out_unlock; + } + } + } +out_unlock: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_group_walk); + +int sp_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_register_notifier); + +int sp_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_unregister_notifier); + +/** + * user can config the share pool start addrese of each Da-vinci device + * @start: the value of share pool start + * @size: the value of share pool + * @device_id: the num of Da-vinci device + * @pid: the pid of device process + * + * Return false if parameter invalid of has been set up. + */ +bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + struct sp_group *spg; + + if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || + size > MMAP_SHARE_POOL_16G_SIZE) + return false; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg_valid(spg) || spg->dvpp_multi_spaces == true) { + mutex_unlock(&sp_mutex); + return false; + } + spg->dvpp_va_start = start; + spg->dvpp_size = size; + spg->dvpp_multi_spaces = true; + host_svm_sp_enable = true; + mutex_unlock(&sp_mutex); + + return true; +} +EXPORT_SYMBOL_GPL(sp_config_dvpp_range); + +/* Check whether the address belongs to the share pool. 
*/ +bool is_sharepool_addr(unsigned long addr) +{ + if (host_svm_sp_enable == false) + return (addr >= MMAP_SHARE_POOL_START) && + addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; +} +EXPORT_SYMBOL_GPL(is_sharepool_addr); + +static int __init mdc_default_group(char *s) +{ + enable_mdc_default_group = 1; + return 1; +} +__setup("enable_mdc_default_group", mdc_default_group); + +int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct sp_group *spg = NULL; + struct sp_proc_stat *stat; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) { + /* print the file header */ + stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); + if (!stat) { + mutex_unlock(&sp_mutex); + return 0; + } + seq_printf(m, "%-10s %-18s %-15s\n", + "Group ID", "Aligned Apply(KB)", "HugePage Fails"); + seq_printf(m, "%-10d %-18ld %-15d\n", + spg->id, byte2kb(stat->amount), spg->hugepage_failures); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int idr_proc_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg; + struct sp_proc_stat *stat = p; + struct seq_file *seq = data; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (spg) { + seq_printf(seq, "%-12d %-10d %-18ld\n", + id, spg->id, byte2kb(stat->amount)); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int proc_stat_show(struct seq_file *seq, void *offset) +{ + /* print the file header */ + seq_printf(seq, "%-12s %-10s %-18s\n", + "Process ID", "Group ID", "Aligned Apply(KB)"); + /* print kthread buff_module_guard_work */ + seq_printf(seq, "%-12s %-10s %-18ld\n", + "guard", "-", byte2kb(kthread_stat.amount)); + idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + return 0; +} + +static void rb_spa_stat_show(struct seq_file *seq) +{ + struct rb_node *node; + struct sp_area *spa; + + spin_lock(&sp_area_lock); + + for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + spa = rb_entry(node, struct sp_area, rb_node); + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + mutex_lock(&sp_mutex); + if (spg_valid(spa->spg)) + seq_printf(seq, "%-10d ", spa->spg->id); + else /* k2u for task or spg is dead */ + seq_printf(seq, "%-10s ", "None"); + mutex_unlock(&sp_mutex); + + seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", + "0x", spa->va_start, + "0x", spa->va_end, + byte2kb(spa->real_size)); + + switch (spa->type) { + case SPA_TYPE_ALLOC: + seq_printf(seq, "%-7s ", "ALLOC"); + break; + case SPA_TYPE_K2TASK: + seq_printf(seq, "%-7s ", "TASK"); + break; + case SPA_TYPE_K2SPG: + seq_printf(seq, "%-7s ", "SPG"); + break; + default: + /* usually impossible, perhaps a developer's mistake */ + break; + } + + if (spa->is_hugepage) + seq_printf(seq, "%-5s ", "Y"); + else + seq_printf(seq, "%-5s ", "N"); + + seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count)); + + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + } + + spin_unlock(&sp_area_lock); +} + +static void spa_overview_show(struct seq_file *seq) +{ + unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; + unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; + + spin_lock(&sp_area_lock); + total_num = spa_stat.total_num; + alloc_num = spa_stat.alloc_num; + k2u_task_num = spa_stat.k2u_task_num; + k2u_spg_num = spa_stat.k2u_spg_num; + total_size = spa_stat.total_size; + alloc_size = spa_stat.alloc_size; + k2u_task_size = 
spa_stat.k2u_task_size; + k2u_spg_size = spa_stat.k2u_spg_size; + spin_unlock(&sp_area_lock); + + seq_printf(seq, "Spa total num %u.\n", total_num); + seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); + seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "\n"); +} + +/* the caller must hold sp_mutex */ +static int idr_spg_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + + seq_printf(seq, "Group %-10d size: %13d KB, spa num: %d.\n", + id, byte2kb(atomic_read(&spg->size)), + atomic_read(&spg->spa_num)); + + return 0; +} + +static void spg_overview_show(struct seq_file *seq) +{ + mutex_lock(&sp_mutex); + idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); + mutex_unlock(&sp_mutex); + seq_printf(seq, "Share pool total size: %13d KB, spa total num: %d.\n\n", + byte2kb(atomic_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); +} + +static int spa_stat_show(struct seq_file *seq, void *offset) +{ + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-10s %-16s %-16s %-13s %-7s %-5s %-10s\n", + "Group ID", "va_start", "va_end", "Aligned KB", "Type", "Huge", "Ref"); + rb_spa_stat_show(seq); + return 0; +} + +/* + * Called by proc_root_init() to initialize the /proc/sharepool subtree + */ +void __init proc_sharepool_init(void) +{ + if (!proc_mkdir("sharepool", NULL)) + return; + + proc_create_single_data("sharepool/proc_stat", 0, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL); +} + + +struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node) +{ + if (area->flags & VM_HUGE_PAGES) + return hugetlb_alloc_hugepage(NUMA_NO_NODE); + else + return alloc_pages_node(node, mask, page_order); +} + +int enable_ascend_share_pool; + +static int __init enable_share_pool(char *s) +{ + enable_ascend_share_pool = 1; + + pr_info("Ascend enable share pool features\n"); + + return 1; +} +__setup("enable_ascend_share_pool", enable_share_pool);
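For context on how the interfaces above fit together, here is a minimal allocation sketch, assuming an in-tree caller and a header (linux/share_pool.h, name assumed) that declares the exported sp_* functions and the SPG_ID_*/SP_* constants used in this patch; the function and its error handling are illustrative only, not part of the series:

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/share_pool.h>   /* assumed header for the exported sp_* interface */

static int sp_alloc_example(void)
{
        int spg_id;
        void *addr;

        /* join an automatically numbered share pool group */
        spg_id = sp_group_add_task(current->tgid, SPG_ID_AUTO);
        if (spg_id < 0)
                return spg_id;

        /* 4MB of hugepage-backed memory, mapped into every group member */
        addr = sp_alloc(4 * 1024 * 1024, SP_HUGEPAGE, spg_id);
        if (IS_ERR(addr))
                return PTR_ERR(addr);

        /* ... the buffer is now usable by all processes in the group ... */

        return sp_free((unsigned long)addr);
}

On top of that, the feature is gated at build time by CONFIG_ASCEND_SHARE_POOL (turned on for hulk_defconfig in the last patch of this series) and at boot time by the enable_ascend_share_pool command-line parameter registered by the __setup() call at the end of this patch.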
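A similar sketch for sharing kernel memory into user space with sp_make_share_k2u() and undoing it with sp_unshare(); the vmalloc()-backed buffer and the assumption that the target task is not in any sp_group (so the K2TASK path is taken) are illustrative choices, not requirements of the interface:

#include <linux/err.h>
#include <linux/vmalloc.h>
#include <linux/share_pool.h>   /* assumed header, as above */

static int sp_k2u_example(int pid, unsigned long size)
{
        void *kbuf, *uva;
        int ret;

        kbuf = vmalloc(size);
        if (!kbuf)
                return -ENOMEM;

        /*
         * The target task is assumed not to be in any sp_group, so the
         * mapping is created for that single task (SPA_TYPE_K2TASK).
         */
        uva = sp_make_share_k2u((unsigned long)kbuf, size, 0, pid, SPG_ID_DEFAULT);
        if (IS_ERR(uva)) {
                vfree(kbuf);
                return PTR_ERR(uva);
        }

        /* ... user space works on the buffer through uva ... */

        /* SPG_ID_NONE matches the task-local sharing done above */
        ret = sp_unshare((unsigned long)uva, size, pid, SPG_ID_NONE);
        vfree(kbuf);
        return ret;
}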
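The reverse direction, mapping user memory of a target process into the kernel with sp_make_share_u2k(), is sketched below; releasing the returned kernel address goes through the sp_unshare_kva() branch of sp_unshare() because the address lies above VA_START (same assumed header as above):

#include <linux/err.h>
#include <linux/share_pool.h>   /* assumed header, as above */

static int sp_u2k_example(unsigned long uva, unsigned long size, int pid)
{
        void *kva;

        kva = sp_make_share_u2k(uva, size, pid);
        if (IS_ERR(kva))
                return PTR_ERR(kva);

        /* ... the kernel can now access the user buffer through kva ... */

        /* a kernel address takes the sp_unshare_kva() branch; pid/spg_id are not used there */
        return sp_unshare((unsigned long)kva, size, pid, SPG_ID_NONE);
}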
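Finally, a sketch of sp_group_walk(), whose callback is invoked once per member mm of the group; the callback shown here merely counts members and is purely illustrative:

#include <linux/mm_types.h>
#include <linux/share_pool.h>   /* assumed header, as above */

/* callback: invoked once per member mm; a non-zero return stops the walk */
static int sp_count_member(struct mm_struct *mm, void *data)
{
        (*(int *)data)++;
        return 0;
}

static int sp_group_size_example(int spg_id)
{
        int count = 0;
        int ret;

        ret = sp_group_walk(spg_id, &count, sp_count_member);
        if (ret)        /* -ESRCH if the group is invalid or has no members */
                return ret;
        return count;
}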
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
Enable CONFIG_ASCEND_SHARE_POOL by default for hulk_defconfig.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/configs/hulk_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 4b259b7561eb..eee186ec38c4 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -481,6 +481,7 @@ CONFIG_ASCEND_IOPF_HIPRI=y CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE=y CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y +CONFIG_ASCEND_SHARE_POOL=y CONFIG_ARM64_CNP=y
#