From: Ding Tianhong <dingtianhong@huawei.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
This is a preparation patch for share pool. It exports new functions to allocate huge pages through vmalloc and to vmap those huge pages into virtually contiguous space.
The new header file share_pool.h is mainly used for the share pool feature: it exposes the sp_xxx interfaces when the ascend_share_pool config is enabled and turns them into no-op stubs by default. An illustrative usage sketch of the new interfaces follows the diffstat below.
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/share_pool.h | 354 +++++++++++++++++++++++++++++++++++++
 include/linux/vmalloc.h    |   9 +
 mm/vmalloc.c               | 237 ++++++++++++++++++++++++-
 3 files changed, 596 insertions(+), 4 deletions(-)
 create mode 100644 include/linux/share_pool.h
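
Usage sketch for reviewers (illustrative only, not part of this patch): a hypothetical driver hands a hugepage-backed buffer to userspace with the new helpers. All demo_* names and the misc device are made up for the example, and the userspace mapping must start PMD-aligned for remap_vmalloc_hugepage_range() to accept it.

#include <linux/module.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_buf;
static unsigned long demo_size;

/* mmap handler: map the whole zeroed, hugepage-backed buffer at offset 0. */
static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	if (vma->vm_pgoff || vma->vm_end - vma->vm_start > demo_size)
		return -EINVAL;

	return remap_vmalloc_hugepage_range(vma, demo_buf, 0);
}

static const struct file_operations demo_fops = {
	.owner	= THIS_MODULE,
	.mmap	= demo_mmap,
};

static struct miscdevice demo_dev = {
	.minor	= MISC_DYNAMIC_MINOR,
	.name	= "sp_hugepage_demo",
	.fops	= &demo_fops,
};

static int __init demo_init(void)
{
	int ret;

	demo_size = 4 * PMD_SIZE;

	/* Zeroed and flagged VM_USERMAP, so it may be exposed to userspace. */
	demo_buf = vmalloc_hugepage_user(demo_size);
	if (!demo_buf)
		return -ENOMEM;

	ret = misc_register(&demo_dev);
	if (ret)
		vfree(demo_buf);
	return ret;
}

static void __exit demo_exit(void)
{
	misc_deregister(&demo_dev);
	vfree(demo_buf);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

From userspace, mmap()ing the demo device at a PMD-aligned address would then give a shared view of the hugepage-backed buffer.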
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h new file mode 100644 index 000000000000..09afbae33d41 --- /dev/null +++ b/include/linux/share_pool.h @@ -0,0 +1,354 @@ +#ifndef LINUX_SHARE_POOL_H +#define LINUX_SHARE_POOL_H + +#include <linux/mman.h> +#include <linux/mm_types.h> +#include <linux/notifier.h> +#include <linux/vmalloc.h> + +#define SP_HUGEPAGE (1 << 0) +#define SP_HUGEPAGE_ONLY (1 << 1) +#define SP_DVPP (1 << 2) + +#define SPG_ID_NONE -1 /* not associated with sp_group, only for specified thread */ +#define SPG_ID_DEFAULT 0 /* use the spg id of current thread */ +#define SPG_ID_MIN 1 /* valid id should be >= 1 */ +#define SPG_ID_MAX 99999 +#define SPG_ID_AUTO_MIN 100000 +#define SPG_ID_AUTO_MAX 199999 +#define SPG_ID_AUTO 200000 /* generate group id automatically */ +#define SPG_ID_DVPP_PASS_THROUGH_MIN 800000 +#define SPG_ID_DVPP_PASS_THROUGH_MAX 899999 +#define SPG_ID_DVPP_PASS_THROUGH 900000 + +#define MAX_DEVID 1 /* the max num of Da-vinci devices */ + +#define VM_HUGE_PAGES 0x00001000 /* use for huge pages */ + +/* to align the pointer to the (next) PMD boundary */ +#define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE) + +/* test whether an address (unsigned long or pointer) is aligned to PMD_SIZE */ +#define PMD_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PMD_SIZE) + +extern int sysctl_share_pool_hugepage_enable; + +extern int sysctl_ac_mode; + +extern int enable_ascend_share_pool; + +/* Processes in the same sp_group can share memory. + * Memory layout for share pool: + * + * |-------------------- 8T -------------------|---|------ 8T ------------| + * | Device 0 | Device 1 |...| | + * |----------------------------------------------------------------------| + * |- 16G -|- 16G -|- 16G -|- 16G -| | | | | + * | DVPP GROUP0 | DVPP GROUP1 | ... | ... |...| sp normal memory | + * | svm | sp | svm | sp | | | | | + * |----------------------------------------------------------------------| + * + * The host SVM feature reserves 8T virtual memory by mmap, and due to the + * restriction of DVPP, while SVM and share pool will both allocate memory + * for DVPP, the memory have to be in the same 32G range. + * + * Share pool reserves 16T memory, with 8T for normal uses and 8T for DVPP. + * Within this 8T DVPP memory, SVM will call sp_config_dvpp_range() to + * tell us which 16G memory range is reserved for share pool . + * + * In some scenarios where there is no host SVM feature, share pool uses + * the default memory setting for DVPP. 
+ */ +struct sp_group { + int id; + struct file *file; + struct file *file_hugetlb; + /* list head of processes */ + struct list_head procs; + /* list of sp_area */ + struct list_head spa_list; + /* number of sp_area */ + atomic_t spa_num; + /* total size of all sp_area from sp_alloc and k2u(spg) */ + atomic_t size; + /* record the number of hugepage allocation failures */ + int hugepage_failures; + /* is_alive == false means it's being destroyed */ + bool is_alive; + /* we define the creator process of a sp_group as owner */ + struct task_struct *owner; + /* dvpp_multi_spaces == true means multiple dvpp 16G spaces are set */ + bool dvpp_multi_spaces; + unsigned long dvpp_va_start; + unsigned long dvpp_size; + atomic_t use_count; +}; + +struct sp_walk_data { + struct page **pages; + unsigned int page_count; + unsigned long uva_aligned; + unsigned long page_size; + bool is_hugepage; +}; + +#ifdef CONFIG_ASCEND_SHARE_POOL + +#define MAP_SHARE_POOL 0x100000 + +#define MMAP_TOP_4G_SIZE 0x100000000UL + +/* 8T size */ +#define MMAP_SHARE_POOL_NORMAL_SIZE 0x80000000000UL +/* 8T size*/ +#define MMAP_SHARE_POOL_DVPP_SIZE 0x80000000000UL +/* 16G size */ +#define MMAP_SHARE_POOL_16G_SIZE 0x400000000UL +#define MMAP_SHARE_POOL_SIZE (MMAP_SHARE_POOL_NORMAL_SIZE + MMAP_SHARE_POOL_DVPP_SIZE) +/* align to 2M hugepage size, and MMAP_SHARE_POOL_TOP_16G_START should be align to 16G */ +#define MMAP_SHARE_POOL_END ((TASK_SIZE - MMAP_SHARE_POOL_DVPP_SIZE) & ~((1 << 21) - 1)) +#define MMAP_SHARE_POOL_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_SIZE) +#define MMAP_SHARE_POOL_16G_START (MMAP_SHARE_POOL_END - MMAP_SHARE_POOL_DVPP_SIZE) + +static inline void sp_init_mm(struct mm_struct *mm) +{ + mm->sp_group = NULL; + INIT_LIST_HEAD(&mm->sp_node); + mm->sp_stat_id = 0; +} + +extern int sp_group_add_task(int pid, int spg_id); +extern void sp_group_exit(struct mm_struct *mm); +extern void sp_group_post_exit(struct mm_struct *mm); +extern int sp_group_id_by_pid(int pid); +extern int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)); +extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); + +extern void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id); +extern int sp_free(unsigned long addr); +extern void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id); +extern void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); +extern int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id); + +extern void sp_area_drop(struct vm_area_struct *vma); + +extern int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data); +extern void sp_walk_page_free(struct sp_walk_data *sp_walk_data); + +extern int sp_register_notifier(struct notifier_block *nb); +extern int sp_unregister_notifier(struct notifier_block *nb); +extern bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); +extern bool is_sharepool_addr(unsigned long addr); +extern void proc_sharepool_init(void); + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + if (enable_ascend_share_pool) + return mm->owner; + else + return current; +} + +static inline bool sp_check_hugepage(struct page *p) +{ + if (enable_ascend_share_pool && PageHuge(p)) + return true; + + return false; +} + +static inline bool sp_is_enabled(void) +{ + return enable_ascend_share_pool ? 
true : false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & VM_HUGE_PAGES)) + return true; + + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ + if (enable_ascend_share_pool) + info->high_limit = min(info->high_limit, MMAP_SHARE_POOL_START); +} + +extern struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node); + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ + if (PageHuge(page)) + put_page(page); + else + __free_pages(page, area->page_order); +} + +static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + if (enable_ascend_share_pool && (vm_flags & VM_SHARE_POOL)) + return true; + + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return !!(enable_ascend_share_pool && (vma->vm_flags & VM_HUGE_SPECIAL)); +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + if (enable_ascend_share_pool && (flags & MAP_SHARE_POOL)) + return true; + + return false; +} + +#else + +static inline int sp_group_add_task(int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_group_exit(struct mm_struct *mm) +{ +} + +static inline void sp_group_post_exit(struct mm_struct *mm) +{ +} + +static inline int sp_group_id_by_pid(int pid) +{ + return -EPERM; +} + +static inline int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + return -EPERM; +} + +static inline void *sp_alloc(unsigned long size, unsigned long sp_flags, int sp_id) +{ + return NULL; +} + +static inline int sp_free(unsigned long addr) +{ + return -EPERM; +} + +static inline void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + return NULL; +} + +static inline void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + return NULL; +} +static inline int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + return -EPERM; +} + +static inline void sp_init_mm(struct mm_struct *mm) +{ +} + +static inline void sp_area_drop(struct vm_area_struct *vma) +{ +} + +static inline int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + return 0; +} + +static inline void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ +} +static inline int sp_register_notifier(struct notifier_block *nb) +{ + return -EPERM; +} + +static inline int sp_unregister_notifier(struct notifier_block *nb) +{ + return -EPERM; +} +static inline bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + return false; +} + +static inline bool is_sharepool_addr(unsigned long addr) +{ + return false; +} + +static inline void proc_sharepool_init(void) +{ +} + +static inline struct task_struct *sp_get_task(struct mm_struct *mm) +{ + return current; +} +static inline bool sp_check_hugepage(struct page *p) +{ + return false; +} + +static inline bool sp_is_enabled(void) +{ + return false; +} + +static inline bool sp_check_vm_huge_page(unsigned long flags) +{ + return false; +} + +static inline void sp_area_work_around(struct vm_unmapped_area_info *info) +{ +} + +static inline struct page *sp_alloc_pages(void *area, gfp_t mask, + unsigned int page_order, int node) +{ + return NULL; +} + +static inline void sp_free_pages(struct page *page, struct vm_struct *area) +{ +} + 
+static inline bool sp_check_vm_share_pool(unsigned long vm_flags) +{ + return false; +} + +static inline bool is_vm_huge_special(struct vm_area_struct *vma) +{ + return false; +} + +static inline bool sp_mmap_check(unsigned long flags) +{ + return false; +} +#endif + +#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 07b4b1141ed8..244eedb7591a 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -95,6 +95,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, @@ -123,6 +125,13 @@ extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, void vmalloc_sync_mappings(void); void vmalloc_sync_unmappings(void);
+extern void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot); +extern int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, + unsigned long size); +extern int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, + void *addr, unsigned long pgoff); /* * Lowlevel-APIs (not for driver use!) */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2fec803edc90..78f56e719e1d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -33,6 +33,7 @@ #include <linux/bitops.h> #include <linux/rbtree_augmented.h> #include <linux/overflow.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/tlbflush.h> @@ -478,6 +479,37 @@ static int vmap_pages_range(unsigned long addr, unsigned long end, return err; }
+static int vmap_hugepages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + unsigned int i, nr = (end - addr) >> page_shift; + + for (i = 0; i < nr; i++) { + int err; + + err = vmap_range_noflush(addr, addr + (1UL << page_shift), + __pa(page_address(pages[i])), prot, + page_shift); + if (err) + return err; + + addr += 1UL << page_shift; + } + + return 0; +} + +static int vmap_hugepages_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + int err; + + err = vmap_hugepages_range_noflush(addr, end, prot, pages, page_shift); + flush_cache_vmap(addr, end); + return err; +} + /** * map_kernel_range_noflush - map kernel VM area with the specified pages * @addr: start of the VM area to map @@ -589,6 +621,22 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_page);
+/* + * Walk a hugepage vmap address to the struct page it maps. + * return the head page that corresponds to the base page address. + */ +struct page *vmalloc_to_hugepage(const void *vmalloc_addr) +{ + struct page *huge; + + huge = vmalloc_to_page(vmalloc_addr); + if (huge && PageHuge(huge)) + return huge; + else + return NULL; +} +EXPORT_SYMBOL(vmalloc_to_hugepage); + /* * Map a vmalloc()-space virtual address to the physical page frame number. */ @@ -2243,7 +2291,12 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + unsigned long align = 1; + + if (sp_check_vm_huge_page(flags)) + align = PMD_SIZE; + + return __get_vm_area_node(size, align, flags, VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); }
@@ -2327,7 +2380,10 @@ static void __vunmap(const void *addr, int deallocate_pages) struct page *page = area->pages[i];
BUG_ON(!page); - __free_pages(page, area->page_order); + if (sp_is_enabled()) + sp_free_pages(page, area); + else + __free_pages(page, area->page_order); }
kvfree(area->pages); @@ -2452,6 +2508,43 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap);
+/** + * vmap_hugepage - map an array of huge pages into virtually contiguous space + * @pages: array of huge page pointers + * @count: number of huge pages to map + * @flags: vm_area->flags + * @prot: page protection for the mapping + * + * Maps @count huge pages from @pages into contiguous kernel virtual + * space. + */ +void *vmap_hugepage(struct page **pages, unsigned int count, + unsigned long flags, pgprot_t prot) +{ + struct vm_struct *area; + unsigned long size; /* In bytes */ + + might_sleep(); + + if (count > totalram_pages) + return NULL; + + size = (unsigned long)count << PMD_SHIFT; + area = get_vm_area_caller(size, flags, __builtin_return_address(0)); + if (!area) + return NULL; + + if (vmap_hugepages_range((unsigned long)area->addr, + (unsigned long)area->addr + size, prot, + pages, PMD_SHIFT) < 0) { + vunmap(area->addr); + return NULL; + } + + return area->addr; +} +EXPORT_SYMBOL(vmap_hugepage); + static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); @@ -2494,7 +2587,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; int p;
- page = alloc_pages_node(node, alloc_mask|highmem_mask, page_order); + if (sp_is_enabled()) + page = sp_alloc_pages(area, alloc_mask|highmem_mask, + page_order, node); + else + page = alloc_pages_node(node, alloc_mask|highmem_mask, + page_order); if (unlikely(!page)) { /* Successfully allocated i pages, free them in __vunmap() */ area->nr_pages = i; @@ -2562,7 +2660,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, */
size_per_node = size; - if (node == NUMA_NO_NODE) + if (node == NUMA_NO_NODE && !sp_is_enabled()) size_per_node /= num_online_nodes(); if (size_per_node >= PMD_SIZE) { shift = PMD_SHIFT; @@ -2825,6 +2923,55 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically. + */ +void *vmalloc_hugepage(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically. + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + struct vm_struct *area; + void *ret; + + /* 2M hugepage aligned */ + size = PMD_ALIGN(size); + + ret = __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + NUMA_NO_NODE, __builtin_return_address(0)); + if (ret) { + area = find_vm_area(ret); + area->flags |= VM_USERMAP; + } + return ret; +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + + /* * small helper routine , copy contents to buf from addr. * If the page is not present, fill zero. @@ -3150,6 +3297,85 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, } EXPORT_SYMBOL(remap_vmalloc_range);
+/** + * remap_vmalloc_hugepage_range_partial - map vmalloc hugepages + * to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc hugepage kernel memory + * @size: size of map area + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range_partial(struct vm_area_struct *vma, + unsigned long uaddr, void *kaddr, unsigned long size) +{ + struct vm_struct *area; + + size = PMD_ALIGN(size); + + if (!PMD_ALIGNED(uaddr) || !PMD_ALIGNED(kaddr)) + return -EINVAL; + + area = find_vm_area(kaddr); + if (!area) + return -EINVAL; + + if (!(area->flags & VM_USERMAP)) + return -EINVAL; + + if (kaddr + size > area->addr + get_vm_area_size(area)) + return -EINVAL; + + do { + struct page *page = vmalloc_to_hugepage(kaddr); + int ret; + + ret = vm_insert_page(vma, uaddr, page); + if (ret) + return ret; + + uaddr += PMD_SIZE; + kaddr += PMD_SIZE; + size -= PMD_SIZE; + } while (size > 0); + + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + + return 0; +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range_partial); + +/** + * remap_vmalloc_hugepage_range - map vmalloc hugepages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of hugepages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_hugepage_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_hugepage_range_partial(vma, vma->vm_start, + addr + (pgoff << PMD_SHIFT), + vma->vm_end - vma->vm_start); +} +EXPORT_SYMBOL(remap_vmalloc_hugepage_range); + /* * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose * not to have one. @@ -3611,6 +3837,9 @@ static int s_show(struct seq_file *m, void *p) if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages");
+ if (sp_is_enabled()) + seq_printf(m, " order=%d", v->page_order); + show_numa_info(m, v); seq_putc(m, '\n'); return 0;
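
A second, kernel-only sketch (again illustrative, not part of the patch) shows how vmalloc_hugepage(), vmalloc_to_hugepage() and vmap_hugepage() are expected to combine. The demo_* names are assumptions; vmalloc_to_hugepage() is exported by this patch but not declared in vmalloc.h, so it is declared locally; and the round trip only yields huge pages when enable_ascend_share_pool is set, so that sp_alloc_pages() actually backs the area with hugetlb pages.

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/share_pool.h>

/* Exported from mm/vmalloc.c by this patch, but not declared in vmalloc.h. */
extern struct page *vmalloc_to_hugepage(const void *vmalloc_addr);

/* Build a second, virtually contiguous kernel mapping of a
 * hugepage-backed vmalloc area. Returns the alias or NULL.
 */
static void *demo_alias_hugepages(unsigned long size, void **orig)
{
	struct page **pages;
	unsigned int i, count;
	void *va, *alias = NULL;

	size = PMD_ALIGN(size);
	count = size >> PMD_SHIFT;

	va = vmalloc_hugepage(size);
	if (!va)
		return NULL;

	pages = kcalloc(count, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		goto out;

	/* Collect the hugetlb head page behind each 2M chunk. */
	for (i = 0; i < count; i++) {
		pages[i] = vmalloc_to_hugepage(va + i * PMD_SIZE);
		if (!pages[i])		/* area is not hugepage-backed */
			goto out_pages;
	}

	/* With the share pool enabled, VM_HUGE_PAGES makes
	 * get_vm_area_caller() use PMD alignment for the new area.
	 */
	alias = vmap_hugepage(pages, count, VM_MAP | VM_HUGE_PAGES, PAGE_KERNEL);

out_pages:
	kfree(pages);	/* the mapping does not keep the array */
out:
	if (!alias) {
		vfree(va);
		return NULL;
	}
	*orig = va;	/* caller later does vunmap(alias); vfree(*orig); */
	return alias;
}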