From: Wang Wensheng <wangwensheng4@huawei.com>
ascend inclusion
category: Feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4NDAW
CVE: NA
-------------------
This shares a kernel memory range with userspace. Introduce the vm_struct
flag VM_SHAREPOOL to indicate that a vm_struct is shared with userspace,
so that such a vm_area cannot be vfreed.
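A minimal usage sketch from the kernel side (illustrative only: the buffer
size, the 0 sp_flags value and the current-task IDs are placeholders; the
constants come from the existing share_pool interface):

    /* share a 2MB vmalloc'ed buffer with the calling process */
    void *kva = vmalloc(SZ_2M);
    void *uva;

    uva = sp_make_share_k2u((unsigned long)kva, SZ_2M, 0 /* sp_flags */,
                            current->tgid, SPG_ID_NONE);
    if (IS_ERR(uva))
        return PTR_ERR(uva);

    /*
     * The vm_struct behind kva now carries VM_SHAREPOOL, so a vfree(kva)
     * at this point is refused by __vunmap() with a WARN, until the
     * share is torn down and the flag is cleared again.
     */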
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/share_pool.h |   8 +
 include/linux/vmalloc.h    |   5 +
 kernel/sysctl.c            |  19 ++
 mm/hugetlb.c               |   8 +
 mm/share_pool.c            | 404 ++++++++++++++++++++++++++++++++++++-
 mm/vmalloc.c               |   7 +
 6 files changed, 450 insertions(+), 1 deletion(-)
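Both new knobs land in vm_table, so with the standard procfs layout they
appear under /proc/sys/vm/. A plausible tuning session (values are
examples only):

    echo 1   > /proc/sys/vm/share_pool_map_lock_enable  # mmap sharepool areas MAP_LOCKED
    echo 100 > /proc/sys/vm/sharepool_perf_k2u          # report k2u calls taking >= 100us

share_pool_map_lock_enable is bounded to 0..1; sharepool_perf_k2u accepts
0 (disabled) up to ten_thousand and is compared against the k2u call's
duration in microseconds in trace_sp_k2u_finish().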
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 7e7ced34be57..6ec844708f83 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -288,6 +288,14 @@ static inline void sp_dump_stack(void)
 		dump_stack();
 }
 
+static inline bool is_vmalloc_sharepool(unsigned long vm_flags)
+{
+	if (sp_is_enabled() && (vm_flags & VM_SHAREPOOL))
+		return true;
+
+	return false;
+}
+
 #else /* CONFIG_ASCEND_SHARE_POOL */
 static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)

diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 8f9e13944cc7..49c94afce25b 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -28,6 +28,11 @@ struct notifier_block;		/* in notifier.h */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
 #define VM_NO_HUGE_VMAP		0x00000400	/* force PAGE_SIZE pte mapping */
 #define VM_HUGE_PAGES		0x00001000	/* used for vmalloc hugepages */
+#ifdef CONFIG_ASCEND_SHARE_POOL
+#define VM_SHAREPOOL		0x00002000	/* remapped to sharepool */
+#else
+#define VM_SHAREPOOL		0
+#endif
 
 /*
  * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC.

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c7073b652b0c..9d242841ca3f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3287,6 +3287,25 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero_ul,
 		.extra2		= &sysctl_sp_compact_interval_max,
 	},
+	{
+		/* 0: map_unlock, 1: map_lock */
+		.procname	= "share_pool_map_lock_enable",
+		.data		= &sysctl_share_pool_map_lock_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "sharepool_perf_k2u",
+		.data		= &sysctl_sp_perf_k2u,
+		.maxlen		= sizeof(sysctl_sp_perf_k2u),
+		.mode		= 0600,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= &ten_thousand,
+	},
 #endif
 	{ }
 };

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eaddb18d58e1..74d23542f9f2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -31,6 +31,7 @@
 #include <linux/llist.h>
 #include <linux/cma.h>
 #include <linux/mman.h>
+#include <linux/share_pool.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -4110,6 +4111,13 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 		pte = huge_ptep_get_and_clear(mm, address, ptep);
 		tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
+
+		/* sharepool k2u mapped pages are marked special */
+		if (sp_check_vm_share_pool(vma->vm_flags) && pte_special(pte)) {
+			spin_unlock(ptl);
+			continue;
+		}
+
 		if (huge_pte_dirty(pte))
 			set_page_dirty(page);
diff --git a/mm/share_pool.c b/mm/share_pool.c
index bb134dd7ffc4..3c935d256b08 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -87,6 +87,10 @@ static int __read_mostly enable_share_k2u_spg = 1;
 /* debug mode */
 int sysctl_sp_debug_mode;
 
+int sysctl_share_pool_map_lock_enable;
+
+int sysctl_sp_perf_k2u;
+
 static int share_pool_group_mode = SINGLE_GROUP_MODE;
 
 static int system_group_count;
@@ -632,6 +636,13 @@ static inline void check_interrupt_context(void)
 		panic("function can't be used in interrupt context\n");
 }
 
+static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
+			     struct sp_area *spa, unsigned long *populate,
+			     unsigned long prot);
+static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size);
+static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
+					 struct mm_struct *mm, unsigned long prot);
+
 static void free_sp_group_id(int spg_id)
 {
 	/* ida operation is protected by an internal spin_lock */
@@ -1206,6 +1217,19 @@ static struct sp_area *__find_sp_area(unsigned long addr)
 	return n;
 }
 
+static bool vmalloc_area_clr_flag(unsigned long kva, unsigned long flags)
+{
+	struct vm_struct *area;
+
+	area = find_vm_area((void *)kva);
+	if (area) {
+		area->flags &= ~flags;
+		return true;
+	}
+
+	return false;
+}
+
 /*
  * Free the VA region starting from addr to the share pool
  */
@@ -1235,6 +1259,9 @@ static void sp_free_area(struct sp_area *spa)
 		}
 	}
 
+	if (spa->kva && !vmalloc_area_clr_flag(spa->kva, VM_SHAREPOOL))
+		pr_debug("clear spa->kva %ld is not valid\n", spa->kva);
+
 	spa_dec_usage(spa);
 	if (spa->spg != spg_none)
 		list_del(&spa->link);
@@ -1542,6 +1569,37 @@ int mg_sp_free(unsigned long addr)
 }
 EXPORT_SYMBOL_GPL(mg_sp_free);
 
+/* wrapper of __do_mmap(); the caller must hold down_write(&mm->mmap_lock) */
+static unsigned long sp_mmap(struct mm_struct *mm, struct file *file,
+			     struct sp_area *spa, unsigned long *populate,
+			     unsigned long prot)
+{
+	unsigned long addr = spa->va_start;
+	unsigned long size = spa_size(spa);
+	unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_POPULATE |
+			      MAP_SHARE_POOL;
+	unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY;
+	unsigned long pgoff = addr_offset(spa) >> PAGE_SHIFT;
+
+	/* Mark the mapped region to be locked. When MAP_LOCKED is enabled,
+	 * multiple tasks will contend for resources, causing performance loss.
+	 */
+	if (sysctl_share_pool_map_lock_enable)
+		flags |= MAP_LOCKED;
+
+	atomic_inc(&spa->use_count);
+	addr = __do_mmap_mm(mm, file, addr, size, prot, flags, vm_flags, pgoff,
+			    populate, NULL);
+	if (IS_ERR_VALUE(addr)) {
+		atomic_dec(&spa->use_count);
+		pr_err("do_mmap fails %ld\n", addr);
+	} else {
+		BUG_ON(addr != spa->va_start);
+	}
+
+	return addr;
+}
+
 /**
  * sp_alloc() - Allocate shared memory for all the processes in a sp_group.
  * @size: the size of memory to allocate.
@@ -1596,6 +1654,314 @@ static int is_vmap_hugepage(unsigned long addr)
 	return 0;
 }
 
+static unsigned long __sp_remap_get_pfn(unsigned long kva)
+{
+	unsigned long pfn;
+
+	if (is_vmalloc_addr((void *)kva))
+		pfn = vmalloc_to_pfn((void *)kva);
+	else
+		pfn = virt_to_pfn(kva);
+
+	return pfn;
+}
+
+/* when called by k2u to group, always make sure rw_lock of spg is down */
+static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa,
+					 struct mm_struct *mm, unsigned long prot)
+{
+	struct vm_area_struct *vma;
+	unsigned long ret_addr;
+	unsigned long populate = 0;
+	int ret = 0;
+	unsigned long addr, buf, offset;
+
+	down_write(&mm->mmap_lock);
+	if (unlikely(mm->core_state)) {
+		pr_err("k2u mmap: encountered coredump, abort\n");
+		ret_addr = -EBUSY;
+		goto put_mm;
+	}
+
+	ret_addr = sp_mmap(mm, spa_file(spa), spa, &populate, prot);
+	if (IS_ERR_VALUE(ret_addr)) {
+		pr_debug("k2u mmap failed %lx\n", ret_addr);
+		goto put_mm;
+	}
+	BUG_ON(ret_addr != spa->va_start);
+
+	vma = find_vma(mm, ret_addr);
+	BUG_ON(vma == NULL);
+	if (prot & PROT_WRITE)
+		vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
+
+	if (is_vm_hugetlb_page(vma)) {
+		ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0);
+		if (ret) {
+			do_munmap(mm, ret_addr, spa_size(spa), NULL);
+			pr_debug("remap vmalloc hugepage failed, ret %d, kva is %lx\n",
+				 ret, (unsigned long)kva);
+			ret_addr = ret;
+			goto put_mm;
+		}
+		vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
+	} else {
+		buf = ret_addr;
+		addr = kva;
+		offset = 0;
+		do {
+			ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE,
+					      __pgprot(vma->vm_page_prot.pgprot));
+			if (ret) {
+				do_munmap(mm, ret_addr, spa_size(spa), NULL);
+				pr_err("remap_pfn_range failed %d\n", ret);
+				ret_addr = ret;
+				goto put_mm;
+			}
+			offset += PAGE_SIZE;
+			buf += PAGE_SIZE;
+			addr += PAGE_SIZE;
+		} while (offset < spa_size(spa));
+	}
+
+put_mm:
+	up_write(&mm->mmap_lock);
+
+	return ret_addr;
+}
+
+/**
+ * sp_make_share_kva_to_task() - Share kernel memory to the current task.
+ * @kva: the VA of shared kernel memory
+ * @size: the size of the area to share; should be properly aligned
+ * @sp_flags: the flags for the operation
+ *
+ * Return:
+ * * on success, the shared user address the mapping starts at.
+ * * on failure, an ERR_PTR encoding -errno.
+ */
+static void *sp_make_share_kva_to_task(unsigned long kva, unsigned long size, unsigned long sp_flags)
+{
+	void *uva;
+	struct sp_area *spa;
+	struct spg_proc_stat *stat;
+	unsigned long prot = PROT_READ | PROT_WRITE;
+
+	down_write(&sp_group_sem);
+	stat = sp_init_process_stat(current, current->mm, spg_none);
+	up_write(&sp_group_sem);
+	if (IS_ERR(stat)) {
+		pr_err_ratelimited("k2u_task init process stat failed %lx\n",
+				   PTR_ERR(stat));
+		return stat;
+	}
+
+	spa = sp_alloc_area(size, sp_flags, spg_none, SPA_TYPE_K2TASK, current->tgid);
+	if (IS_ERR(spa)) {
+		pr_err_ratelimited("alloc spa failed in k2u_task (potential no enough virtual memory when -75): %ld\n",
+				   PTR_ERR(spa));
+		return spa;
+	}
+
+	spa->kva = kva;
+
+	uva = (void *)sp_remap_kva_to_vma(kva, spa, current->mm, prot);
+	__sp_area_drop(spa);
+	if (IS_ERR(uva))
+		pr_err("remap k2u to task failed %ld\n", PTR_ERR(uva));
+	else {
+		update_spg_proc_stat(size, true, stat, SPA_TYPE_K2TASK);
+		spa->mm = current->mm;
+	}
+
+	return uva;
+}
+
+/**
+ * sp_make_share_kva_to_spg() - Share kernel memory to an spg; the current process must be in that group.
+ * @kva: the VA of shared kernel memory
+ * @size: the size of the area to share; should be properly aligned
+ * @sp_flags: the flags for the operation
+ * @spg: the sp group to be shared with
+ *
+ * Return: the shared user address to start at
+ */
+static void *sp_make_share_kva_to_spg(unsigned long kva, unsigned long size,
+				      unsigned long sp_flags, struct sp_group *spg)
+{
+	struct sp_area *spa;
+	struct mm_struct *mm;
+	struct sp_group_node *spg_node;
+	void *uva = ERR_PTR(-ENODEV);
+
+	down_read(&spg->rw_lock);
+	spa = sp_alloc_area(size, sp_flags, spg, SPA_TYPE_K2SPG, current->tgid);
+	if (IS_ERR(spa)) {
+		up_read(&spg->rw_lock);
+		pr_err_ratelimited("alloc spa failed in k2u_spg (potential no enough virtual memory when -75): %ld\n",
+				   PTR_ERR(spa));
+		return spa;
+	}
+
+	spa->kva = kva;
+
+	list_for_each_entry(spg_node, &spg->procs, proc_node) {
+		mm = spg_node->master->mm;
+		uva = (void *)sp_remap_kva_to_vma(kva, spa, mm, spg_node->prot);
+		if (IS_ERR(uva)) {
+			pr_err("remap k2u to spg failed %ld\n", PTR_ERR(uva));
+			__sp_free(spg, spa->va_start, spa_size(spa), mm);
+			goto out;
+		}
+	}
+
+out:
+	up_read(&spg->rw_lock);
+	__sp_area_drop(spa);
+	if (!IS_ERR(uva))
+		sp_update_process_stat(current, true, spa);
+
+	return uva;
+}
+
+static bool vmalloc_area_set_flag(unsigned long kva, unsigned long flags)
+{
+	struct vm_struct *area;
+
+	area = find_vm_area((void *)kva);
+	if (area) {
+		area->flags |= flags;
+		return true;
+	}
+
+	return false;
+}
+
+struct sp_k2u_context {
+	unsigned long kva;
+	unsigned long kva_aligned;
+	unsigned long size;
+	unsigned long size_aligned;
+	unsigned long sp_flags;
+	int spg_id;
+	bool to_task;
+	struct timespec64 start;
+	struct timespec64 end;
+};
+
+static void trace_sp_k2u_begin(struct sp_k2u_context *kc)
+{
+	if (!sysctl_sp_perf_k2u)
+		return;
+
+	ktime_get_ts64(&kc->start);
+}
+
+static void trace_sp_k2u_finish(struct sp_k2u_context *kc, void *uva)
+{
+	unsigned long cost;
+
+	if (!sysctl_sp_perf_k2u)
+		return;
+
+	ktime_get_ts64(&kc->end);
+
+	cost = SEC2US(kc->end.tv_sec - kc->start.tv_sec) +
+	       NS2US(kc->end.tv_nsec - kc->start.tv_nsec);
+	if (cost >= (unsigned long)sysctl_sp_perf_k2u) {
+		pr_err("Task %s(%d/%d) sp_k2u returns 0x%lx consumes %luus, size is %luKB, size_aligned is %luKB, sp_flags is %lx, to_task is %d\n",
+		       current->comm, current->tgid, current->pid,
+		       (unsigned long)uva, cost, byte2kb(kc->size), byte2kb(kc->size_aligned),
+		       kc->sp_flags, kc->to_task);
+	}
+}
+
+static int sp_k2u_prepare(unsigned long kva, unsigned long size,
+			  unsigned long sp_flags, int spg_id, struct sp_k2u_context *kc)
+{
+	int is_hugepage;
+	unsigned int page_size = PAGE_SIZE;
+	unsigned long kva_aligned, size_aligned;
+
+	trace_sp_k2u_begin(kc);
+
+	if (sp_flags & ~SP_DVPP) {
+		pr_err_ratelimited("k2u sp_flags %lx error\n", sp_flags);
+		return -EINVAL;
+	}
+
+	if (!current->mm) {
+		pr_err_ratelimited("k2u: kthread is not allowed\n");
+		return -EPERM;
+	}
+
+	is_hugepage = is_vmap_hugepage(kva);
+	if (is_hugepage > 0) {
+		sp_flags |= SP_HUGEPAGE;
+		page_size = PMD_SIZE;
+	} else if (is_hugepage == 0) {
+		/* do nothing */
+	} else {
+		pr_err_ratelimited("k2u kva is not vmalloc address\n");
+		return is_hugepage;
+	}
+
+	/* align kva down so the caller can start from any valid kva */
+	kva_aligned = ALIGN_DOWN(kva, page_size);
+	size_aligned = ALIGN(kva + size, page_size) - kva_aligned;
+
+	if (!vmalloc_area_set_flag(kva_aligned, VM_SHAREPOOL)) {
+		pr_debug("k2u_task kva %lx is not valid\n", kva_aligned);
+		return -EINVAL;
+	}
+
+	kc->kva = kva;
+	kc->kva_aligned = kva_aligned;
+	kc->size = size;
+	kc->size_aligned = size_aligned;
+	kc->sp_flags = sp_flags;
+	kc->spg_id = spg_id;
+	kc->to_task = false;
+	return 0;
+}
+
+static int sp_check_k2task(struct sp_k2u_context *kc)
+{
+	int ret = 0;
+	int spg_id = kc->spg_id;
+
+	if (share_pool_group_mode == SINGLE_GROUP_MODE) {
+		struct sp_group *spg = get_first_group(current->mm);
+
+		if (!spg) {
+			if (spg_id != SPG_ID_NONE && spg_id != SPG_ID_DEFAULT)
+				ret = -EINVAL;
+			else
+				kc->to_task = true;
+		} else {
+			if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id)
+				ret = -EINVAL;
+			sp_group_drop(spg);
+		}
+	} else {
+		if (spg_id == SPG_ID_DEFAULT || spg_id == SPG_ID_NONE)
+			kc->to_task = true;
+	}
+	return ret;
+}
+
+static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc)
+{
+	if (IS_ERR(uva))
+		vmalloc_area_clr_flag(kc->kva_aligned, VM_SHAREPOOL);
+	else
+		uva = uva + (kc->kva - kc->kva_aligned);
+
+	trace_sp_k2u_finish(kc, uva);
+	sp_dump_stack();
+	return uva;
+}
+
 /**
  * sp_make_share_k2u() - Share kernel memory to current process or an sp_group.
  * @kva: the VA of shared kernel memory.
@@ -1616,7 +1982,43 @@ static int is_vmap_hugepage(unsigned long addr)
 void *sp_make_share_k2u(unsigned long kva, unsigned long size,
 			unsigned long sp_flags, int pid, int spg_id)
 {
-	return NULL;
+	void *uva;
+	int ret;
+	struct sp_k2u_context kc;
+
+	check_interrupt_context();
+
+	ret = sp_k2u_prepare(kva, size, sp_flags, spg_id, &kc);
+	if (ret)
+		return ERR_PTR(ret);
+
+	ret = sp_check_k2task(&kc);
+	if (ret) {
+		uva = ERR_PTR(ret);
+		goto out;
+	}
+
+	if (kc.to_task)
+		uva = sp_make_share_kva_to_task(kc.kva_aligned, kc.size_aligned, kc.sp_flags);
+	else {
+		struct sp_group *spg;
+
+		spg = __sp_find_spg(current->pid, kc.spg_id);
+		if (spg) {
+			ret = sp_check_caller_permission(spg, current->mm);
+			if (ret < 0) {
+				sp_group_drop(spg);
+				uva = ERR_PTR(ret);
+				goto out;
+			}
+			uva = sp_make_share_kva_to_spg(kc.kva_aligned, kc.size_aligned, kc.sp_flags, spg);
+			sp_group_drop(spg);
+		} else
+			uva = ERR_PTR(-ENODEV);
+	}
+
+out:
+	return sp_k2u_finish(uva, &kc);
 }
 EXPORT_SYMBOL_GPL(sp_make_share_k2u);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 755be0c19c81..dadbea29241d 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -37,6 +37,7 @@
 #include <linux/pgtable.h>
 #include <linux/uaccess.h>
 #include <linux/hugetlb.h>
+#include <linux/share_pool.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/shmparam.h>
@@ -2622,6 +2623,12 @@ static void __vunmap(const void *addr, int deallocate_pages)
 		return;
 	}
 
+	/* unmapping a sharepool vm area would cause a memory leak! */
+	if (is_vmalloc_sharepool(area->flags)) {
+		WARN(1, "Memory leak due to vfree() sharepool vm area (%p) !\n", addr);
+		return;
+	}
+
 	debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
 	debug_check_no_obj_freed(area->addr, get_vm_area_size(area));