From: Ding Tianhong <dingtianhong@huawei.com>
ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
Share pool is a big feature. It is mainly used to share user virtual memory among different processes in the same group. A typical usage flow is:

1. Process A creates a new group, which is owned by process A.
2. Process A adds process B to the group.
3. Process A adds process C to the same group.
4. Process B allocates a new memory VA and writes something to it.
5. The VA is sent to process C via IPC, so process C receives it.
6. Process C accesses the VA and reads the data directly.
7. Process A may add more processes to the group to share the memory.
8. The memory is freed with the free function or by exiting the group.
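As a rough illustration of the flow above (not part of this patch: the prototypes come from <linux/share_pool.h>, which is introduced separately in this series, and example_share() below is a hypothetical caller; the real call sites are the Ascend device drivers):

#include <linux/err.h>
#include <linux/sizes.h>
#include <linux/share_pool.h>

/* hypothetical helper: share one buffer between tasks B and C */
static int example_share(int pid_b, int pid_c)
{
	int spg_id, ret;
	void *va;

	/* steps 1-3: create a group and add the member tasks */
	spg_id = sp_group_add_task(pid_b, SPG_ID_AUTO);
	if (spg_id < 0)
		return spg_id;
	ret = sp_group_add_task(pid_c, spg_id);
	if (ret < 0)
		return ret;

	/*
	 * step 4: allocate a VA that is mapped into every member.
	 * sp_alloc()/sp_free() run in the context of a group member,
	 * e.g. from an ioctl issued by process B.
	 */
	va = sp_alloc(SZ_2M, SP_HUGEPAGE, spg_id);
	if (IS_ERR(va))
		return PTR_ERR(va);

	/*
	 * steps 5-7: the VA can be passed to any member via IPC and
	 * dereferenced there directly; more tasks may be added later.
	 */

	/* step 8: release the memory again */
	return sp_free((unsigned long)va);
}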
The new feature is only enabled when both CONFIG_ASCEND_SHARE_POOL and the enable_ascend_share_pool boot flag are set; it does not affect anything when disabled.
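For reference, both switches have to be turned on, e.g. (the command line below is only an example; enable_mdc_default_group is optional and forces every group join/allocation onto the fixed MDC group id 1):

	CONFIG_ASCEND_SHARE_POOL=y

	# kernel command line
	... enable_ascend_share_pool enable_mdc_default_group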
Signed-off-by: Tang Yizhou <tangyizhou@huawei.com>
Signed-off-by: Li Ming <limingming.li@huawei.com>
Signed-off-by: Zefan Li <lizefan@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Wu Peng <wupeng58@huawei.com>
Signed-off-by: Ding Tianhong <dingtianhong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig |    9 +
 mm/Makefile        |    1 +
 mm/share_pool.c    | 2278 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2288 insertions(+)
 create mode 100644 mm/share_pool.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index e379fca3c8ad..be2f23c8fdf9 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1541,6 +1541,15 @@ config ASCEND_AUTO_TUNING_HUGEPAGE help The hugepage auto-tuning means the kernel dynamically manages the number of huage pages. To achieve this purpose, custom interfaces are required. + +config ASCEND_SHARE_POOL + bool "Enable support for the Share Pool Memory" + default n + select ARCH_USES_HIGH_VMA_FLAGS + select MM_OWNER + help + This feature allows multiple processes to share virtual memory both + in kernel and user level, which is only enabled for ascend platform. endif
endmenu diff --git a/mm/Makefile b/mm/Makefile index d71892c0bbf1..eb9545fbb20d 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -106,3 +106,4 @@ obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o obj-$(CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE) += hugepage_tuning.o +obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o diff --git a/mm/share_pool.c b/mm/share_pool.c new file mode 100644 index 000000000000..fcbc831f7f8c --- /dev/null +++ b/mm/share_pool.c @@ -0,0 +1,2278 @@ +/* + * Huawei Ascend Share Pool Memory + * + * Copyright (C) 2020 Huawei Limited + * Author: Tang Yizhou tangyizhou@huawei.com + * Zefan Li lizefan@huawei.com + * Wu Peng wupeng58@huawei.com + * Ding Tianhong dingtgianhong@huawei.com + * Zhou Guanghui zhouguanghui1@huawei.com + * Li Ming limingming.li@huawei.com + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/share_pool.h> +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/mm.h> +#include <linux/mm_types.h> +#include <linux/idr.h> +#include <linux/mutex.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/shmem_fs.h> +#include <linux/file.h> +#include <linux/printk.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/pid.h> +#include <linux/pid_namespace.h> +#include <linux/atomic.h> +#include <linux/lockdep.h> +#include <linux/kernel.h> +#include <linux/falloc.h> +#include <linux/types.h> +#include <linux/idr.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> + +/* access control mode macros */ +#define AC_NONE 0 +#define AC_SINGLE_OWNER 1 + +#define spg_valid(spg) ((spg) && ((spg)->is_alive == true)) +#define ESPGMMEXIT 4000 + +#define byte2kb(size) ((size) / 1024) + +/* mdc scene hack */ +int enable_mdc_default_group; +static const int mdc_default_group_id = 1; + +/* access control mode */ +int sysctl_ac_mode = AC_NONE; + +/* idr of all sp_groups */ +static DEFINE_IDR(sp_group_idr); + +static DEFINE_MUTEX(sp_mutex); + +static BLOCKING_NOTIFIER_HEAD(sp_notifier_chain); + +static DEFINE_IDA(sp_group_id_ida); + +/*** Statistical and maintenance tools ***/ + +/* idr of all sp_proc_stats */ +static DEFINE_IDR(sp_stat_idr); + +/* per process memory usage statistics indexed by tgid */ +struct sp_proc_stat { + char comm[TASK_COMM_LEN]; + /* + * alloc amount minus free amount, may be negative when freed by + * another task in the same sp group. + */ + long amount; +}; + +/* for kthread buff_module_guard_work */ +static struct sp_proc_stat kthread_stat = {0}; + +/* The caller must hold sp_mutex. 
*/ +static struct sp_proc_stat *sp_init_proc_stat(struct task_struct *tsk) +{ + struct sp_proc_stat *stat; + int id = tsk->mm->sp_stat_id; + int tgid = tsk->tgid; + int ret; + + if (id) { + stat = idr_find(&sp_stat_idr, id); + /* other threads in the same process may have initialized it */ + if (stat) + return stat; + } + + stat = kzalloc(sizeof(*stat), GFP_KERNEL); + if (stat == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc proc stat failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + + stat->amount = 0; + get_task_comm(stat->comm, tsk); + ret = idr_alloc(&sp_stat_idr, stat, tgid, tgid + 1, GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: proc stat idr alloc failed %d\n", ret); + kfree(stat); + return ERR_PTR(ret); + } + + tsk->mm->sp_stat_id = ret; + return stat; +} + +/* statistics of all sp area, protected by sp_area_lock */ +struct sp_spa_stat { + unsigned int total_num; + unsigned int alloc_num; + unsigned int k2u_task_num; + unsigned int k2u_spg_num; + unsigned long total_size; + unsigned long alloc_size; + unsigned long k2u_task_size; + unsigned long k2u_spg_size; +}; + +static struct sp_spa_stat spa_stat = {0}; + +/* statistics of all sp group born from sp_alloc and k2u(spg) */ +struct sp_spg_stat { + atomic_t spa_total_num; + atomic_t spa_total_size; +}; + +static struct sp_spg_stat spg_stat = {0}; + +/*** Global share pool VA allocator ***/ + +enum spa_type { + SPA_TYPE_ALLOC = 1, + SPA_TYPE_K2TASK, + SPA_TYPE_K2SPG, +}; + +/* + * We bump the reference when each mmap succeeds, and it will be dropped + * when vma is about to release, so sp_area object will be automatically + * freed when all tasks in the sp group has exited. + */ +struct sp_area { + unsigned long va_start; + unsigned long va_end; /* va_end always align to hugepage */ + unsigned long real_size; /* real size with alignment */ + bool is_hugepage; + atomic_t use_count; /* How many vmas use this VA region */ + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head link; /* link to the spg->head */ + struct sp_group *spg; + enum spa_type type; /* where spa born from */ +}; +static DEFINE_SPINLOCK(sp_area_lock); +static struct rb_root sp_area_root = RB_ROOT; +bool host_svm_sp_enable = false; + +int sysctl_share_pool_hugepage_enable = 1; + +static unsigned long spa_size(struct sp_area *spa) +{ + return spa->real_size; +} + +static struct file *spa_file(struct sp_area *spa) +{ + if (spa->is_hugepage) + return spa->spg->file_hugetlb; + else + return spa->spg->file; +} + +/* the caller should hold sp_area_lock */ +static int spa_inc_usage(enum spa_type type, unsigned long size) +{ + /* + * all the calculations won't overflow due to system limitation and + * parameter checking in sp_alloc_area() + */ + spa_stat.total_num += 1; + spa_stat.total_size += size; + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num += 1; + spa_stat.alloc_size += size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num += 1; + spa_stat.k2u_task_size += size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num += 1; + spa_stat.k2u_spg_size += size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + return -EINVAL; + } + return 0; +} + +/* the caller should hold sp_area_lock */ +static int spa_dec_usage(enum spa_type type, unsigned long size) +{ + switch (type) { + case SPA_TYPE_ALLOC: + spa_stat.alloc_num -= 1; + spa_stat.alloc_size -= size; + break; + case SPA_TYPE_K2TASK: + spa_stat.k2u_task_num -= 1; + spa_stat.k2u_task_size 
-= size; + break; + case SPA_TYPE_K2SPG: + spa_stat.k2u_spg_num -= 1; + spa_stat.k2u_spg_size -= size; + break; + default: + /* usually impossible, perhaps a developer's mistake */ + spin_unlock(&sp_area_lock); + return -EINVAL; + } + spa_stat.total_num -= 1; + spa_stat.total_size -= size; + return 0; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate); + +static void free_sp_group(struct sp_group *spg) +{ + fput(spg->file); + fput(spg->file_hugetlb); + idr_remove(&sp_group_idr, spg->id); + if ((spg->id >= SPG_ID_AUTO_MIN && spg->id <= SPG_ID_AUTO_MAX) || + (spg->id >= SPG_ID_DVPP_PASS_THROUGH_MIN && + spg->id <= SPG_ID_DVPP_PASS_THROUGH_MAX)) + ida_free(&sp_group_id_ida, (unsigned int)spg->id); + kfree(spg); +} + +/* The caller must hold sp_mutex. */ +static struct sp_group *__sp_find_spg(int pid, int spg_id) +{ + struct sp_group *spg; + int ret = 0; + + if (spg_id == SPG_ID_DEFAULT) { + struct task_struct *tsk; + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) + return NULL; + + spg = tsk->mm->sp_group; + put_task_struct(tsk); + } else { + spg = idr_find(&sp_group_idr, spg_id); + } + + return spg; +} + +int sp_group_id_by_pid(int pid) +{ + struct sp_group *spg; + int spg_id = -ENODEV; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) + spg_id = spg->id; + + mutex_unlock(&sp_mutex); + return spg_id; +} +EXPORT_SYMBOL_GPL(sp_group_id_by_pid); + +/* The caller must hold sp_mutex. */ +static struct sp_group *find_or_alloc_sp_group(int spg_id) +{ + struct sp_group *spg; + int ret; + char name[20]; + + spg = idr_find(&sp_group_idr, spg_id); + if (!spg) { + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + spg = kzalloc(sizeof(*spg), GFP_KERNEL); + if (spg == NULL) { + if (printk_ratelimit()) + pr_err("share pool: alloc spg failed due to lack of memory\n"); + return ERR_PTR(-ENOMEM); + } + spg->id = spg_id; + atomic_set(&spg->spa_num, 0); + atomic_set(&spg->size, 0); + spg->is_alive = true; + spg->hugepage_failures = 0; + spg->dvpp_multi_spaces = false; + spg->owner = current->group_leader; + atomic_set(&spg->use_count, 0); + INIT_LIST_HEAD(&spg->procs); + INIT_LIST_HEAD(&spg->spa_list); + + ret = idr_alloc(&sp_group_idr, spg, spg_id, spg_id+1, + GFP_KERNEL); + if (ret < 0) { + if (printk_ratelimit()) + pr_err("share pool: create group idr alloc failed\n"); + goto out_kfree; + } + + sprintf(name, "sp_group_%d", spg_id); + spg->file = shmem_kernel_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE); + if (IS_ERR(spg->file)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for small page failed %ld\n", + PTR_ERR(spg->file)); + ret = PTR_ERR(spg->file); + goto out_idr; + } + + spg->file_hugetlb = hugetlb_file_setup(name, MAX_LFS_FILESIZE, + VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(spg->file_hugetlb)) { + if (printk_ratelimit()) + pr_err("share pool: file setup for hugepage failed %ld\n", + PTR_ERR(spg->file_hugetlb)); + ret = PTR_ERR(spg->file_hugetlb); + goto out_fput; + } + } + + return spg; + +out_fput: + fput(spg->file); +out_idr: + idr_remove(&sp_group_idr, spg_id); +out_kfree: + kfree(spg); + return ERR_PTR(ret); +} + +static void __sp_area_drop_locked(struct sp_area *spa); + +/* The caller must hold sp_mutex. 
*/ +static void sp_munmap_task_areas(struct mm_struct *mm, struct list_head *stop) +{ + struct sp_area *spa, *prev = NULL; + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &mm->sp_group->spa_list, link) { + if (&spa->link == stop) + break; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + err = do_munmap(mm, spa->va_start, spa_size(spa), NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap task areas\n", + (void *)spa->va_start); + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + + spin_unlock(&sp_area_lock); + up_write(&mm->mmap_sem); + mmput(mm); +} + +/** + * sp_group_add_task - add a process to an sp_group + * @pid: the pid of the task to be added + * @spg_id: the ID of the sp_group + * + * A thread group can't be added to more than one sp_group. + * + * Return: The manually allocated ID is between [SPG_ID_MIN, SPG_ID_MAX] + * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX] + * When negative, the return value is -errno. + */ +int sp_group_add_task(int pid, int spg_id) +{ + struct task_struct *tsk; + struct mm_struct *mm; + struct sp_group *spg; + int ret = 0; + struct sp_area *spa, *prev = NULL; + struct sp_proc_stat *stat; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if ((spg_id < SPG_ID_MIN || spg_id > SPG_ID_AUTO) + && spg_id != SPG_ID_DVPP_PASS_THROUGH) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed due to invalid group id %d\n", spg_id); + return -EINVAL; + } + + if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (!spg_valid(spg)) { + mutex_unlock(&sp_mutex); + pr_err("share pool: task add group failed because group id %d hasn't been create or dead\n", + spg_id); + return -EINVAL; + } + mutex_unlock(&sp_mutex); + } + + if (spg_id == SPG_ID_AUTO) { + spg_id = ida_alloc_range(&sp_group_id_ida, SPG_ID_AUTO_MIN, + SPG_ID_AUTO_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed\n"); + return spg_id; + } + } + + if (spg_id == SPG_ID_DVPP_PASS_THROUGH) { + spg_id = ida_alloc_range(&sp_group_id_ida, + SPG_ID_DVPP_PASS_THROUGH_MIN, + SPG_ID_DVPP_PASS_THROUGH_MAX, GFP_ATOMIC); + if (spg_id < 0) { + pr_err("share pool: task add group failed when automatically generate group id failed" + "in DVPP pass through\n"); + return spg_id; + } + } + + mutex_lock(&sp_mutex); + + rcu_read_lock(); + + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else if (tsk->mm->sp_group) /* if it's already in a sp_group */ + ret = -EEXIST; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_unlock; + + spg = find_or_alloc_sp_group(spg_id); + if (IS_ERR(spg) || !spg_valid(spg)) { + ret = PTR_ERR(spg); + goto out_put_task; + } + /* access control permission check */ + if (sysctl_ac_mode == AC_SINGLE_OWNER) { + if (spg->owner != current->group_leader) { + ret = -EPERM; + goto out_put_task; + } + } + + /* per process statistics initialization */ + stat = sp_init_proc_stat(tsk); + if (IS_ERR(stat)) { + ret = PTR_ERR(stat); + pr_err("share pool: init proc stat failed, ret %lx\n", PTR_ERR(stat)); + goto out_put_task; + } + + mm = tsk->mm; + mm->sp_group = 
spg; + atomic_inc(&spg->use_count); + list_add_tail(&tsk->mm->sp_node, &spg->procs); + /* + * create mappings of existing shared memory segments into this + * new process' page table. + */ + spin_lock(&sp_area_lock); + + list_for_each_entry(spa, &spg->spa_list, link) { + unsigned long populate = 0; + struct file *file = spa_file(spa); + void *p; + + if (prev) + __sp_area_drop_locked(prev); + prev = spa; + + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + p = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p) && (PTR_ERR(p) != -ESPGMMEXIT)) { + sp_munmap_task_areas(mm, &spa->link); + ret = PTR_ERR(p); + pr_err("share pool: task add group sp mmap failed, ret %d\n", ret); + spin_lock(&sp_area_lock); + break; + } + + if (PTR_ERR(p) == -ESPGMMEXIT) { + pr_err("share pool: task add group sp mmap failed, ret -ESPGMEXIT\n"); + spin_lock(&sp_area_lock); + ret = -ESPGMMEXIT; + break; + } + + if (populate) { + ret = do_mm_populate(mm, spa->va_start, populate, 0); + if (ret) { + if (printk_ratelimit()) + pr_err("share pool: task add group failed when mm populate failed: %d\n", + ret); + sp_munmap_task_areas(mm, spa->link.next); + } + } + + spin_lock(&sp_area_lock); + } + if (prev) + __sp_area_drop_locked(prev); + spin_unlock(&sp_area_lock); + + if (unlikely(ret)) { + idr_remove(&sp_stat_idr, mm->sp_stat_id); + kfree(stat); + } + +out_put_task: + put_task_struct(tsk); +out_unlock: + mutex_unlock(&sp_mutex); + return ret == 0 ? spg_id : ret; +} +EXPORT_SYMBOL_GPL(sp_group_add_task); + +static void spg_exit_lock(bool *unlock) +{ + switch (mutex_trylock_recursive(&sp_mutex)) { + case MUTEX_TRYLOCK_RECURSIVE: + *unlock = false; + break; + case MUTEX_TRYLOCK_FAILED: + mutex_lock(&sp_mutex); + *unlock = true; + break; + case MUTEX_TRYLOCK_SUCCESS: + *unlock = true; + break; + default: + BUG(); + } +} + +static void spg_exit_unlock(bool unlock) +{ + if (unlock) + mutex_unlock(&sp_mutex); +} + +/* + * Do cleanup when a process exits. + */ +void sp_group_exit(struct mm_struct *mm) +{ + bool is_alive = true; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + /* + * Nothing to do if this thread group doesn't belong to any sp_group. + * No need to protect this check with lock because we can add a task + * to a group if !PF_EXITING. + */ + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + if (list_is_singular(&mm->sp_group->procs)) + is_alive = mm->sp_group->is_alive = false; + list_del(&mm->sp_node); + spg_exit_unlock(unlock); + + /* + * To avoid calling this with sp_mutex held, we first mark the + * sp_group as dead and then send the notification and then do + * the real cleanup in sp_group_post_exit(). + */ + if (!is_alive) + blocking_notifier_call_chain(&sp_notifier_chain, 0, + mm->sp_group); +} + +void sp_group_post_exit(struct mm_struct *mm) +{ + bool is_alive; + struct sp_proc_stat *stat; + bool unlock; + + if (!enable_ascend_share_pool) + return; + + if (!mm->sp_group) + return; + + spg_exit_lock(&unlock); + is_alive = mm->sp_group->is_alive; + + /* pointer stat must be valid, we don't need to check sanity */ + stat = idr_find(&sp_stat_idr, mm->sp_stat_id); + /* + * There are two basic scenarios when a process in the share pool is + * exiting but its share pool memory usage is not 0. + * 1. Process A called sp_alloc(), but it terminates without calling + * sp_free(). Then its share pool memory usage is a positive number. + * 2. Process A never called sp_alloc(), and process B in the same spg + * called sp_alloc() to get an addr u. 
Then A gets u somehow and + * called sp_free(u). Now A's share pool memory usage is a negative + * number. Notice B's memory usage will be a positive number. + * + * We decide to print a info when seeing both of the scenarios. + */ + if (stat && stat->amount != 0) + pr_info("share pool: process %s(%d) of sp group %d exits. " + "It applied %ld aligned KB\n", + stat->comm, mm->sp_stat_id, + mm->sp_group->id, byte2kb(stat->amount)); + + idr_remove(&sp_stat_idr, mm->sp_stat_id); + + if (atomic_dec_and_test(&mm->sp_group->use_count)) { + BUG_ON(is_alive); + free_sp_group(mm->sp_group); + } + spg_exit_unlock(unlock); + + kfree(stat); +} + +/* the caller must hold sp_area_lock */ +static void __insert_sp_area(struct sp_area *spa) +{ + struct rb_node **p = &sp_area_root.rb_node; + struct rb_node *parent = NULL; + + while (*p) { + struct sp_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct sp_area, rb_node); + if (spa->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (spa->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&spa->rb_node, parent, p); + rb_insert_color(&spa->rb_node, &sp_area_root); +} + +/* + * Allocate a region of VA from the share pool. + * @size - the size of VA to allocate + * + * The caller must hold must sp_mutex when input parameter spg is not NULL + * + * Return NULL if fail. + */ +static struct sp_area *sp_alloc_area(unsigned long size, unsigned long flags, + struct sp_group *spg, enum spa_type type) +{ + struct sp_area *spa; + struct rb_node *n; + unsigned long vstart = MMAP_SHARE_POOL_START; + unsigned long vend = MMAP_SHARE_POOL_16G_START; + unsigned long addr; + unsigned long size_align = ALIGN(size, 1 << 21); /* align to 2M */ + + if ((flags & SP_DVPP)) { + if (host_svm_sp_enable == false) { + vstart = MMAP_SHARE_POOL_16G_START; + vend = MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE; + } else { + vstart = spg->dvpp_va_start; + vend = spg->dvpp_va_start + spg->dvpp_size; + } + } + + addr = vstart; + + if (!sysctl_share_pool_hugepage_enable) + flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + spa = kmalloc(sizeof(struct sp_area), GFP_KERNEL); + if (unlikely(!spa)) { + if (printk_ratelimit()) + pr_err("share pool: alloc spa failed due to lack of memory\n"); + return NULL; + } + + spin_lock(&sp_area_lock); + + n = sp_area_root.rb_node; + if (n) { + struct sp_area *first = NULL; + + do { + struct sp_area *tmp; + tmp = rb_entry(n, struct sp_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size_align) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + + while (addr + size_align >= first->va_start && + addr + size_align <= vend) { + addr = first->va_end; + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct sp_area, rb_node); + else + goto found; + } + } +found: + if (addr + size_align > vend) { + goto error; + } + + spa->va_start = addr; + spa->va_end = addr + size_align; + spa->real_size = size; + spa->is_hugepage = (flags & SP_HUGEPAGE); + spa->spg = spg; + atomic_set(&spa->use_count, 1); + spa->type = type; + + if (spa_inc_usage(type, size)) + goto error; + + __insert_sp_area(spa); + if (spa->spg) { + atomic_inc(&spg->spa_num); + atomic_add(size, &spg->size); + atomic_inc(&spg_stat.spa_total_num); + atomic_add(size, &spg_stat.spa_total_size); + 
list_add_tail(&spa->link, &spg->spa_list); + } + spin_unlock(&sp_area_lock); + + return spa; + +error: + spin_unlock(&sp_area_lock); + kfree(spa); + return NULL; +} + +/* the caller should hold sp_area_lock */ +static struct sp_area *__find_sp_area_locked(unsigned long addr) +{ + struct rb_node *n = sp_area_root.rb_node; + + while (n) { + struct sp_area *spa; + + spa = rb_entry(n, struct sp_area, rb_node); + if (addr < spa->va_start) { + n = n->rb_left; + } else if (addr > spa->va_start) { + n = n->rb_right; + } else { + return spa; + } + } + + return NULL; +} + +static struct sp_area *__find_sp_area(unsigned long addr) +{ + struct sp_area *n; + spin_lock(&sp_area_lock); + n = __find_sp_area_locked(addr); + if (n) + atomic_inc(&n->use_count); + spin_unlock(&sp_area_lock); + return n; +} + +/* + * Free the VA region starting from addr to the share pool + */ +static void sp_free_area(struct sp_area *spa) +{ + lockdep_assert_held(&sp_area_lock); + + spa_dec_usage(spa->type, spa->real_size); /* won't fail */ + if (spa->spg) { + atomic_dec(&spa->spg->spa_num); + atomic_sub(spa->real_size, &spa->spg->size); + atomic_dec(&spg_stat.spa_total_num); + atomic_sub(spa->real_size, &spg_stat.spa_total_size); + list_del(&spa->link); + } + rb_erase(&spa->rb_node, &sp_area_root); + RB_CLEAR_NODE(&spa->rb_node); + kfree(spa); +} + +static void __sp_area_drop_locked(struct sp_area *spa) +{ + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma(). Before A calls this func, + * B calls sp_free() to free the same spa. So spa maybe NULL when A + * calls this func later. + */ + if (!spa) + return; + + if (atomic_dec_and_test(&spa->use_count)) + sp_free_area(spa); +} + +static void __sp_area_drop(struct sp_area *spa) +{ + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +void sp_area_drop(struct vm_area_struct *vma) +{ + struct sp_area *spa; + + if (!sp_check_vm_share_pool(vma->vm_flags)) + return; + + /* + * Considering a situation where task A and B are in the same spg. + * A is exiting and calling remove_vma() -> ... -> sp_area_drop(). + * Concurrently, B is calling sp_free() to free the same spa. + * __find_sp_area_locked() and __sp_area_drop_locked() should be + * an atomic operation. + */ + spin_lock(&sp_area_lock); + spa = __find_sp_area_locked(vma->vm_start); + __sp_area_drop_locked(spa); + spin_unlock(&sp_area_lock); +} + +/* The caller must hold sp_mutex. */ +static void sp_munmap(struct mm_struct *mm, unsigned long addr, + unsigned long size) +{ + int err; + + if (!mmget_not_zero(mm)) + return; + down_write(&mm->mmap_sem); + + err = do_munmap(mm, addr, size, NULL); + if (err) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when sp munmap\n", (void *)addr); + } + + up_write(&mm->mmap_sem); + mmput(mm); +} + +/* The caller must hold sp_mutex. */ +static void __sp_free(struct sp_group *spg, unsigned long addr, + unsigned long size, struct mm_struct *stop) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (mm == stop) + break; + sp_munmap(mm, addr, size); + } +} + +/* + * Free the memory allocated by sp_alloc() + * @addr - the starting VA of the memory + * + * Return fail if the memory can't be found or was not allocted by share pool. 
+ */ +int sp_free(unsigned long addr) +{ + struct sp_area *spa; + struct sp_proc_stat *stat; + int mode; + loff_t offset; + int ret = 0; + + mutex_lock(&sp_mutex); + + /* + * Access control: a share pool addr can only be freed by another task + * in the same spg or a kthread (such as buff_module_guard_work) + */ + spa = __find_sp_area(addr); + if (spa) { + if (current->mm != NULL) { + if (current->mm->sp_group != spa->spg) { + ret = -EPERM; + goto drop_spa; + } + } + } else { /* spa == NULL */ + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: sp_free invalid input addr %pK\n", (void *)addr); + goto out; + } + + if (!spg_valid(spa->spg)) + goto drop_spa; + + __sp_free(spa->spg, spa->va_start, spa_size(spa), NULL); + + /* Free the memory of the backing shmem or hugetlbfs */ + mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE; + offset = addr - MMAP_SHARE_POOL_START; + ret = vfs_fallocate(spa_file(spa), mode, offset, spa_size(spa)); + if (ret) + pr_err("share pool: fallocate failed: %d\n", ret); + + /* pointer stat may be invalid because of kthread buff_module_guard_work */ + if (current->mm == NULL) { + kthread_stat.amount -= spa->real_size; + } else { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount -= spa->real_size; + else + BUG(); + } + +drop_spa: + __sp_area_drop(spa); +out: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_free); + +/* wrapper of __do_mmap() and the caller must hold down_write(&mm->mmap_sem). */ +static unsigned long __sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr = spa->va_start; + unsigned long size = spa_size(spa); + unsigned long prot = PROT_READ | PROT_WRITE; + unsigned long flags = MAP_FIXED | MAP_SHARED | MAP_LOCKED | + MAP_POPULATE | MAP_SHARE_POOL; + unsigned long vm_flags = VM_NORESERVE | VM_SHARE_POOL | VM_DONTCOPY; + unsigned long pgoff = (addr - MMAP_SHARE_POOL_START) >> PAGE_SHIFT; + + atomic_inc(&spa->use_count); + addr = __do_mmap(mm, file, addr, size, prot, flags, vm_flags, pgoff, + populate, NULL); + if (IS_ERR_VALUE(addr)) { + atomic_dec(&spa->use_count); + pr_err("share pool: do_mmap fails %ld\n", addr); + } + + return addr; +} + +static void *sp_mmap(struct mm_struct *mm, struct file *file, + struct sp_area *spa, unsigned long *populate) +{ + unsigned long addr; + + if (!mmget_not_zero(mm)) + return ERR_PTR(-ESPGMMEXIT); + down_write(&mm->mmap_sem); + addr = __sp_mmap(mm, file, spa, populate); + up_write(&mm->mmap_sem); + mmput(mm); + + if (IS_ERR_VALUE(addr)) + return ERR_PTR(addr); + + BUG_ON(addr != spa->va_start); + return (void *)addr; +} + +/** + * Allocate shared memory for all the processes in the same sp_group + * size - the size of memory to allocate + * sp_flags - how to allocate the memory + * spg_id - the share group that the memory is allocated to. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) +{ + struct sp_group *spg = NULL; + struct sp_area *spa = NULL; + struct sp_proc_stat *stat; + unsigned long sp_addr; + void *p_mmap, *p = ERR_PTR(-ENODEV); + struct mm_struct *mm; + struct file *file; + unsigned long size_aligned; + int ret = 0; + struct mm_struct *tmp; + + /* mdc scene hack */ + if (enable_mdc_default_group) + spg_id = mdc_default_group_id; + + if (spg_id != SPG_ID_DEFAULT && spg_id < SPG_ID_MIN) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid group id %d\n", spg_id); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE | SP_DVPP)) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to invalid flag %lu\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + if (sp_flags & SP_HUGEPAGE_ONLY) + sp_flags |= SP_HUGEPAGE; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(current->pid, SPG_ID_DEFAULT); + mutex_unlock(&sp_mutex); + if (!spg) { /* DVPP pass through scene: first call sp_alloc() */ + /* mdc scene hack */ + if (enable_mdc_default_group) + ret = sp_group_add_task(current->tgid, spg_id); + else + ret = sp_group_add_task(current->tgid, + SPG_ID_DVPP_PASS_THROUGH); + /* + * The multi-thread contention may cause repeated joins to the group. + * The judgment is added to prevent exit in this case. + */ + if (ret < 0 && (ret != -EEXIST)) { + pr_err("share pool: allocation failed due to add group error %d in DVPP pass through scenario", + ret); + p = ERR_PTR(ret); + goto out; + } + mutex_lock(&sp_mutex); + spg = current->mm->sp_group; + } else { /* other scenes */ + mutex_lock(&sp_mutex); + if (spg_id != SPG_ID_DEFAULT) { + /* the caller should be a member of the sp group */ + if (spg != idr_find(&sp_group_idr, spg_id)) + goto out; + } + } + + if (!spg_valid(spg)) { + pr_err("share pool: sp alloc failed, spg is invalid\n"); + goto out; + } + + if (!sysctl_share_pool_hugepage_enable) + sp_flags &= ~(SP_HUGEPAGE_ONLY | SP_HUGEPAGE); + + if (sp_flags & SP_HUGEPAGE) { + file = spg->file_hugetlb; + size_aligned = ALIGN(size, PMD_SIZE); + } else { + file = spg->file; + size_aligned = ALIGN(size, PAGE_SIZE); + } +try_again: + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_ALLOC); + if (!spa) { + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to alloc spa failure\n"); + p = ERR_PTR(-ENOMEM); + goto out; + } + sp_addr = spa->va_start; + + /* create mapping for each process in the group */ + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + unsigned long populate = 0; + struct vm_area_struct *vma; + + p_mmap = sp_mmap(mm, file, spa, &populate); + if (IS_ERR(p_mmap) && (PTR_ERR(p_mmap) != -ESPGMMEXIT)) { + p = p_mmap; + __sp_free(spg, sp_addr, size_aligned, mm); + pr_err("share pool: allocation sp mmap failed, ret %ld\n", PTR_ERR(p_mmap)); + break; + } + + if (PTR_ERR(p_mmap) == -ESPGMMEXIT) { + pr_info("share pool: allocation sp mmap failed, ret -ESPGMMEXIT\n"); + continue; + } + + p = p_mmap; /* success */ + if (populate == 0) + continue; + + if (!mmget_not_zero(mm)) + continue; + down_write(&mm->mmap_sem); + vma = find_vma(mm, sp_addr); + if (unlikely(!vma)) { + pr_err("share pool: allocation failed due to find %pK vma failure\n", + (void *)sp_addr); + p = ERR_PTR(-EINVAL); + up_write(&mm->mmap_sem); + mmput(mm); + goto out; + } + /* clean PTE_RDONLY flags or trigger SMMU event */ + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + 
up_write(&mm->mmap_sem); + /* + * We are not ignoring errors, so if we fail to allocate + * physical memory we just return failure, so we won't encounter + * page fault later on, and more importantly sp_make_share_u2k() + * depends on this feature (and MAP_LOCKED) to work correctly. + */ + ret = do_mm_populate(mm, sp_addr, populate, 0); + if (ret) { + __sp_free(spg, sp_addr, size_aligned, + list_next_entry(mm, sp_node)); + + if (file == spg->file_hugetlb) { + spg->hugepage_failures++; + + /* fallback to small pages */ + if (!(sp_flags & SP_HUGEPAGE_ONLY)) { + file = spg->file; + spa->is_hugepage = false; + size_aligned = ALIGN(size, PAGE_SIZE); + __sp_area_drop(spa); + mmput(mm); + goto try_again; + } + } + + if (printk_ratelimit()) + pr_err("share pool: allocation failed due to mm populate failed: %d\n", + ret); + p = ERR_PTR(ret); + mmput(mm); + break; + } + mmput(mm); + } + + if (!IS_ERR(p)) { + stat = idr_find(&sp_stat_idr, current->mm->sp_stat_id); + if (stat) + stat->amount += size_aligned; + } + +out: + mutex_unlock(&sp_mutex); + + /* this will free spa if mmap failed */ + if (spa) + __sp_area_drop(spa); + + return p; +} +EXPORT_SYMBOL_GPL(sp_alloc); + +static unsigned long __sp_remap_get_pfn(unsigned long kva) +{ + unsigned long pfn; + if (is_vmalloc_addr((void *)kva)) + pfn = vmalloc_to_pfn((void *)kva); + else + pfn = virt_to_pfn(kva); + + return pfn; +} + +/* + * return value: >0 means this is a hugepage addr + * =0 means a normal addr. <0 means an errno. + */ +static int is_vmap_hugepage(unsigned long addr) +{ + struct vm_struct *area; + + if (unlikely(!addr)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when judge vmap addr\n"); + return -EINVAL; + } + + area = find_vm_area((void *)addr); + if (unlikely(!area)) { + if (printk_ratelimit()) + pr_err("share pool: failed to find vm area(%lx)\n", addr); + return -EINVAL; + } + + if (area->flags & VM_HUGE_PAGES) + return 1; + else + return 0; +} + +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm) +{ + struct vm_area_struct *vma; + unsigned long ret_addr; + unsigned long populate = 0; + unsigned long addr, buf, offset; + struct file *file = NULL; + int ret = 0; + struct user_struct *user = NULL; + int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; + + if (spa->is_hugepage) { + file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(file)) { + pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } + } + + if (!mmget_not_zero(mm)) { + ret_addr = -ESPGMMEXIT; + goto put_file; + } + down_write(&mm->mmap_sem); + + ret_addr = __sp_mmap(mm, file, spa, &populate); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: k2u mmap failed %lx\n", ret_addr); + goto out; + } + BUG_ON(ret_addr != spa->va_start); + + vma = find_vma(mm, ret_addr); + BUG_ON(vma == NULL); + vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY); + + if (is_vm_hugetlb_page(vma)) { + ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); + if (ret) { + pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); + ret_addr = ret; + goto out; + } + } else { + buf = ret_addr; + addr = kva; + offset = 0; + do { + ret = remap_pfn_range(vma, buf, __sp_remap_get_pfn(addr), PAGE_SIZE, + __pgprot(vma->vm_page_prot.pgprot)); + if (ret) { + ret_addr = ret; + goto out; + } + offset += PAGE_SIZE; + buf += PAGE_SIZE; + addr += PAGE_SIZE; + } while 
(offset < spa_size(spa)); + } + +out: + up_write(&mm->mmap_sem); + mmput(mm); +put_file: + if (file) + fput(file); + + return ret_addr; +} + +static void *sp_make_share_kva_to_task(unsigned long kva, struct sp_area *spa, + int pid) +{ + struct task_struct *tsk; + unsigned long ret_addr; + void *p = ERR_PTR(-ENODEV); + int ret = 0; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + return ERR_PTR(ret); + + ret_addr = sp_remap_kva_to_vma(kva, spa, tsk->mm); + if (IS_ERR_VALUE(ret_addr)) { + pr_err("share pool: remap k2u to task failed, ret %ld\n", ret_addr); + sp_munmap(tsk->mm, spa->va_start, spa_size(spa)); + p = ERR_PTR(ret_addr); + goto out; + } + + p = (void *)ret_addr; +out: + put_task_struct(tsk); + return p; +} + +static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, + struct sp_group *spg) +{ + struct mm_struct *mm; + struct mm_struct *tmp; + unsigned long ret_addr = -ENODEV; + unsigned long uva = -ENODEV; + void *p = ERR_PTR(-ENODEV); + + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + ret_addr = sp_remap_kva_to_vma(kva, spa, mm); + if (IS_ERR_VALUE(ret_addr) && (ret_addr != -ESPGMMEXIT)) { + pr_err("share pool: remap k2u to spg failed, ret %ld \n", ret_addr); + __sp_free(spg, spa->va_start, spa_size(spa), + list_next_entry(mm, sp_node)); + p = ERR_PTR(ret_addr); + goto out; + } + + if (ret_addr == -ESPGMMEXIT) { + pr_info("share pool: remap k2u, ret is -ESPGMMEXIT\n"); + continue; + } + + uva = ret_addr; + } + p = (void *)uva; +out: + return p; +} + +/** + * Share kernel memory to a specified process or sp_group + * @kva: the VA of shared kernel memory + * @size: the size of shared kernel memory + * @sp_flags: how to allocate the memory. We only support SP_DVPP. + * @pid: the pid of the specified process + * @spg_id: currently, only support default value(SPG_ID_DEFAULT) and other values + * are useless. + * + * Return: the shared target user address to start at + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. 
+ */ +void *sp_make_share_k2u(unsigned long kva, unsigned long size, + unsigned long sp_flags, int pid, int spg_id) +{ + void *uva = ERR_PTR(-ENODEV); + struct sp_group *spg; + struct sp_area *spa; + unsigned long kva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + int ret; + + if (sp_flags & ~SP_DVPP) { + if (printk_ratelimit()) + pr_err("share pool: k2u sp_flags %lu error\n", sp_flags); + return ERR_PTR(-EINVAL); + } + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + sp_flags |= SP_HUGEPAGE; + page_size = PMD_SIZE; + } else if (ret == 0) { + /* do nothing */ + } else { + return ERR_PTR(ret); + } + /* aligned down kva is convenient for caller to start with any valid kva */ + kva_aligned = ALIGN_DOWN(kva, page_size); + size_aligned = ALIGN(kva + size, page_size) - kva_aligned; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, spg_id); + if (spg == NULL) { + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); + mutex_unlock(&sp_mutex); + } else if (spg_valid(spg)) { + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + if (!spa) { + mutex_unlock(&sp_mutex); + if (printk_ratelimit()) + pr_err("share pool: k2u failed due to alloc spa failure\n"); + return ERR_PTR(-ENOMEM); + } + + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + mutex_unlock(&sp_mutex); + } else { + mutex_unlock(&sp_mutex); + pr_err("share pool: failed to make k2u\n"); + return NULL; + } + + if (!IS_ERR(uva)) + uva = uva + (kva - kva_aligned); + + __sp_area_drop(spa); + return uva; +} +EXPORT_SYMBOL_GPL(sp_make_share_k2u); + +static int sp_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + struct page *page = pte_page(*pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(*pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +static int sp_test_walk(unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + /* + * FIXME: The devmm driver uses remap_pfn_range() but actually there + * are associated struct pages, so they should use vm_map_pages() or + * similar APIs. Before the driver has been converted to correct APIs + * we use this test_walk() callback so we can treat VM_PFNMAP VMAs as + * normal VMAs. 
+ */ + return 0; +} + +static int sp_pte_hole(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + if (printk_ratelimit()) + pr_err("share pool: hole [%pK, %pK) appeared unexpectedly\n", (void *)start, (void *)end); + return -EFAULT; +} + +static int sp_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + pte_t pte = huge_ptep_get(ptep); + struct page *page = pte_page(pte); + struct sp_walk_data *sp_walk_data; + + if (unlikely(!pte_present(pte))) { + if (printk_ratelimit()) + pr_err("share pool: the page of addr %pK unexpectedly not in RAM\n", (void *)addr); + return -EFAULT; + } + + sp_walk_data = walk->private; + get_page(page); + sp_walk_data->pages[sp_walk_data->page_count++] = page; + return 0; +} + +/** + * the caller must hold mm->mmap_sem + * + * Notes for parameter alignment: + * When size == 0, let it be page_size, so that at least one page is walked. + * + * When size > 0, for convenience, usually the parameters of uva and + * size are not page aligned. There are four different alignment scenarios and + * we must handler all of them correctly. + * + * The basic idea is to align down uva and align up size so all the pages + * in range [uva, uva + size) are walked. However, there are special cases. + * + * Considering a 2M-hugepage addr scenario. Assuming the caller wants to + * traverse range [1001M, 1004.5M), so uva and size is 1001M and 3.5M + * accordingly. The aligned-down uva is 1000M and the aligned-up size is 4M. + * The traverse range will be [1000M, 1004M). Obviously, the final page for + * [1004M, 1004.5M) is not covered. + * + * To fix this problem, we need to walk an additional page, size should be + * ALIGN(uva+size) - uva_aligned + */ +static int __sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + struct vm_area_struct *vma; + unsigned long page_nr; + struct page **pages = NULL; + bool is_hugepage = false; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size = PAGE_SIZE; + struct mm_walk sp_walk = {}; + + /* + * Here we also support non share pool memory in this interface + * because the caller can't distinguish whether a uva is from the + * share pool or not. It is not the best idea to do so, but currently + * it simplifies overall design. + * + * In this situation, the correctness of the parameters is mainly + * guaranteed by the caller. 
+ */ + vma = find_vma(tsk->mm, uva); + if (!vma) { + if (printk_ratelimit()) + pr_err("share pool: u2k input uva %pK is invalid\n", (void *)uva); + return -EINVAL; + } + if ((is_vm_hugetlb_page(vma)) || is_vm_huge_special(vma)) + is_hugepage = true; + + sp_walk.pte_hole = sp_pte_hole; + sp_walk.test_walk = sp_test_walk; + if (is_hugepage) { + sp_walk_data->is_hugepage = true; + sp_walk.hugetlb_entry = sp_hugetlb_entry; + page_size = PMD_SIZE; + } else { + sp_walk_data->is_hugepage = false; + sp_walk.pte_entry = sp_pte_entry; + } + + sp_walk_data->page_size = page_size; + uva_aligned = ALIGN_DOWN(uva, page_size); + sp_walk_data->uva_aligned = uva_aligned; + if (size == 0) + size_aligned = page_size; + else + /* special alignment handling */ + size_aligned = ALIGN(uva + size, page_size) - uva_aligned; + + if (uva_aligned + size_aligned < uva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in walk page range\n"); + return -EINVAL; + } + + page_nr = size_aligned / page_size; + pages = kvmalloc(page_nr * sizeof(struct page *), GFP_KERNEL); + if (!pages) { + if (printk_ratelimit()) + pr_err("share pool: alloc page array failed in walk page range\n"); + return -ENOMEM; + } + sp_walk_data->pages = pages; + + sp_walk.mm = tsk->mm; + sp_walk.private = sp_walk_data; + + ret = walk_page_range(uva_aligned, uva_aligned + size_aligned, + &sp_walk); + if (ret) + kvfree(pages); + + return ret; +} + +/** + * Share user memory of a specified process to kernel + * @uva: the VA of shared user memory + * @size: the size of shared user memory + * @pid: the pid of the specified process + * + * Return: if success, return the starting kernel address of the shared memory. + * if failed, return the pointer of -errno. + */ +void *sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +{ + int ret = 0; + struct task_struct *tsk; + void *p = ERR_PTR(-ENODEV); + struct sp_walk_data sp_walk_data = { + .page_count = 0, + }; + struct vm_struct *area; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + rcu_read_unlock(); + if (ret) { + p = ERR_PTR(ret); + goto out; + } + + if (!mmget_not_zero(tsk->mm)) + goto out_put_task; + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, &sp_walk_data); + if (ret) { + pr_err("share pool: walk page range failed, ret %d\n", ret); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + p = ERR_PTR(ret); + goto out_put_task; + } + + if (sp_walk_data.is_hugepage) + p = vmap_hugepage(sp_walk_data.pages, sp_walk_data.page_count, + VM_MAP | VM_HUGE_PAGES, PAGE_KERNEL); + else + p = vmap(sp_walk_data.pages, sp_walk_data.page_count, VM_MAP, + PAGE_KERNEL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + + if (!p) { + if (printk_ratelimit()) + pr_err("share pool: vmap(huge) in u2k failed\n"); + p = ERR_PTR(-ENOMEM); + goto out_free_pages; + } else { + p = p + (uva - sp_walk_data.uva_aligned); + } + + /* + * kva p may be used later in k2u. Since p comes from uva originally, + * it's reasonable to add flag VM_USERMAP so that p can be remapped + * into userspace again. 
+ */ + area = find_vm_area(p); + area->flags |= VM_USERMAP; + +out_free_pages: + kvfree(sp_walk_data.pages); +out_put_task: + put_task_struct(tsk); +out: + return p; +} +EXPORT_SYMBOL_GPL(sp_make_share_u2k); + +static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + struct task_struct *tsk; + struct sp_group *spg; + struct sp_area *spa; + unsigned long uva_aligned; + unsigned long size_aligned; + unsigned int page_size; + + mutex_lock(&sp_mutex); + /* + * at first we guess it's a hugepage addr + * we can tolerate at most PMD_SIZE or PAGE_SIZE which is matched in k2u + */ + spa = __find_sp_area(ALIGN_DOWN(uva, PMD_SIZE)); + if (!spa) { + spa = __find_sp_area(ALIGN_DOWN(uva, PAGE_SIZE)); + if (!spa) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: invalid input uva %pK in unshare uva\n", (void *)uva); + goto out_unlock; + } + } + + /* + * 1. overflow actually won't happen due to an spa must be valid. + * 2. we must unshare [spa->va_start, spa->va_start + spa->real_size) completely + * because an spa is one-to-one correspondence with an vma. + * Thus input paramter size is not necessarily needed. + */ + page_size = (spa->is_hugepage ? PMD_SIZE : PAGE_SIZE); + uva_aligned = spa->va_start; + size_aligned = spa->real_size; + + if (size_aligned < ALIGN(size, page_size)) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed due to invalid parameter size %lu\n", size); + goto out_drop_area; + } + + if (spg_id == SPG_ID_NONE) { + if (spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, SPG_ID_NONE is invalid\n"); + goto out_drop_area; + } + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk || (tsk->flags & PF_EXITING)) + ret = -ESRCH; + else + get_task_struct(tsk); + + rcu_read_unlock(); + if (ret) + goto out_drop_area; + + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + goto out_drop_area; + } + down_write(&tsk->mm->mmap_sem); + ret = do_munmap(tsk->mm, uva_aligned, size_aligned, NULL); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + if (ret) { + /* we are not supposed to fail */ + pr_err("share pool: failed to unmap VA %pK when munmap in unshare uva\n", + (void *)uva_aligned); + } + put_task_struct(tsk); + } else { + /* + * k2u to task, then unshare_uva(..., spg_id) is invalid due to potential + * spa memory leak. 
+ */ + if (!spa->spg) { + ret = -EINVAL; + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, sp group id %d is invalid\n", spg_id); + goto out_drop_area; + } + + spg = __sp_find_spg(pid, spg_id); + if (spg_valid(spg)) { + __sp_free(spg, uva_aligned, size_aligned, NULL); + } else { + if (!spg) { + if (printk_ratelimit()) + pr_err("share pool: unshare uva failed, doesn't belong to group %d\n", + spg_id); + ret = -EINVAL; + goto out_drop_area; + } else { + pr_info("share pool: no need to unshare uva, target process is exiting\n"); + } + } + } + +out_drop_area: + __sp_area_drop(spa); +out_unlock: + mutex_unlock(&sp_mutex); + return ret; +} + +static int sp_unshare_kva(unsigned long kva, unsigned long size) +{ + unsigned long addr, kva_aligned; + struct page *page; + unsigned long size_aligned; + unsigned long step; + bool is_hugepage = true; + int ret; + + ret = is_vmap_hugepage(kva); + if (ret > 0) { + kva_aligned = ALIGN_DOWN(kva, PMD_SIZE); + size_aligned = ALIGN(kva + size, PMD_SIZE) - kva_aligned; + step = PMD_SIZE; + } else if (ret == 0) { + kva_aligned = ALIGN_DOWN(kva, PAGE_SIZE); + size_aligned = ALIGN(kva + size, PAGE_SIZE) - kva_aligned; + step = PAGE_SIZE; + is_hugepage = false; + } else { + pr_err("share pool: check vmap hugepage failed, ret %d\n", ret); + return -EINVAL; + } + + if (kva_aligned + size_aligned < kva_aligned) { + if (printk_ratelimit()) + pr_err("share pool: overflow happened in unshare kva\n"); + return -EINVAL; + } + + for (addr = kva_aligned; addr < (kva_aligned + size_aligned); addr += step) { + if (is_hugepage) + page = vmalloc_to_hugepage((void *)addr); + else + page = vmalloc_to_page((void *)addr); + if (page) + put_page(page); + else + pr_err("share pool: vmalloc to hugepage failed\n"); + } + + vunmap((void *)kva_aligned); + + return 0; +} + +/** + * Unshare the kernel or user memory which shared by calling sp_make_share_{k2u,u2k}(). + * @va: the specified virtual address of memory + * @size: the size of unshared memory + * @pid: the pid of the specified process if the VA is user address + * @spg_id: the ID of the specified sp_group if the VA is user address + * + * Return -errno if fail. + * + * Use spg_id of current thread if spg_id == SPG_ID_DEFAULT. + */ +int sp_unshare(unsigned long va, unsigned long size, int pid, int spg_id) +{ + int ret = 0; + + if (va < TASK_SIZE) { + /* user address */ + ret = sp_unshare_uva(va, size, pid, spg_id); + } else if (va >= VA_START) { + /* kernel address */ + ret = sp_unshare_kva(va, size); + } else { + /* regard user and kernel address ranges as bad address */ + if (printk_ratelimit()) + pr_err("share pool: unshare addr %pK is not a user or kernel addr", (void *)va); + ret = -EFAULT; + } + + return ret; +} +EXPORT_SYMBOL_GPL(sp_unshare); + +/** + * Return 0 when success. 
+ * When return value < 0, information in sp_walk_data is useless + */ +int sp_walk_page_range(unsigned long uva, unsigned long size, + struct task_struct *tsk, struct sp_walk_data *sp_walk_data) +{ + int ret = 0; + + if (unlikely(!sp_walk_data)) { + if (printk_ratelimit()) + pr_err("share pool: null pointer when walk page range\n"); + return -EINVAL; + } + if (!tsk || (tsk->flags & PF_EXITING)) + return -ESRCH; + + sp_walk_data->page_count = 0; + + get_task_struct(tsk); + if (!mmget_not_zero(tsk->mm)) { + put_task_struct(tsk); + return -EINVAL; + } + down_write(&tsk->mm->mmap_sem); + ret = __sp_walk_page_range(uva, size, tsk, sp_walk_data); + up_write(&tsk->mm->mmap_sem); + mmput(tsk->mm); + put_task_struct(tsk); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_walk_page_range); + +void sp_walk_page_free(struct sp_walk_data *sp_walk_data) +{ + struct page *page; + unsigned int i = 0; + + if (!sp_walk_data) + return; + + while (i < sp_walk_data->page_count) { + page = sp_walk_data->pages[i++]; + put_page(page); + } + + kvfree(sp_walk_data->pages); +} +EXPORT_SYMBOL_GPL(sp_walk_page_free); + +/** + * Walk the mm_struct of processes in the specified sp_group + * and call CALLBACK once for each mm_struct. + * @spg_id: the ID of the specified sp_group + * @data: the param for callback function + * @func: caller specific callback function + * + * Return -errno if fail. + */ +int sp_group_walk(int spg_id, void *data, int (*func)(struct mm_struct *mm, void *)) +{ + struct sp_group *spg; + int ret = -ESRCH; + + if (!func) { + if (printk_ratelimit()) + pr_err("share pool: null func pointer\n"); + return -EINVAL; + } + + mutex_lock(&sp_mutex); + spg = idr_find(&sp_group_idr, spg_id); + if (spg_valid(spg)) { + struct mm_struct *mm; + struct mm_struct *tmp; + list_for_each_entry_safe(mm, tmp, &spg->procs, sp_node) { + if (func) { + ret = func(mm, data); + if (ret) + goto out_unlock; + } + } + } +out_unlock: + mutex_unlock(&sp_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(sp_group_walk); + +int sp_register_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_register_notifier); + +int sp_unregister_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&sp_notifier_chain, nb); +} +EXPORT_SYMBOL_GPL(sp_unregister_notifier); + +/** + * user can config the share pool start addrese of each Da-vinci device + * @start: the value of share pool start + * @size: the value of share pool + * @device_id: the num of Da-vinci device + * @pid: the pid of device process + * + * Return false if parameter invalid of has been set up. + */ +bool sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +{ + struct sp_group *spg; + + if (device_id < 0 || device_id >= MAX_DEVID || pid < 0 || size <= 0 || + size > MMAP_SHARE_POOL_16G_SIZE) + return false; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(pid, SPG_ID_DEFAULT); + if (!spg_valid(spg) || spg->dvpp_multi_spaces == true) { + mutex_unlock(&sp_mutex); + return false; + } + spg->dvpp_va_start = start; + spg->dvpp_size = size; + spg->dvpp_multi_spaces = true; + host_svm_sp_enable = true; + mutex_unlock(&sp_mutex); + + return true; +} +EXPORT_SYMBOL_GPL(sp_config_dvpp_range); + +/* Check whether the address belongs to the share pool. 
*/ +bool is_sharepool_addr(unsigned long addr) +{ + if (host_svm_sp_enable == false) + return (addr >= MMAP_SHARE_POOL_START) && + addr < (MMAP_SHARE_POOL_16G_START + MMAP_SHARE_POOL_16G_SIZE); + + return addr >= MMAP_SHARE_POOL_START && addr < MMAP_SHARE_POOL_END; +} +EXPORT_SYMBOL_GPL(is_sharepool_addr); + +static int __init mdc_default_group(char *s) +{ + enable_mdc_default_group = 1; + return 1; +} +__setup("enable_mdc_default_group", mdc_default_group); + +int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct sp_group *spg = NULL; + struct sp_proc_stat *stat; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(task->pid, SPG_ID_DEFAULT); + if (spg_valid(spg)) { + /* print the file header */ + stat = idr_find(&sp_stat_idr, task->mm->sp_stat_id); + if (!stat) { + mutex_unlock(&sp_mutex); + return 0; + } + seq_printf(m, "%-10s %-18s %-15s\n", + "Group ID", "Aligned Apply(KB)", "HugePage Fails"); + seq_printf(m, "%-10d %-18ld %-15d\n", + spg->id, byte2kb(stat->amount), spg->hugepage_failures); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int idr_proc_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg; + struct sp_proc_stat *stat = p; + struct seq_file *seq = data; + + mutex_lock(&sp_mutex); + spg = __sp_find_spg(id, SPG_ID_DEFAULT); + if (spg) { + seq_printf(seq, "%-12d %-10d %-18ld\n", + id, spg->id, byte2kb(stat->amount)); + } + mutex_unlock(&sp_mutex); + + return 0; +} + +static int proc_stat_show(struct seq_file *seq, void *offset) +{ + /* print the file header */ + seq_printf(seq, "%-12s %-10s %-18s\n", + "Process ID", "Group ID", "Aligned Apply(KB)"); + /* print kthread buff_module_guard_work */ + seq_printf(seq, "%-12s %-10s %-18ld\n", + "guard", "-", byte2kb(kthread_stat.amount)); + idr_for_each(&sp_stat_idr, idr_proc_stat_cb, seq); + return 0; +} + +static void rb_spa_stat_show(struct seq_file *seq) +{ + struct rb_node *node; + struct sp_area *spa; + + spin_lock(&sp_area_lock); + + for (node = rb_first(&sp_area_root); node; node = rb_next(node)) { + spa = rb_entry(node, struct sp_area, rb_node); + atomic_inc(&spa->use_count); + spin_unlock(&sp_area_lock); + + mutex_lock(&sp_mutex); + if (spg_valid(spa->spg)) + seq_printf(seq, "%-10d ", spa->spg->id); + else /* k2u for task or spg is dead */ + seq_printf(seq, "%-10s ", "None"); + mutex_unlock(&sp_mutex); + + seq_printf(seq, "%2s%-14lx %2s%-14lx %-13ld ", + "0x", spa->va_start, + "0x", spa->va_end, + byte2kb(spa->real_size)); + + switch (spa->type) { + case SPA_TYPE_ALLOC: + seq_printf(seq, "%-7s ", "ALLOC"); + break; + case SPA_TYPE_K2TASK: + seq_printf(seq, "%-7s ", "TASK"); + break; + case SPA_TYPE_K2SPG: + seq_printf(seq, "%-7s ", "SPG"); + break; + default: + /* usually impossible, perhaps a developer's mistake */ + break; + } + + if (spa->is_hugepage) + seq_printf(seq, "%-5s ", "Y"); + else + seq_printf(seq, "%-5s ", "N"); + + seq_printf(seq, "%-10d\n", atomic_read(&spa->use_count)); + + spin_lock(&sp_area_lock); + __sp_area_drop_locked(spa); + } + + spin_unlock(&sp_area_lock); +} + +static void spa_overview_show(struct seq_file *seq) +{ + unsigned int total_num, alloc_num, k2u_task_num, k2u_spg_num; + unsigned long total_size, alloc_size, k2u_task_size, k2u_spg_size; + + spin_lock(&sp_area_lock); + total_num = spa_stat.total_num; + alloc_num = spa_stat.alloc_num; + k2u_task_num = spa_stat.k2u_task_num; + k2u_spg_num = spa_stat.k2u_spg_num; + total_size = spa_stat.total_size; + alloc_size = spa_stat.alloc_size; + k2u_task_size = 
spa_stat.k2u_task_size; + k2u_spg_size = spa_stat.k2u_spg_size; + spin_unlock(&sp_area_lock); + + seq_printf(seq, "Spa total num %u.\n", total_num); + seq_printf(seq, "Spa alloc num %u, k2u(task) num %u, k2u(spg) num %u.\n", + alloc_num, k2u_task_num, k2u_spg_num); + seq_printf(seq, "Spa total size: %13lu KB\n", byte2kb(total_size)); + seq_printf(seq, "Spa alloc size: %13lu KB\n", byte2kb(alloc_size)); + seq_printf(seq, "Spa k2u(task) size: %13lu KB\n", byte2kb(k2u_task_size)); + seq_printf(seq, "Spa k2u(spg) size: %13lu KB\n", byte2kb(k2u_spg_size)); + seq_printf(seq, "\n"); +} + +/* the caller must hold sp_mutex */ +static int idr_spg_stat_cb(int id, void *p, void *data) +{ + struct sp_group *spg = p; + struct seq_file *seq = data; + + seq_printf(seq, "Group %-10d size: %13d KB, spa num: %d.\n", + id, byte2kb(atomic_read(&spg->size)), + atomic_read(&spg->spa_num)); + + return 0; +} + +static void spg_overview_show(struct seq_file *seq) +{ + mutex_lock(&sp_mutex); + idr_for_each(&sp_group_idr, idr_spg_stat_cb, seq); + mutex_unlock(&sp_mutex); + seq_printf(seq, "Share pool total size: %13d KB, spa total num: %d.\n\n", + byte2kb(atomic_read(&spg_stat.spa_total_size)), + atomic_read(&spg_stat.spa_total_num)); +} + +static int spa_stat_show(struct seq_file *seq, void *offset) +{ + spg_overview_show(seq); + spa_overview_show(seq); + /* print the file header */ + seq_printf(seq, "%-10s %-16s %-16s %-13s %-7s %-5s %-10s\n", + "Group ID", "va_start", "va_end", "Aligned KB", "Type", "Huge", "Ref"); + rb_spa_stat_show(seq); + return 0; +} + +/* + * Called by proc_root_init() to initialize the /proc/sharepool subtree + */ +void __init proc_sharepool_init(void) +{ + if (!proc_mkdir("sharepool", NULL)) + return; + + proc_create_single_data("sharepool/proc_stat", 0, NULL, proc_stat_show, NULL); + proc_create_single_data("sharepool/spa_stat", 0, NULL, spa_stat_show, NULL); +} + + +struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, + unsigned int page_order, int node) +{ + if (area->flags & VM_HUGE_PAGES) + return hugetlb_alloc_hugepage(NUMA_NO_NODE); + else + return alloc_pages_node(node, mask, page_order); +} + +int enable_ascend_share_pool; + +static int __init enable_share_pool(char *s) +{ + enable_ascend_share_pool = 1; + + pr_info("Ascend enable share pool features\n"); + + return 1; +} +__setup("enable_ascend_share_pool", enable_share_pool);
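
A similarly rough sketch of the kernel<->user sharing interfaces added above (not part of this patch; error handling trimmed, kva/uva2/size/pid are placeholders, and the target pid is assumed not to be in any sp_group, so the k2u mapping is per-task and is torn down with SPG_ID_NONE):

	/* share a vmalloc()ed kernel buffer kva into task pid */
	void *uva = sp_make_share_k2u((unsigned long)kva, size, 0, pid, SPG_ID_DEFAULT);

	/* map the user buffer uva2 of task pid into the kernel */
	void *kva2 = sp_make_share_u2k(uva2, size, pid);

	/* tear both mappings down again */
	sp_unshare((unsigned long)uva, size, pid, SPG_ID_NONE);
	sp_unshare((unsigned long)kva2, size, pid, SPG_ID_NONE);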