hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/release-management/issues/ID3TGE

--------------------------------

Provide an efficient intra-node data transfer interface that enables an
application to map the pages backing a specified virtual address range
in a source process into the virtual address space of a destination
process, so that both processes share the same physical pages without
copying.

Signed-off-by: Liu Mingrui <liumingrui@huawei.com>
---
 drivers/zcopy/zcopy.c | 474 +++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 469 insertions(+), 5 deletions(-)

diff --git a/drivers/zcopy/zcopy.c b/drivers/zcopy/zcopy.c
index 9f224eac4e39..b718e14fcff3 100644
--- a/drivers/zcopy/zcopy.c
+++ b/drivers/zcopy/zcopy.c
@@ -7,6 +7,41 @@
 #include <linux/fs.h>
 #include <linux/cdev.h>
 #include <linux/uaccess.h>
+#include <linux/kallsyms.h>
+#include <linux/mm.h>
+#include <linux/kprobes.h>
+#include <linux/huge_mm.h>
+#include <linux/mm_types.h>
+#include <linux/mm_types_task.h>
+#include <linux/rmap.h>
+#include <linux/sched/mm.h>
+#include <linux/pgtable.h>
+#include <asm-generic/pgalloc.h>
+#include <asm/tlbflush.h>
+#include <asm/pgtable-hwdef.h>
+
+#ifndef PUD_SHIFT
+#define ARM64_HW_PGTABLE_LEVEL_SHIFT(n)	((PAGE_SHIFT - 3) * (4 - (n)) + 3)
+#define PUD_SHIFT	ARM64_HW_PGTABLE_LEVEL_SHIFT(1)
+#endif
+
+enum pgt_entry {
+	NORMAL_PMD,
+	HPAGE_PMD,
+};
+
+enum {
+	IO_ATTACH = 1,
+	IO_MAX
+};
+
+struct zcopy_ioctl_pswap {
+	unsigned long src_addr;
+	unsigned long dst_addr;
+	int src_pid;
+	int dst_pid;
+	unsigned long size;
+};
 
 struct zcopy_cdev {
 	struct cdev chrdev;
@@ -18,17 +53,439 @@ struct zcopy_cdev {
 
 static struct zcopy_cdev z_cdev;
 
-long zcopy_ioctl(struct file *file, unsigned int type, unsigned long ptr)
+static int (*__zcopy_pte_alloc)(struct mm_struct *, pmd_t *);
+static int (*__zcopy_pmd_alloc)(struct mm_struct *, pud_t *, unsigned long);
+static int (*__zcopy_pud_alloc)(struct mm_struct *, p4d_t *, unsigned long);
+static unsigned long (*kallsyms_lookup_name_funcp)(const char *);
+
+static struct kretprobe __kretprobe;
+
+static unsigned long __kprobe_lookup_name(const char *symbol_name)
+{
+	int ret;
+	void *addr;
+
+	__kretprobe.kp.symbol_name = symbol_name;
+	ret = register_kretprobe(&__kretprobe);
+	if (ret < 0) {
+		pr_err("register_kretprobe failed, returned %d\n", ret);
+		return 0;
+	}
+	pr_info("Planted %s kprobe at %pK\n", symbol_name, __kretprobe.kp.addr);
+	addr = __kretprobe.kp.addr;
+	unregister_kretprobe(&__kretprobe);
+	return (unsigned long)addr;
+}
+
+static inline unsigned long __kallsyms_lookup_name(const char *symbol_name)
+{
+	if (kallsyms_lookup_name_funcp == NULL)
+		return 0;
+	return kallsyms_lookup_name_funcp(symbol_name);
+}
+
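+/*
+ * Page-table allocators: mirror the kernel's p*_alloc() helpers using
+ * the unexported __p*_alloc() routines resolved at module init.
+ */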
+static inline pud_t *zcopy_pud_alloc(struct mm_struct *mm, p4d_t *p4d,
+		unsigned long address)
+{
+	return (unlikely(p4d_none(*p4d)) &&
+		__zcopy_pud_alloc(mm, p4d, address)) ? NULL : pud_offset(p4d, address);
+}
+
+static inline pmd_t *zcopy_pmd_alloc(struct mm_struct *mm, pud_t *pud,
+		unsigned long address)
+{
+	return (unlikely(pud_none(*pud)) &&
+		__zcopy_pmd_alloc(mm, pud, address)) ? NULL : pmd_offset(pud, address);
+}
+
+static inline bool zcopy_pte_alloc(struct mm_struct *mm, pmd_t *pmd)
+{
+	return unlikely(pmd_none(*pmd)) && __zcopy_pte_alloc(mm, pmd);
+}
+
+static pud_t *zcopy_get_pud(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+
+	pgd = pgd_offset(mm, addr);
+	if (pgd_none(*pgd))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return NULL;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+
+	return pud;
+}
+
+static pmd_t *zcopy_get_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = zcopy_get_pud(mm, addr);
+	if (!pud)
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return NULL;
+
+	return pmd;
+}
+
+static pud_t *zcopy_alloc_new_pud(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+
+	pgd = pgd_offset(mm, addr);
+	p4d = p4d_alloc(mm, pgd, addr);
+	if (!p4d)
+		return NULL;
+
+	return zcopy_pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *zcopy_alloc_pmd(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pud = zcopy_alloc_new_pud(mm, addr);
+	if (!pud)
+		return NULL;
+
+	pmd = zcopy_pmd_alloc(mm, pud, addr);
+	if (!pmd)
+		return NULL;
+
+	return pmd;
+}
+
+static inline void zcopy_add_mm_counter(struct mm_struct *mm, int member, long value)
+{
+	atomic_long_add(value, &mm->rss_stat.count[member]);
+}
+
+static inline void zcopy_add_mm_rss_vec(struct mm_struct *mm, int *rss)
+{
+	int i;
+
+	for (i = 0; i < NR_MM_COUNTERS; i++)
+		if (rss[i])
+			zcopy_add_mm_counter(mm, i, rss[i]);
+}
+
+static __always_inline unsigned long get_extent(enum pgt_entry entry,
+		unsigned long old_addr, unsigned long old_end,
+		unsigned long new_addr)
+{
+	unsigned long next, extent, mask, size;
+
+	switch (entry) {
+	case HPAGE_PMD:
+	case NORMAL_PMD:
+		mask = PMD_MASK;
+		size = PMD_SIZE;
+		break;
+	default:
+		BUILD_BUG();
+		break;
+	}
+
+	next = (old_addr + size) & mask;
+	/* even if next overflowed, extent below will be ok */
+	extent = next - old_addr;
+	if (extent > old_end - old_addr)
+		extent = old_end - old_addr;
+	next = (new_addr + size) & mask;
+	if (extent > next - new_addr)
+		extent = next - new_addr;
+	return extent;
+}
+
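+/*
+ * Install the source PTEs into the destination page table: each
+ * present source page gains an extra refcount and mapcount, so both
+ * processes end up mapping the same physical page.
+ */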
+static int attach_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+		unsigned long dst_addr, unsigned long src_addr, pmd_t *dst_pmdp,
+		pmd_t *src_pmdp, unsigned long len)
+{
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	pte_t *src_ptep, *dst_ptep, pte, orig_pte;
+	struct page *src_page, *orig_page;
+	spinlock_t *dst_ptl;
+	int rss[NR_MM_COUNTERS];
+	unsigned long src_addr_end = src_addr + len;
+
+	memset(rss, 0, sizeof(int) * NR_MM_COUNTERS);
+
+	src_ptep = pte_offset_map(src_pmdp, src_addr);
+	dst_ptep = pte_offset_map(dst_pmdp, dst_addr);
+	dst_ptl = pte_lockptr(dst_mm, dst_pmdp);
+	spin_lock_nested(dst_ptl, SINGLE_DEPTH_NESTING);
+
+	for (; src_addr < src_addr_end; src_ptep++, src_addr += PAGE_SIZE,
+	     dst_ptep++, dst_addr += PAGE_SIZE) {
+		/*
+		 * A special pte may have no corresponding struct page, so
+		 * skip special, absent and non-present entries.
+		 */
+		pte = ptep_get(src_ptep);
+		if (pte_none(pte) || pte_special(pte) || !pte_present(pte))
+			continue;
+
+		src_page = pte_page(pte);
+		atomic_inc(&src_page->_refcount);
+		atomic_inc(&src_page->_mapcount);
+		rss[MM_ANONPAGES]++;
+
+		/*
+		 * If the dst virtual address already has a page mapped,
+		 * drop the original page's mapcount and refcount before
+		 * setting up the new mapping.
+		 */
+		orig_pte = *dst_ptep;
+		if (!pte_none(orig_pte)) {
+			orig_page = pte_page(orig_pte);
+			atomic_dec(&orig_page->_refcount);
+			atomic_dec(&orig_page->_mapcount);
+			rss[MM_ANONPAGES]--;
+		}
+		set_pte_at(dst_mm, dst_addr, dst_ptep, pte);
+	}
+
+	/* dst_addr has advanced to the end of the range by now. */
+	flush_tlb_range(dst_vma, dst_addr - len, dst_addr);
+	zcopy_add_mm_rss_vec(dst_mm, rss);
+	spin_unlock(dst_ptl);
+	return 0;
+}
+
+static int attach_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		unsigned long dst_addr, unsigned long src_addr, unsigned long size)
+{
+	struct vm_area_struct *src_vma, *dst_vma;
+	unsigned long extent, src_addr_end;
+	pmd_t *src_pmd, *dst_pmd;
+	int ret = 0;
+
+	src_addr_end = src_addr + size;
+	src_vma = find_vma(src_mm, src_addr);
+	dst_vma = find_vma(dst_mm, dst_addr);
+	/* Make sure both VMAs still exist. */
+	if (!src_vma || !dst_vma)
+		return -ENOENT;
+
+	for (; src_addr < src_addr_end; src_addr += extent, dst_addr += extent) {
+		cond_resched();
+
+		extent = get_extent(NORMAL_PMD, src_addr, src_addr_end, dst_addr);
+		src_pmd = zcopy_get_pmd(src_mm, src_addr);
+		if (!src_pmd)
+			continue;
+		dst_pmd = zcopy_alloc_pmd(dst_mm, dst_addr);
+		if (!dst_pmd) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		if (pmd_trans_huge(*src_pmd)) {
+			/* Hugepage mappings are not supported. */
+			ret = -EOPNOTSUPP;
+			break;
+		} else if (is_swap_pmd(*src_pmd) || pmd_devmap(*src_pmd)) {
+			ret = -EOPNOTSUPP;
+			break;
+		}
+
+		if (zcopy_pte_alloc(dst_mm, dst_pmd)) {
+			ret = -ENOMEM;
+			break;
+		}
+
+		ret = attach_ptes(dst_vma, src_vma, dst_addr, src_addr, dst_pmd,
+				  src_pmd, extent);
+		if (ret < 0)
+			break;
+	}
+
+	return ret;
+}
+
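+/*
+ * Entry point for IO_ATTACH: validate the request, resolve both PIDs
+ * under ptrace access checks, pin the source pages so they cannot go
+ * away while the page tables are rewritten, then attach the range.
+ */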
+static int attach_pages(unsigned long dst_addr, unsigned long src_addr,
+		int dst_pid, int src_pid, unsigned long size)
+{
+	struct mm_struct *dst_mm, *src_mm;
+	struct task_struct *src_task, *dst_task;
+	struct page **process_pages;
+	unsigned long nr_pages;
+	unsigned int flags = 0;
+	int pinned_pages;
+	int locked = 1;
+	int ret;
+
+	ret = -EINVAL;
+	if (size == 0)
+		goto out;
+
+	if ((src_addr & (PAGE_SIZE - 1)) != 0 ||
+	    (dst_addr & (PAGE_SIZE - 1)) != 0 ||
+	    (size & (PAGE_SIZE - 1)) != 0) {
+		pr_err("Addresses and size must be PAGE_SIZE aligned\n");
+		goto out;
+	}
+
+	/* Both addresses must be in user space; kernel addresses are rejected. */
+	if (!is_ttbr0_addr(dst_addr) || !is_ttbr0_addr(src_addr)) {
+		pr_err("Kernel-space addresses are not allowed\n");
+		goto out;
+	}
+
+	ret = -ESRCH;
+	src_task = find_get_task_by_vpid(src_pid);
+	if (!src_task)
+		goto out;
+
+	src_mm = mm_access(src_task, PTRACE_MODE_ATTACH_REALCREDS);
+	if (!src_mm || IS_ERR(src_mm)) {
+		ret = IS_ERR(src_mm) ? PTR_ERR(src_mm) : -ESRCH;
+		if (ret == -EACCES)
+			ret = -EPERM;
+		goto put_src_task;
+	}
+
+	dst_task = find_get_task_by_vpid(dst_pid);
+	if (!dst_task)
+		goto put_src_mm;
+
+	dst_mm = mm_access(dst_task, PTRACE_MODE_ATTACH_REALCREDS);
+	if (!dst_mm || IS_ERR(dst_mm)) {
+		ret = IS_ERR(dst_mm) ? PTR_ERR(dst_mm) : -ESRCH;
+		if (ret == -EACCES)
+			ret = -EPERM;
+		goto put_dst_task;
+	}
+
+	if (src_mm == dst_mm) {
+		ret = -EINVAL;
+		pr_err("Attach is not allowed within the same address space\n");
+		goto put_dst_mm;
+	}
+
+	nr_pages = (src_addr + size - 1) / PAGE_SIZE - src_addr / PAGE_SIZE + 1;
+	process_pages = kvmalloc_array(nr_pages, sizeof(struct page *), GFP_KERNEL);
+	if (!process_pages) {
+		ret = -ENOMEM;
+		goto put_dst_mm;
+	}
+
+	mmap_read_lock(src_mm);
+	pinned_pages = pin_user_pages_remote(src_mm, src_addr, nr_pages,
+					     flags, process_pages,
+					     NULL, &locked);
+	if (locked)
+		mmap_read_unlock(src_mm);
+
+	if (pinned_pages <= 0) {
+		ret = -EFAULT;
+		goto free_pages_array;
+	}
+
+	ret = attach_page_range(dst_mm, src_mm, dst_addr, src_addr, size);
+
+	unpin_user_pages_dirty_lock(process_pages, pinned_pages, 0);
+
+free_pages_array:
+	kvfree(process_pages);
+put_dst_mm:
+	mmput(dst_mm);
+put_dst_task:
+	put_task_struct(dst_task);
+put_src_mm:
+	mmput(src_mm);
+put_src_task:
+	put_task_struct(src_task);
+out:
+	return ret;
+}
+
+static long zcopy_ioctl(struct file *file, unsigned int type, unsigned long ptr)
+{
+	long ret = 0;
+
+	switch (type) {
+	case IO_ATTACH:
+	{
+		struct zcopy_ioctl_pswap ctx;
+
+		if (copy_from_user(&ctx, (void __user *)ptr,
+				   sizeof(struct zcopy_ioctl_pswap))) {
+			pr_err("copy from user for attach failed\n");
+			ret = -EFAULT;
+			break;
+		}
+		ret = attach_pages(ctx.dst_addr, ctx.src_addr, ctx.dst_pid,
+				   ctx.src_pid, ctx.size);
+		break;
+	}
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
 static const struct file_operations zcopy_fops = {
 	.owner = THIS_MODULE,
 	.unlocked_ioctl = zcopy_ioctl,
 };
 
-int register_device_zcopy(void)
+#define REGISTER_CHECK(_var, _errstr) ({		\
+	int __ret = 0;					\
+	if (!(_var)) {					\
+		pr_warn("%s not found\n", _errstr);	\
+		__ret = -ENOENT;			\
+	}						\
+	__ret;						\
+})
+
+static int register_unexport_func(void)
+{
+	int ret;
+
+	kallsyms_lookup_name_funcp
+		= (unsigned long (*)(const char *))__kprobe_lookup_name("kallsyms_lookup_name");
+	ret = REGISTER_CHECK(kallsyms_lookup_name_funcp, "kallsyms_lookup_name");
+	if (ret)
+		goto out;
+
+	__zcopy_pte_alloc
+		= (int (*)(struct mm_struct *, pmd_t *))__kallsyms_lookup_name("__pte_alloc");
+	ret = REGISTER_CHECK(__zcopy_pte_alloc, "__pte_alloc");
+	if (ret)
+		goto out;
+
+	__zcopy_pmd_alloc
+		= (int (*)(struct mm_struct *, pud_t *, unsigned long))
+			__kallsyms_lookup_name("__pmd_alloc");
+	ret = REGISTER_CHECK(__zcopy_pmd_alloc, "__pmd_alloc");
+	if (ret)
+		goto out;
+
+	__zcopy_pud_alloc
+		= (int (*)(struct mm_struct *, p4d_t *, unsigned long))
+			__kallsyms_lookup_name("__pud_alloc");
+	ret = REGISTER_CHECK(__zcopy_pud_alloc, "__pud_alloc");
+
+out:
+	return ret;
+}
+
+static int register_device_zcopy(void)
 {
 	int ret;
 
@@ -74,7 +531,7 @@ int register_device_zcopy(void)
 	return ret;
 }
 
-void unregister_device_zcopy(void)
+static void unregister_device_zcopy(void)
 {
 	device_destroy(z_cdev.dev_class, MKDEV(z_cdev.major, 0));
 	class_destroy(z_cdev.dev_class);
@@ -86,13 +543,20 @@ static int __init zcopy_init(void)
 {
 	int ret;
 
+	ret = register_unexport_func();
+	if (ret) {
+		pr_err("register_unexport_func failed\n");
+		goto out;
+	}
+
 	ret = register_device_zcopy();
 	if (ret) {
 		pr_err("register_device_zcopy failed\n");
-		return -1;
+		goto out;
 	}
 
-	return 0;
+out:
+	return ret;
 }
 
 static void __exit zcopy_exit(void)
-- 
2.25.1
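
For reviewers: a minimal user-space sketch (not part of the patch) showing how
the new ioctl might be driven. The "/dev/zcopy" node name is an assumption,
since the device_create() call that names the node is outside this diff; the
command is passed as the raw value IO_ATTACH (1), matching the driver's
switch (type), and the structure must match the kernel's
struct zcopy_ioctl_pswap layout.

/* Hypothetical test client for IO_ATTACH; /dev/zcopy is an assumed node name. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

#define IO_ATTACH 1			/* matches the driver's switch (type) */

struct zcopy_ioctl_pswap {		/* must match the kernel-side layout */
	unsigned long src_addr;
	unsigned long dst_addr;
	int src_pid;
	int dst_pid;
	unsigned long size;
};

int main(int argc, char **argv)
{
	struct zcopy_ioctl_pswap ctx;
	int fd;

	if (argc != 6) {
		fprintf(stderr, "usage: %s <src_pid> <src_addr> <dst_pid> <dst_addr> <size>\n",
			argv[0]);
		return 1;
	}

	/* src_addr, dst_addr and size must all be PAGE_SIZE aligned. */
	ctx.src_pid  = atoi(argv[1]);
	ctx.src_addr = strtoul(argv[2], NULL, 0);
	ctx.dst_pid  = atoi(argv[3]);
	ctx.dst_addr = strtoul(argv[4], NULL, 0);
	ctx.size     = strtoul(argv[5], NULL, 0);

	fd = open("/dev/zcopy", O_RDWR);	/* assumed device node */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	if (ioctl(fd, IO_ATTACH, &ctx) < 0)
		perror("ioctl(IO_ATTACH)");

	close(fd);
	return 0;
}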