From: Liu Zixian <liuzixian4@huawei.com>
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4X1VR
CVE: NA
------------

Copy file content to the huge page during a hugetlb memory fault.
The file is recorded in vm_area_struct (using one reserved member).
Gdb and perf need the file name to resolve symbols, so we report the
recorded file in procfs and in perf record. Glibc can use this feature
to load libraries into huge pages, so add MAP_FILE_HUGETLB to the mmap
syscall.
Signed-off-by: Liu Zixian <liuzixian4@huawei.com>
Reviewed-by: Zhou Kang <zhoukang7@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 fs/Kconfig                             | 12 ++++++++
 fs/proc/task_mmu.c                     |  5 ++++
 include/linux/mm_types.h               |  4 +++
 include/uapi/asm-generic/mman-common.h |  1 +
 kernel/events/core.c                   |  7 +++++
 kernel/fork.c                          |  5 ++++
 mm/hugetlb.c                           | 25 ++++++++++++++++
 mm/mmap.c                              | 20 +++++++++++++
 mm/util.c                              | 40 ++++++++++++++++++++++++++
 9 files changed, 119 insertions(+)
diff --git a/fs/Kconfig b/fs/Kconfig index aa097ca64ef6..cde0ec856dfd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -270,6 +270,18 @@ config DYNAMIC_HUGETLB pages automatically. The tasks in the memcg prefer to alloc dynamic hugepage.
+config ENHANCED_HUGETLB_MMAP + bool "enhanced hugetlb mmap" + default n + depends on HUGETLBFS + help + Add private file mmap for hugetlb. + This feature adds vm_actual_file in vma to record the original file and + copies file contents to hugetlb pages during page fault. + Procfs and perf record will show file name of vm_actual_file. + Hugetlb is useful for optimizing TLB miss rate, and this feature is + aimed to extend its usage. + config MEMFD_CREATE def_bool TMPFS || HUGETLBFS
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7b8a513d9f69..391b967fcfbf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -280,6 +280,11 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) dev_t dev = 0; const char *name = NULL;
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (vma->vm_actual_file) + file = vma->vm_actual_file; +#endif + if (file) { struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c22e294f083..9de02b116185 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -376,7 +376,11 @@ struct vm_area_struct { #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#if defined(CONFIG_ENHANCED_HUGETLB_MMAP) && !defined(__GENKSYMS__) + KABI_USE(1, struct file *vm_actual_file); +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index e75b65364dce..2a396d81aca6 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -31,6 +31,7 @@ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */
#define MAP_REPLACE 0x1000000 +#define MAP_FILE_HUGETLB 0x2000000 /* hugetlb private file map support */
#define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 68dc8a8e7990..bbc770d8cbdb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8119,6 +8119,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) flags |= MAP_LOCKED; if (is_vm_hugetlb_page(vma)) flags |= MAP_HUGETLB; +#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (vma->vm_actual_file) { + /* perf will ignore hugetlb vma, so remove this flag */ + flags &= ~MAP_HUGETLB; + file = vma->vm_actual_file; + } +#endif
if (file) { struct inode *inode; diff --git a/kernel/fork.c b/kernel/fork.c index 0fb86b65ae60..c8ec029e158a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -571,6 +571,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, i_mmap_unlock_write(mapping); }
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (tmp->vm_actual_file) + get_file(tmp->vm_actual_file); +#endif + /* * Clear hugetlb-related page reserves for children. This only * affects MAP_PRIVATE mappings. Faults generated by the child diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c5168c7f282a..817ae73d40bd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4536,6 +4536,20 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, i_mmap_unlock_write(mapping); }
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP +static int read_actual_file(struct page *page, struct vm_area_struct *vma, + loff_t *off, size_t size) +{ + void *kaddr; + unsigned long read_size = 0; + + kaddr = kmap(page); + read_size = kernel_read(vma->vm_actual_file, kaddr, size, off); + kunmap(page); + return IS_ERR_VALUE(read_size) ? read_size : 0; +} +#endif + /* * Hugetlb_cow() should be called with page lock of the original hugepage held. * Called with hugetlb_instantiation_mutex held and pte_page locked so we @@ -4837,6 +4851,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); +#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (vma->vm_actual_file) { + loff_t off = haddr - vma->vm_start + + (vma->vm_pgoff << PAGE_SHIFT); + size_t page_size = huge_page_size(h); + + ret = read_actual_file(page, vma, &off, page_size); + if (ret) + goto out; + } +#endif __SetPageUptodate(page); new_page = true;
diff --git a/mm/mmap.c b/mm/mmap.c index 5489d70db84e..515d668e1301 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -188,6 +188,10 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) vma->vm_ops->close(vma); if (vma->vm_file) fput(vma->vm_file); +#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (vma->vm_actual_file) + fput(vma->vm_actual_file); +#endif mpol_put(vma_policy(vma)); sp_area_drop(vma); vm_area_free(vma); @@ -1849,6 +1853,17 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, return -EBADF; if (is_file_hugepages(file)) { len = ALIGN(len, huge_page_size(hstate_file(file))); +#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + /* + * glibc can use this flag to load libraries, + * a similar feature of exec_hugetlb. + */ + } else if (unlikely(flags & MAP_FILE_HUGETLB)) { + if (!(flags & MAP_PRIVATE)) { + retval = -EINVAL; + goto out_fput; + } +#endif } else if (unlikely(flags & MAP_HUGETLB)) { retval = -EINVAL; goto out_fput; @@ -3047,6 +3062,11 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_file) get_file(new->vm_file);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP + if (new->vm_actual_file) + get_file(new->vm_actual_file); +#endif + if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new);
diff --git a/mm/util.c b/mm/util.c index 67b350f4ffdc..05efa0b50be7 100644 --- a/mm/util.c +++ b/mm/util.c @@ -496,6 +496,31 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc) } EXPORT_SYMBOL_GPL(account_locked_vm);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+/*
+ * Create the anonymous hugetlb file that backs a MAP_FILE_HUGETLB
+ * mapping.  'size' must cover pgoff + len so do_mmap() can map at the
+ * caller's pgoff.  Returns the file or an ERR_PTR.
+ */
+static struct file *prepare_hugetlb_mmap(unsigned long flags, unsigned long size)
+{
+	int page_size_log = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK;
+	struct user_struct *user = NULL;
+
+	return hugetlb_file_setup(HUGETLB_ANON_FILE, size, VM_NORESERVE, &user,
+				  HUGETLB_ANONHUGE_INODE, page_size_log);
+}
+
+/*
+ * Record the original file in the freshly created VMA so procfs/perf
+ * can resolve symbols.  Consumes the reference on huge_file.
+ */
+static unsigned long finish_hugetlb_mmap(unsigned long addr, struct file *actual_file,
+					 struct file *huge_file)
+{
+	struct vm_area_struct *vma;
+
+	fput(huge_file);
+	vma = find_vma(current->mm, addr);
+	/* find_vma() returns the first VMA *ending* above addr; make sure
+	 * it actually contains addr before touching it */
+	if (!vma || vma->vm_start > addr)
+		return -EINVAL;
+	vma->vm_actual_file = get_file(actual_file);
+
+	return addr;
+}
+#endif
+
 unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
@@ -504,13 +529,37 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 	struct mm_struct *mm = current->mm;
 	unsigned long populate;
 	LIST_HEAD(uf);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	struct file *actual_file = NULL;
+#endif

 	ret = security_mmap_file(file, prot, flag);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+	/*
+	 * Only substitute the hugetlb backing file after the security check
+	 * on the *actual* file has passed; creating it unconditionally would
+	 * leak the new file reference when the check fails.
+	 */
+	if (!ret && (flag & MAP_FILE_HUGETLB)) {
+		actual_file = file;
+		file = prepare_hugetlb_mmap(flag, len + (pgoff << PAGE_SHIFT));
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+	}
+#endif
 	if (!ret) {
-		if (mmap_write_lock_killable(mm))
+		if (mmap_write_lock_killable(mm)) {
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+			if (actual_file)
+				fput(file);	/* drop the unused hugetlb file */
+#endif
 			return -EINTR;
+		}
 		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
			      &uf);
+#ifdef CONFIG_ENHANCED_HUGETLB_MMAP
+		if (actual_file) {
+			/*
+			 * Gate on do_mmap()'s result 'ret', not on the
+			 * caller's address hint 'addr' (the original code's
+			 * IS_ERR_VALUE(addr) was almost always false, so
+			 * finish ran even after a failed do_mmap()).  On
+			 * failure we must drop the hugetlb file ourselves;
+			 * on success finish_hugetlb_mmap() drops it.
+			 */
+			if (!IS_ERR_VALUE(ret))
+				ret = finish_hugetlb_mmap(ret, actual_file, file);
+			else
+				fput(file);
+		}
+#endif
 		mmap_write_unlock(mm);
 		userfaultfd_unmap_complete(mm, &uf);
 		if (populate)