
From: Nikita Panov <panov.nikita@huawei.com>

kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBOJU2

-------------------------------------------------

To support generic vmalloc, several modifications are required. If the
allocated region belongs to the non-replicated part of the translation
table, the normal vmalloc mechanism is used. If the allocated region
belongs to the replicated part, the mapping must be created in the
replicated table of every replica node. A region may also be replicated
after its initialization, for example to support replication of text
and ro-data of loadable kernel modules.

Acked-by: Alexander Grubnikov <alexander.grubnikov@huawei.com>
Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
Acked-by: Denis Darvish <darvish.denis@huawei.com>
Co-developed-by: Artem Kuzin <artem.kuzin@huawei.com>
Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com>
Co-developed-by: Nikita Panov <panov.nikita@huawei.com>
Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
---
 include/linux/mm.h      |   2 +
 include/linux/vmalloc.h |  19 ++
 mm/memory.c             |  60 +++++
 mm/vmalloc.c            | 469 ++++++++++++++++++++++++++++++++--------
 4 files changed, 457 insertions(+), 93 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9da5e64c23d9..49d7e2b2cdff 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1174,6 +1174,8 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags,
 
 struct page *vmalloc_to_page(const void *addr);
 unsigned long vmalloc_to_pfn(const void *addr);
+struct page *walk_to_page_node(int nid, const void *addr);
+
 /*
  * Determine if an address is within the vmalloc range
  *
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 333f5a67d171..c0a29ec901f7 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -29,6 +29,10 @@ struct iov_iter;	/* in uio.h */
 #define VM_MAP_PUT_PAGES	0x00000200	/* put pages and free array in vfree */
 #define VM_ALLOW_HUGE_VMAP	0x00000400	/* Allow for huge pages on archs with HAVE_ARCH_HUGE_VMALLOC */
 
+#ifdef CONFIG_KERNEL_REPLICATION
+#define VM_NUMA_SHARED		0x00002000	/* Pages shared between per-NUMA-node translation tables */
+#endif
+
 #if (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)) && \
 	!defined(CONFIG_KASAN_VMALLOC)
 #define VM_DEFER_KMEMLEAK	0x00000800	/* defer kmemleak object creation */
@@ -65,6 +69,10 @@ struct vm_struct {
 	unsigned int		nr_pages;
 	phys_addr_t		phys_addr;
 	const void		*caller;
+#ifdef CONFIG_KERNEL_REPLICATION
+	KABI_EXTEND(int node)
+	KABI_EXTEND(bool replicated)
+#endif
 };
 
 struct vmap_area {
@@ -156,6 +164,17 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
 			pgprot_t prot, unsigned long vm_flags, int node,
 			const void *caller) __alloc_size(1);
+
+#ifdef CONFIG_KERNEL_REPLICATION
+/*
+ * Do not use this function unless you understand exactly what it does.
+ * Use it only on areas allocated with vm_flags |= VM_NUMA_SHARED.
+ */
+int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask,
+			pgprot_t prot, unsigned long vm_flags);
+void vunmap_range_replicas(unsigned long addr, unsigned long end);
+#endif
+
 void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
 		int node, const void *caller) __alloc_size(1);
 void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1);
diff --git a/mm/memory.c b/mm/memory.c
index 25ac55f22b08..f05772babfe0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6863,3 +6863,63 @@ void ptlock_free(struct ptdesc *ptdesc)
 	kmem_cache_free(page_ptl_cachep, ptdesc->ptl);
 }
 #endif
+
+/*
+ * Walk the translation table replicated on node @nid. If kernel
+ * replication is disabled, or the text is not replicated yet, the
+ * value of @nid is not used.
+ */
+struct page *walk_to_page_node(int nid, const void *vmalloc_addr)
+{
+	unsigned long addr = (unsigned long)vmalloc_addr;
+	struct page *page = NULL;
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep, pte;
+
+	if (is_text_replicated())
+		pgd = pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr);
+	else
+		pgd = pgd_offset_pgd(init_mm.pgd, addr);
+
+	if (pgd_none(*pgd))
+		return NULL;
+	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+		return NULL; /* XXX: no allowance for huge pgd */
+	if (WARN_ON_ONCE(pgd_bad(*pgd)))
+		return NULL;
+
+	p4d = p4d_offset(pgd, addr);
+	if (p4d_none(*p4d))
+		return NULL;
+	if (p4d_leaf(*p4d))
+		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(p4d_bad(*p4d)))
+		return NULL;
+
+	pud = pud_offset(p4d, addr);
+	if (pud_none(*pud))
+		return NULL;
+	if (pud_leaf(*pud))
+		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pud_bad(*pud)))
+		return NULL;
+
+	pmd = pmd_offset(pud, addr);
+	if (pmd_none(*pmd))
+		return NULL;
+	if (pmd_leaf(*pmd))
+		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+	if (WARN_ON_ONCE(pmd_bad(*pmd)))
+		return NULL;
+
+	ptep = pte_offset_map(pmd, addr);
+	pte = *ptep;
+	if (pte_present(pte))
+		page = pte_page(pte);
+	pte_unmap(ptep);
+
+	return page;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index cb0951fea238..a136e86e6480 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -10,6 +10,7 @@
 
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <linux/numa_kernel_replication.h>
 #include <linux/module.h>
 #include <linux/highmem.h>
 #include <linux/sched/signal.h>
@@ -420,18 +421,17 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
 }
 
 /*
- * vunmap_range_noflush is similar to vunmap_range, but does not
- * flush caches or TLBs.
+ * vunmap_range_noflush_pgd is similar to vunmap_range, but does not
+ * flush caches or TLBs, and is able to work at pgd granularity.
  *
  * The caller is responsible for calling flush_cache_vmap() before calling
  * this function, and flush_tlb_kernel_range after it has returned
  * successfully (and before the addresses are expected to cause a page fault
  * or be re-mapped for something else, if TLB flushes are being delayed or
  * coalesced).
- *
- * This is an internal function only. Do not use outside mm/.
  */
-void __vunmap_range_noflush(unsigned long start, unsigned long end)
+static void vunmap_range_noflush_pgd(pgd_t *pgtable,
+				     unsigned long start, unsigned long end)
 {
 	unsigned long next;
 	pgd_t *pgd;
@@ -439,7 +439,7 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
 	pgtbl_mod_mask mask = 0;
 
 	BUG_ON(addr >= end);
-	pgd = pgd_offset_k(addr);
+	pgd = pgd_offset_pgd(pgtable, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_bad(*pgd))
@@ -453,6 +453,17 @@ void __vunmap_range_noflush(unsigned long start, unsigned long end)
 		arch_sync_kernel_mappings(start, end);
 }
 
+/*
+ * vunmap_range_noflush is similar to vunmap_range_noflush_pgd, but works
+ * only on init_mm.pgd.
+ *
+ * This is an internal function only. Do not use outside mm/.
+ */
+void __vunmap_range_noflush(unsigned long start, unsigned long end)
+{
+	vunmap_range_noflush_pgd(init_mm.pgd, start, end);
+}
+
 void vunmap_range_noflush(unsigned long start, unsigned long end)
 {
 	kmsan_vunmap_range_noflush(start, end);
@@ -475,6 +486,18 @@ void vunmap_range(unsigned long addr, unsigned long end)
 	flush_tlb_kernel_range(addr, end);
 }
 
+#ifdef CONFIG_KERNEL_REPLICATION
+void vunmap_range_replicas(unsigned long addr, unsigned long end)
+{
+	int nid;
+
+	flush_cache_vunmap(addr, end);
+	for_each_memory_node(nid)
+		vunmap_range_noflush_pgd(init_mm.pgd_numa[nid], addr, end);
+	flush_tlb_kernel_range(addr, end);
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
 		pgtbl_mod_mask *mask)
@@ -560,7 +583,8 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
 	return 0;
 }
 
-static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
+static int vmap_small_pages_range_noflush_pgd(pgd_t *pgtable,
+		unsigned long addr, unsigned long end,
 		pgprot_t prot, struct page **pages)
 {
 	unsigned long start = addr;
@@ -571,7 +595,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
 	pgtbl_mod_mask mask = 0;
 
 	BUG_ON(addr >= end);
-	pgd = pgd_offset_k(addr);
+	pgd = pgd_offset_pgd(pgtable, addr);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_bad(*pgd))
@@ -587,8 +611,38 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
 	return 0;
 }
 
+static int vmap_range_noflush_pgd(pgd_t *pgtable,
+		unsigned long addr, unsigned long end,
+		phys_addr_t phys_addr, pgprot_t prot,
+		unsigned int max_page_shift)
+{
+	pgd_t *pgd;
+	unsigned long start;
+	unsigned long next;
+	int err;
+	pgtbl_mod_mask mask = 0;
+
+	might_sleep();
+	BUG_ON(addr >= end);
+
+	start = addr;
+	pgd = pgd_offset_pgd(pgtable, addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
+					max_page_shift, &mask);
+		if (err)
+			break;
+	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+		arch_sync_kernel_mappings(start, end);
+
+	return err;
+}
+
 /*
- * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
+ * vmap_pages_range_noflush_pgd is similar to vmap_pages_range, but does not
  * flush caches.
  *
  * The caller is responsible for calling flush_cache_vmap() after this
@@ -596,8 +650,10 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
 *
 * This is an internal function only. Do not use outside mm/.
 */
-int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
-		pgprot_t prot, struct page **pages, unsigned int page_shift)
+static int vmap_pages_range_noflush_pgd(pgd_t *pgtable,
+		unsigned long addr, unsigned long end,
+		pgprot_t prot, struct page **pages,
+		unsigned int page_shift)
 {
 	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;
 
@@ -605,12 +661,13 @@ int __vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 
 	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
 			page_shift == PAGE_SHIFT)
-		return vmap_small_pages_range_noflush(addr, end, prot, pages);
+		return vmap_small_pages_range_noflush_pgd(pgtable, addr, end,
+							  prot, pages);
 
 	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
 		int err;
 
-		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
+		err = vmap_range_noflush_pgd(pgtable, addr, addr + (1UL << page_shift),
 					page_to_phys(pages[i]), prot,
 					page_shift);
 		if (err)
@@ -630,7 +687,8 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
 
 	if (ret)
 		return ret;
-	return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
+
+	return vmap_pages_range_noflush_pgd(init_mm.pgd, addr, end, prot, pages, page_shift);
 }
 
 /**
@@ -730,57 +788,12 @@ EXPORT_SYMBOL_GPL(is_vmalloc_or_module_addr);
  */
 struct page *vmalloc_to_page(const void *vmalloc_addr)
 {
-	unsigned long addr = (unsigned long) vmalloc_addr;
-	struct page *page = NULL;
-	pgd_t *pgd = pgd_offset_k(addr);
-	p4d_t *p4d;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *ptep, pte;
-
 	/*
 	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
 	 * architectures that do not vmalloc module space
 	 */
 	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
-
-	if (pgd_none(*pgd))
-		return NULL;
-	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
-		return NULL; /* XXX: no allowance for huge pgd */
-	if (WARN_ON_ONCE(pgd_bad(*pgd)))
-		return NULL;
-
-	p4d = p4d_offset(pgd, addr);
-	if (p4d_none(*p4d))
-		return NULL;
-	if (p4d_leaf(*p4d))
-		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
-	if (WARN_ON_ONCE(p4d_bad(*p4d)))
-		return NULL;
-
-	pud = pud_offset(p4d, addr);
-	if (pud_none(*pud))
-		return NULL;
-	if (pud_leaf(*pud))
-		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
-	if (WARN_ON_ONCE(pud_bad(*pud)))
-		return NULL;
-
-	pmd = pmd_offset(pud, addr);
-	if (pmd_none(*pmd))
-		return NULL;
-	if (pmd_leaf(*pmd))
-		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
-	if (WARN_ON_ONCE(pmd_bad(*pmd)))
-		return NULL;
-
-	ptep = pte_offset_kernel(pmd, addr);
-	pte = ptep_get(ptep);
-	if (pte_present(pte))
-		page = pte_page(pte);
-
-	return page;
+	return walk_to_page_node(first_memory_node, vmalloc_addr);
 }
 EXPORT_SYMBOL(vmalloc_to_page);
 
@@ -2357,7 +2370,22 @@ static void free_vmap_area_noflush(struct vmap_area *va)
 static void free_unmap_vmap_area(struct vmap_area *va)
 {
 	flush_cache_vunmap(va->va_start, va->va_end);
+#ifdef CONFIG_KERNEL_REPLICATION
+	if (numa_addr_has_replica((void *)va->va_start)) {
+		int node;
+		/*
+		 * In some scenarios we might clear empty
+		 * entries here, which is totally fine.
+		 */
+		for_each_memory_node(node)
+			vunmap_range_noflush_pgd(init_mm.pgd_numa[node],
+						 va->va_start, va->va_end);
+	} else {
+		vunmap_range_noflush(va->va_start, va->va_end);
+	}
+#else
 	vunmap_range_noflush(va->va_start, va->va_end);
+#endif /* CONFIG_KERNEL_REPLICATION */
 	if (debug_pagealloc_enabled_static())
 		flush_tlb_kernel_range(va->va_start, va->va_end);
 
@@ -3216,16 +3244,73 @@ struct vm_struct *remove_vm_area(const void *addr)
 	return vm;
 }
 
+#ifdef CONFIG_KERNEL_REPLICATION
+static inline void set_direct_map_page_replicas(const struct vm_struct *area,
+			struct page *page,
+			int (*set_direct_map)(struct page *page))
+{
+	if (area->replicated) {
+		struct page *cursor;
+
+		list_for_each_entry(cursor, &page->lru, lru) {
+			if (page_address(cursor))
+				set_direct_map(cursor);
+		}
+	}
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 static inline void set_area_direct_map(const struct vm_struct *area,
 				       int (*set_direct_map)(struct page *page))
 {
 	int i;
 
 	/* HUGE_VMALLOC passes small pages to set_direct_map */
-	for (i = 0; i < area->nr_pages; i++)
+	for (i = 0; i < area->nr_pages; i++) {
 		if (page_address(area->pages[i]))
 			set_direct_map(area->pages[i]);
+#ifdef CONFIG_KERNEL_REPLICATION
+		set_direct_map_page_replicas(area,
+				area->pages[i], set_direct_map);
+#endif /* CONFIG_KERNEL_REPLICATION */
+	}
+}
+
+#ifdef CONFIG_KERNEL_REPLICATION
+static void vm_account_replicated_range(struct vm_struct *area,
+			struct page *page,
+			unsigned long *s,
+			unsigned long *e,
+			int *flush)
+{
+	int flush_dmap = 0;
+	unsigned long start = ULONG_MAX, end = 0;
+	unsigned int page_order = vm_area_page_order(area);
+
+	if (area->replicated) {
+		struct page *cursor;
+
+		list_for_each_entry(cursor, &page->lru, lru) {
+			unsigned long addr = (unsigned long)page_address(cursor);
+
+			if (addr) {
+				unsigned long page_size;
+
+				page_size = PAGE_SIZE << page_order;
+				start = min(addr, start);
+				end = max(addr + page_size, end);
+				flush_dmap = 1;
+			}
+		}
+	}
+
+	if (flush_dmap)
+		*flush = flush_dmap;
+
+	*s = start;
+	*e = end;
 }
+#endif /* CONFIG_KERNEL_REPLICATION */
 
 /*
  * Flush the vm mapping and reset the direct map.
@@ -3252,6 +3337,10 @@ static void vm_reset_perms(struct vm_struct *area)
 			end = max(addr + page_size, end);
 			flush_dmap = 1;
 		}
+#ifdef CONFIG_KERNEL_REPLICATION
+		vm_account_replicated_range(area, area->pages[i],
+					    &start, &end, &flush_dmap);
+#endif /* CONFIG_KERNEL_REPLICATION */
 	}
 
 	/*
@@ -3297,6 +3386,28 @@ void vfree_atomic(const void *addr)
 	schedule_work(&p->wq);
 }
 
+#ifdef CONFIG_KERNEL_REPLICATION
+static void vfree_page_replicas(struct vm_struct *area, struct page *page)
+{
+	if (area->replicated) {
+		struct page *cursor, *tmp;
+
+		list_for_each_entry_safe(cursor, tmp, &page->lru, lru) {
+			BUG_ON(!cursor);
+
+			list_del(&cursor->lru);
+			mod_memcg_page_state(cursor, MEMCG_VMALLOC, -1);
+			/*
+			 * High-order allocs for huge vmallocs are split, so
+			 * can be freed as an array of order-0 allocations.
+			 */
+			__free_pages(cursor, 0);
+			cond_resched();
+		}
+	}
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 /**
  * vfree - Release memory allocated by vmalloc()
  * @addr:  Memory base address
@@ -3343,6 +3454,9 @@ void vfree(const void *addr)
 	for (i = 0; i < vm->nr_pages; i++) {
 		struct page *page = vm->pages[i];
 
+#ifdef CONFIG_KERNEL_REPLICATION
+		vfree_page_replicas(vm, page);
+#endif /* CONFIG_KERNEL_REPLICATION */
 		BUG_ON(!page);
 		mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
 		/*
@@ -3600,26 +3714,91 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
 	return nr_allocated;
 }
 
+static int vmalloc_map_area_pages_pgd(unsigned long addr,
+			struct page **pages, unsigned long size,
+			gfp_t gfp_mask, pgprot_t prot,
+			unsigned int page_shift, pgd_t *pgd)
+{
+	int ret = 0;
+	unsigned int flags;
+	bool nofail = gfp_mask & __GFP_NOFAIL;
+
+	/*
+	 * page tables allocations ignore external gfp mask, enforce it
+	 * by the scope API
+	 */
+	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+		flags = memalloc_nofs_save();
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+		flags = memalloc_noio_save();
+
+	do {
+		ret = vmap_pages_range_noflush_pgd(pgd, addr, addr + size,
+				prot, pages, page_shift);
+		if (nofail && (ret < 0))
+			schedule_timeout_uninterruptible(1);
+	} while (nofail && (ret < 0));
+
+	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
+		memalloc_nofs_restore(flags);
+	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
+		memalloc_noio_restore(flags);
+
+	if (ret < 0) {
+		warn_alloc(gfp_mask, NULL,
+			"vmalloc error: size %lu, failed to map pages",
+			size);
+	}
+
+	return ret;
+}
+
+static int vmalloc_map_area_pages(unsigned long addr, unsigned long size,
+			struct vm_struct *area,
+			gfp_t gfp_mask, pgprot_t prot,
+			unsigned int page_shift)
+{
+	int ret;
+#ifdef CONFIG_KERNEL_REPLICATION
+	int nid;
+
+	if (area->flags & VM_NUMA_SHARED) {
+		for_each_memory_node(nid) {
+			pgd_t *pgd = per_node_pgd(&init_mm, nid);
+
+			ret = vmalloc_map_area_pages_pgd(addr, area->pages, size,
+					gfp_mask, prot, page_shift, pgd);
+			if (ret)
+				return ret;
+		}
+	} else {
+		ret = vmalloc_map_area_pages_pgd(addr, area->pages, size,
+				gfp_mask, prot, page_shift, init_mm.pgd);
+	}
+#else
+	ret = vmalloc_map_area_pages_pgd(addr, area->pages, size,
+			gfp_mask, prot, page_shift, init_mm.pgd);
+#endif /* CONFIG_KERNEL_REPLICATION */
+	return ret;
+}
+
 static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 				 pgprot_t prot, unsigned int page_shift,
 				 int node)
 {
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	bool nofail = gfp_mask & __GFP_NOFAIL;
 	unsigned long addr = (unsigned long)area->addr;
 	unsigned long size = get_vm_area_size(area);
 	unsigned long array_size;
 	unsigned int nr_small_pages = size >> PAGE_SHIFT;
 	unsigned int page_order;
-	unsigned int flags;
-	int ret;
+	int ret = 0;
 
 	array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
 
 	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
 		gfp_mask |= __GFP_HIGHMEM;
 
-	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
 		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
 					area->caller);
@@ -3631,8 +3810,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		warn_alloc(gfp_mask, NULL,
 			"vmalloc error: size %lu, failed to allocated page array size %lu",
 			nr_small_pages * PAGE_SIZE, array_size);
-		free_vm_area(area);
-		return NULL;
+		goto fail;
 	}
 
 	set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
@@ -3671,33 +3849,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		goto fail;
 	}
 
-	/*
-	 * page tables allocations ignore external gfp mask, enforce it
-	 * by the scope API
-	 */
-	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
-		flags = memalloc_nofs_save();
-	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
-		flags = memalloc_noio_save();
-
-	do {
-		ret = vmap_pages_range(addr, addr + size, prot, area->pages,
-			page_shift);
-		if (nofail && (ret < 0))
-			schedule_timeout_uninterruptible(1);
-	} while (nofail && (ret < 0));
-
-	if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
-		memalloc_nofs_restore(flags);
-	else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
-		memalloc_noio_restore(flags);
-
-	if (ret < 0) {
-		warn_alloc(gfp_mask, NULL,
-			"vmalloc error: size %lu, failed to map pages",
-			area->nr_pages * PAGE_SIZE);
+	ret = vmalloc_map_area_pages(addr, size, area, gfp_mask, prot, page_shift);
+	if (ret)
 		goto fail;
-	}
+
 	flush_cache_vmap(addr, addr + size);
 
 	return area->addr;
@@ -3797,6 +3952,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 		goto fail;
 	}
 
+#ifdef CONFIG_KERNEL_REPLICATION
+	if (numa_addr_has_replica(area->addr))
+		vm_flags |= VM_NUMA_SHARED;
+	area->node = node;
+#endif
 	/*
 	 * Prepare arguments for __vmalloc_area_node() and
 	 * kasan_unpoison_vmalloc().
@@ -3891,6 +4051,129 @@ void *__vmalloc_node(unsigned long size, unsigned long align,
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
 				gfp_mask, PAGE_KERNEL, 0, node, caller);
 }
+
+#ifdef CONFIG_KERNEL_REPLICATION
+static void numa_replicate_page_range(struct page **src, struct page **dst, int nr_pages)
+{
+	int i;
+	void *from, *to;
+
+	for (i = 0; i < nr_pages; i++) {
+		from = kmap(src[i]);
+		to = kmap(dst[i]);
+
+		copy_page(to, from);
+
+		kunmap(src[i]);
+		kunmap(dst[i]);
+	}
+}
+
+int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask,
+			pgprot_t prot, unsigned long vm_flags)
+{
+	int i, ret, node = 0;
+	struct vm_struct *area;
+	unsigned int page_order;
+	unsigned int nr_allocated;
+	struct page **pages;
+	unsigned long area_start, area_end;
+	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
+	unsigned long array_size;
+
+	gfp_mask |= __GFP_NOWARN;
+	if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	if (unlikely(!numa_addr_has_replica(addr)))
+		return -EINVAL;
+
+	area = find_vm_area(addr);
+	if (unlikely(!area))
+		return -ENOENT;
+
+	if (area->node == NUMA_NO_NODE)
+		return -EINVAL;
+
+	array_size = sizeof(struct page *) * area->nr_pages;
+	if (array_size > PAGE_SIZE)
+		pages = __vmalloc(array_size, nested_gfp);
+	else
+		pages = kmalloc(array_size, nested_gfp);
+
+	if (!pages)
+		return -ENOMEM;
+
+	page_order = vm_area_page_order(area);
+	for (i = 0; i < area->nr_pages; i++)
+		INIT_LIST_HEAD(&area->pages[i]->lru);
+
+	area_start = (unsigned long)area->addr;
+	area_end = (unsigned long)(area->addr + area->nr_pages * PAGE_SIZE);
+
+	for_each_memory_node(node) {
+		if (area->node == node)
+			continue;
+
+		nr_allocated = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+				node, page_order, area->nr_pages, pages);
+		if (nr_allocated != area->nr_pages)
+			goto fail_alloc_pages;
+
+		for (i = 0; i < area->nr_pages; i++)
+			list_add(&pages[i]->lru, &area->pages[i]->lru);
+
+		vunmap_range_noflush_pgd(init_mm.pgd_numa[node],
+					 area_start, area_end);
+
+		/*
+		 * This should not fail: the only possible errors are a failed
+		 * page table allocation or a non-empty entry, both unrealistic
+		 * because we just cleared the entries in existing tables.
+		 */
+		ret = vmalloc_map_area_pages_pgd(area_start, pages,
+				nr_allocated * PAGE_SIZE,
+				gfp_mask, prot, PAGE_SHIFT,
+				per_node_pgd(&init_mm, node));
+		if (ret != 0)
+			goto fail_map_pages;
+
+		atomic_long_add(nr_allocated, &nr_vmalloc_pages);
+		if (gfp_mask & __GFP_ACCOUNT) {
+			for (i = 0; i < nr_allocated; i++)
+				mod_memcg_page_state(pages[i], MEMCG_VMALLOC, 1);
+		}
+		numa_replicate_page_range(area->pages, pages, area->nr_pages);
+
+		for (i = 0; i < area->nr_pages; i++)
+			pages[i] = NULL;
+	}
+	kvfree(pages);
+
+	flush_tlb_kernel_range(area_start, area_end);
+	area->replicated = true;
+
+	return 0;
+fail_alloc_pages:
+	for (i = 0; i < nr_allocated; i++)
+		__free_pages(pages[i], 0);
+
+fail_map_pages:
+	kvfree(pages);
+	for (i = 0; i < area->nr_pages; i++) {
+		struct page *page, *tmp;
+
+		list_for_each_entry_safe(page, tmp, &area->pages[i]->lru, lru) {
+			list_del(&page->lru);
+			mod_memcg_page_state(page, MEMCG_VMALLOC, -1);
+			__free_pages(page, 0);
+		}
+	}
+
+	return ret;
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 /*
  * This is only for performance analysis of vmalloc and stress purpose.
  * It is required by vmalloc test module, therefore do not use it other
-- 
2.34.1
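
For context, here is a minimal usage sketch of the interface pair introduced above. It is not part of the patch: the helper name alloc_and_replicate() is invented for illustration, and whether the returned range actually lies in the replicated portion of the kernel address space (so that numa_addr_has_replica() accepts it) depends on the architecture layout; a real user such as the module loader would allocate from its own replicated region. Only __vmalloc_node_range(), VM_NUMA_SHARED, first_memory_node and __vmalloc_node_replicate_range() come from the code above or from existing kernel interfaces.

/*
 * Illustrative sketch only, not from this patch.
 *
 * Allocate a region whose mapping is inserted into every per-node
 * translation table (VM_NUMA_SHARED), initialize it once, then ask for
 * per-node page replicas, roughly as a module loader might do for
 * text/ro-data.
 */
static void *alloc_and_replicate(unsigned long size)
{
	void *buf;
	int ret;

	/*
	 * area->node records which node owns the primary copy; pass a real
	 * node (not NUMA_NO_NODE), otherwise replication is refused with
	 * -EINVAL.
	 */
	buf = __vmalloc_node_range(size, PAGE_SIZE, VMALLOC_START, VMALLOC_END,
				   GFP_KERNEL, PAGE_KERNEL, VM_NUMA_SHARED,
				   first_memory_node,
				   __builtin_return_address(0));
	if (!buf)
		return NULL;

	/* ... fill buf with its final, read-mostly contents here ... */

	/*
	 * Allocate replica pages on every other memory node, copy the
	 * contents, and switch each node's translation table to its own
	 * replica.
	 */
	ret = __vmalloc_node_replicate_range(buf, GFP_KERNEL, PAGE_KERNEL, 0);
	if (ret) {
		vfree(buf);
		return NULL;
	}

	return buf;
}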
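And a small, hypothetical debugging helper (again, not part of the patch) that uses walk_to_page_node() from the mm/memory.c hunk to show the effect of replication: before __vmalloc_node_replicate_range() every node should resolve an address to the same PFN, afterwards each node should report its own replica. It assumes for_each_memory_node() is available via the numa_kernel_replication.h header used above.

/* Hypothetical sketch: print the backing PFN of @addr per NUMA node. */
static void dump_replica_pfns(const void *addr)
{
	int nid;

	for_each_memory_node(nid) {
		struct page *page = walk_to_page_node(nid, addr);

		pr_info("addr %px: node %d -> pfn %lx\n",
			addr, nid, page ? page_to_pfn(page) : 0UL);
	}
}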