
kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IC2BJZ ------------------------------------------------- Implement initial support for RO-data NUMA replication in user space. To enable or disable replication, use /proc/*/numa_*_replication or /sys/fs/cgroup/memory/*/memory.numa_*_replication entries. This is a preliminary implementation, some features might not be supported yet. Co-developed-by: Alexander Grubnikov <alexander.grubnikov@huawei.com> Signed-off-by: Alexander Grubnikov <alexander.grubnikov@huawei.com> Co-developed-by: Ilya Hanov <ilya.hanov@huawei-partners.com> Signed-off-by: Ilya Hanov <ilya.hanov@huawei-partners.com> Co-developed-by: Denis Darvish <darvish.denis@huawei.com> Signed-off-by: Denis Darvish <darvish.denis@huawei.com> Co-developed-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Co-developed-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Nikita Panov <panov.nikita@huawei.com> --- arch/arm64/include/asm/mmu_context.h | 7 +- arch/arm64/include/asm/numa_replication.h | 10 +- arch/arm64/include/asm/pgtable.h | 11 + arch/arm64/kernel/alternative.c | 3 +- arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/kernel/hibernate.c | 2 +- arch/arm64/kernel/insn.c | 4 +- arch/arm64/kernel/smp.c | 2 +- arch/arm64/kernel/suspend.c | 2 +- arch/arm64/kernel/vmlinux.lds.S | 12 +- arch/arm64/mm/context.c | 2 +- arch/arm64/mm/hugetlbpage.c | 65 +- arch/arm64/mm/init.c | 19 +- arch/arm64/mm/kasan_init.c | 47 +- arch/arm64/mm/mmu.c | 4 +- arch/arm64/mm/pgd.c | 52 +- arch/arm64/mm/ptdump.c | 2 +- arch/x86/mm/init_64.c | 2 +- drivers/firmware/efi/arm-runtime.c | 2 +- drivers/firmware/efi/libstub/arm64-stub.c | 5 + fs/dax.c | 5 +- fs/exec.c | 9 + fs/proc/base.c | 189 + fs/proc/task_mmu.c | 209 +- include/asm-generic/pgalloc.h | 23 + include/asm-generic/pgtable-nopmd.h | 5 + include/asm-generic/pgtable-nopud.h | 5 + include/asm-generic/tlb.h | 11 + include/linux/cgroup.h | 1 + include/linux/memcontrol.h | 15 + include/linux/mm.h | 70 + include/linux/mm_types.h | 62 +- include/linux/mman.h | 2 +- include/linux/module.h | 7 +- include/linux/numa_kernel_replication.h | 297 ++ include/linux/numa_replication.h | 104 - include/linux/numa_user_replication.h | 1472 ++++++++ include/linux/page-flags.h | 20 +- include/linux/pgtable.h | 11 + include/linux/vmalloc.h | 5 +- include/trace/events/kmem.h | 99 + include/uapi/asm-generic/mman-common.h | 3 + init/main.c | 5 +- kernel/cgroup/cgroup.c | 2 +- kernel/events/uprobes.c | 7 +- kernel/fork.c | 22 + kernel/module.c | 6 +- kernel/sched/fair.c | 16 +- mm/Kconfig | 23 + mm/Makefile | 4 +- mm/gup.c | 35 +- mm/huge_memory.c | 647 +++- mm/hugetlb.c | 53 +- mm/khugepaged.c | 59 +- mm/ksm.c | 33 +- mm/madvise.c | 31 +- mm/memcontrol.c | 262 +- mm/memory.c | 895 +++-- mm/mempolicy.c | 7 + mm/migrate.c | 26 +- mm/mlock.c | 3 +- mm/mmap.c | 72 +- mm/mmu_gather.c | 42 +- mm/mprotect.c | 303 +- mm/mremap.c | 106 +- ...eplication.c => numa_kernel_replication.c} | 134 +- mm/numa_user_replication.c | 3105 +++++++++++++++++ mm/page_alloc.c | 5 +- mm/page_idle.c | 3 +- mm/page_vma_mapped.c | 3 +- mm/pgtable-generic.c | 11 +- mm/rmap.c | 76 +- mm/share_pool.c | 3 +- mm/shmem.c | 3 +- mm/swap.c | 1 - mm/swap_state.c | 3 +- mm/swapfile.c | 3 +- mm/userfaultfd.c | 6 +- mm/vmalloc.c | 5 +- 79 files changed, 8190 insertions(+), 709 deletions(-) create mode 100644 include/linux/numa_kernel_replication.h delete mode 100644 include/linux/numa_replication.h create mode 100644 
include/linux/numa_user_replication.h rename mm/{numa_replication.c => numa_kernel_replication.c} (82%) create mode 100644 mm/numa_user_replication.c diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 97c3ba775ac0e..24026537df109 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -25,7 +25,9 @@ #include <asm/tlbflush.h> extern bool rodata_full; - +#ifdef CONFIG_USER_REPLICATION +extern void numa_account_switch(struct mm_struct *mm); +#endif static inline void contextidr_thread_switch(struct task_struct *next) { if (!IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) @@ -207,6 +209,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { +#ifdef CONFIG_USER_REPLICATION + numa_account_switch(next); +#endif if (prev != next) __switch_mm(next); diff --git a/arch/arm64/include/asm/numa_replication.h b/arch/arm64/include/asm/numa_replication.h index 43068b5ce2e61..b363038714906 100644 --- a/arch/arm64/include/asm/numa_replication.h +++ b/arch/arm64/include/asm/numa_replication.h @@ -7,7 +7,6 @@ #include <asm/tlbflush.h> #include <asm/pgalloc.h> #include <asm/memory.h> -#include <asm/mmu_context.h> #include <linux/mm.h> #include <linux/seq_file.h> @@ -22,15 +21,17 @@ static inline pgd_t *numa_replicate_pgt_pgd(int nid) pgd_page = alloc_pages_node(nid, GFP_PGTABLE_KERNEL, 2); BUG_ON(pgd_page == NULL); + SetPageReplicated(pgd_page); + SetPageReplicated(pgd_page + 2); + new_pgd = (pgd_t *)page_address(pgd_page); - new_pgd += (PTRS_PER_PGD * 2); //Extra pages for KPTI + new_pgd += (PAGE_SIZE * 2 / sizeof(pgd_t)); //Extra pages for KPTI copy_page((void *)new_pgd, (void *)swapper_pg_dir); return new_pgd; } - void cpu_replace_ttbr1(pgd_t *pgdp); static inline void numa_load_replicated_pgd(pgd_t *pgd) { @@ -50,7 +51,8 @@ static inline ssize_t numa_cpu_dump(struct seq_file *m) static inline void numa_sync_text_replicas(unsigned long start, unsigned long end) { - __flush_icache_range(start, end); + __flush_dcache_area((void *)start, end - start); + __flush_icache_all(); } #endif /* CONFIG_KERNEL_REPLICATION */ #endif /* __ASM_NUMA_REPLICATION_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 30769c82bab7a..d6a4c8a41de2e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -472,14 +472,25 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) #ifdef CONFIG_KERNEL_REPLICATION +/* + * Select all bits except the pfn + */ static inline pgprot_t pmd_pgprot(pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); } + #endif /* CONFIG_KERNEL_REPLICATION */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 52444aab41afc..51ac4f7a98a1b 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -10,7 +10,7 @@ #include <linux/init.h> #include <linux/cpu.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/cacheflush.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -144,7 +144,6 @@ 
static void __write_alternatives(struct alt_instr *alt, for_each_memory_node(nid) { __le32 *ptr = numa_get_replica(origptr, nid); - alt_cb(alt, origptr, ptr, nr_inst); clean_dcache_range_nopatch((u64)ptr, (u64)(ptr + nr_inst)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 575ae1d565c59..ac63a7cdc79a2 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,7 +75,7 @@ #include <linux/init.h> #include <linux/libfdt.h> #include <linux/pbha.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/cpu.h> #include <asm/cpufeature.h> diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index b2eec98d80fa4..1ab8496c9e079 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -21,7 +21,7 @@ #include <linux/sched.h> #include <linux/suspend.h> #include <linux/utsname.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/barrier.h> #include <asm/cacheflush.h> diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 4c484545dc592..e753788e037de 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -15,7 +15,7 @@ #include <linux/stop_machine.h> #include <linux/types.h> #include <linux/uaccess.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/cacheflush.h> #include <asm/debug-monitors.h> @@ -188,6 +188,7 @@ void __kprobes aarch64_literal64_write(void *addr, u64 data) } raw_spin_unlock_irqrestore(&patch_lock, flags); } +EXPORT_SYMBOL(aarch64_literal64_write); #else static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) { @@ -218,6 +219,7 @@ void __kprobes aarch64_literal64_write(void *addr, u64 data) patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); } +EXPORT_SYMBOL(aarch64_literal64_write); #endif /* CONFIG_KERNEL_REPLICATION */ int __kprobes aarch64_insn_write(void *addr, u32 insn) diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index b2ba8f4ff3056..6513eda27f4c2 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -37,7 +37,7 @@ #include <linux/kvm_host.h> #include <linux/perf/arm_pmu.h> #include <linux/crash_dump.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/alternative.h> #include <asm/atomic.h> diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 6cafcd5e65bf1..31e56153dab5f 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -4,7 +4,7 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/pgtable.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/alternative.h> #include <asm/cacheflush.h> diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 193f98e7da748..6c87e2eb56c3b 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -122,7 +122,11 @@ SECTIONS HEAD_TEXT } #ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_ARM64_4K_PAGES . = ALIGN(PMD_SIZE); +#else + . = ALIGN(CONT_PTE_SIZE); +#endif #endif .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ @@ -153,16 +157,18 @@ SECTIONS "Unexpected GOT/PLT entries detected!") . = ALIGN(SEGMENT_ALIGN); -#ifdef CONFIG_KERNEL_REPLICATION +#if defined(CONFIG_KERNEL_REPLICATION) && defined(CONFIG_ARM64_4K_PAGES) . = ALIGN(PMD_SIZE); +#else + . 
= ALIGN(CONT_PTE_SIZE); #endif _etext = .; /* End of text section */ /* everything from this point to __init_begin will be marked RO NX */ -#ifdef CONFIG_KERNEL_REPLICATION +#if defined(CONFIG_KERNEL_REPLICATION) && defined(CONFIG_ARM64_4K_PAGES) RO_DATA(PMD_SIZE) #else - RO_DATA(PAGE_SIZE) + RO_DATA(CONT_PTE_SIZE) #endif idmap_pg_dir = .; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 81162bd0182ee..65d7488b65550 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -11,7 +11,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/cpufeature.h> #include <asm/mmu_context.h> diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 4effa2dd05185..42293d2aece63 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -14,9 +14,11 @@ #include <linux/pagemap.h> #include <linux/err.h> #include <linux/sysctl.h> +#include <linux/numa_user_replication.h> #include <asm/mman.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/pgtable.h> /* * HugeTLB Support Matrix @@ -90,16 +92,6 @@ int pud_huge(pud_t pud) #endif } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { @@ -342,6 +334,59 @@ pte_t *huge_pte_offset(struct mm_struct *mm, return NULL; } +#ifdef CONFIG_USER_REPLICATION +pte_t *huge_pte_alloc_copy_tables(struct mm_struct *dst, struct mm_struct *src, + unsigned long addr, unsigned long sz) +{ + pgd_t *pgdp_dst, *pgdp_src; + p4d_t *p4dp_dst, *p4dp_src; + pud_t *pudp_dst, *pudp_src; + pmd_t *pmdp_dst, *pmdp_src; + pte_t *ptep_dst = NULL; + + pgdp_dst = pgd_offset(dst, addr); + pgdp_src = pgd_offset(src, addr); + p4dp_dst = p4d_offset(pgdp_dst, addr); + p4dp_src = p4d_offset(pgdp_src, addr); + + pudp_dst = cpr_alloc_pud(dst, addr, p4dp_src, p4dp_dst); + pudp_src = pud_offset(p4dp_src, addr); + if (!pudp_dst) + return NULL; + + if (sz == PUD_SIZE) { + ptep_dst = (pte_t *)pudp_dst; + } else if (sz == (CONT_PTE_SIZE)) { + pmdp_dst = cpr_alloc_pmd(dst, addr, pudp_src, pudp_dst); + pmdp_src = pmd_offset(pudp_src, addr); + if (!pmdp_dst) + return NULL; + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). 
+ */ + ptep_dst = cpr_alloc_pte_map(dst, addr, pmdp_src, pmdp_dst); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(READ_ONCE(*pudp_dst))) + ptep_dst = huge_pmd_share(dst, addr, pudp_dst); + else + ptep_dst = (pte_t *)cpr_alloc_pmd(dst, addr, pudp_src, pudp_dst); + } else if (sz == (CONT_PMD_SIZE)) { + pmdp_dst = cpr_alloc_pmd(dst, addr, pudp_src, pudp_dst); + WARN_ON(addr & (sz - 1)); + return (pte_t *)pmdp_dst; + } + + return ptep_dst; +} +#endif + pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma, struct page *page, int writable) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 4c80ddba5e20a..3240c780bd187 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -683,7 +683,7 @@ static void __init free_unused_memmap(void) * It is necessary to preallocate vmalloc pages in advance, * otherwise the replicated page-tables can be incomplete. */ -static void __init preallocate_vmalloc_pages(void) +void __init preallocate_vmalloc_pages(void) { unsigned long addr; @@ -692,6 +692,8 @@ static void __init preallocate_vmalloc_pages(void) pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; + pmd_t *pmd; + int pte; p4d = p4d_alloc(&init_mm, pgd, addr); /* @@ -701,6 +703,18 @@ static void __init preallocate_vmalloc_pages(void) pud = pud_alloc(&init_mm, p4d, addr); if (!pud) panic("Failed to pre-allocate pud pages for vmalloc area\n"); + if (!mm_pud_folded(&init_mm)) + continue; + + pmd = pmd_alloc(&init_mm, pud, addr); + if (!pmd) + panic("Failed to pre-allocate pmd pages for vmalloc area\n"); + if (!mm_pmd_folded(&init_mm)) + continue; + + pte = pte_alloc(&init_mm, pmd); + if (pte) + panic("Failed to pre-allocate pte pages for vmalloc area\n"); } } #endif /* CONFIG_KERNEL_REPLICATION */ @@ -749,9 +763,6 @@ void __init mem_init(void) */ sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } -#ifdef CONFIG_KERNEL_REPLICATION - preallocate_vmalloc_pages(); -#endif /* CONFIG_KERNEL_REPLICATION */ } void free_initmem(void) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 14bdd9738ec3f..715203718467b 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -13,7 +13,7 @@ #include <linux/memblock.h> #include <linux/start_kernel.h> #include <linux/mm.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/mmu_context.h> #include <asm/kernel-pgtable.h> @@ -57,6 +57,24 @@ static phys_addr_t __init kasan_alloc_raw_page(int node) return __pa(p); } +static void __init __kasan_pmd_populate(pmd_t *pmdp, phys_addr_t pte_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (get_propagation_level() == PMD_PROPAGATION) { + int nid; + pmd_t *target; + + for_each_memory_node(nid) { + target = (pmd_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __pmd_populate(target, pte_phys, PMD_TYPE_TABLE); + } + } else + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); +#else + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) { @@ -64,13 +82,32 @@ static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, phys_addr_t pte_phys = early ? __pa_symbol(kasan_early_shadow_pte) : kasan_alloc_zeroed_page(node); - __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); + __kasan_pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); } return early ? 
pte_offset_kimg(pmdp, addr) : pte_offset_kernel(pmdp, addr); } +static void __init __kasan_pud_populate(pud_t *pudp, phys_addr_t pmd_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (get_propagation_level() == PUD_PROPAGATION) { + int nid; + pud_t *target; + + for_each_memory_node(nid) { + target = (pud_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __pud_populate(target, pmd_phys, PMD_TYPE_TABLE); + } + } else + __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); +#else + __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + + static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) { @@ -78,7 +115,7 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, phys_addr_t pmd_phys = early ? __pa_symbol(kasan_early_shadow_pmd) : kasan_alloc_zeroed_page(node); - __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); + __kasan_pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); } return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); @@ -87,7 +124,7 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, static void __init __kasan_p4d_populate(p4d_t *p4dp, phys_addr_t pud_phys, unsigned long addr) { #ifdef CONFIG_KERNEL_REPLICATION - if (is_text_replicated()) { + if (get_propagation_level() == P4D_PROPAGATION) { int nid; p4d_t *target; @@ -279,7 +316,7 @@ void __init kasan_init(void) (void *)mod_shadow_start); if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { - BUILD_BUG_ON(VMALLOC_START != MODULES_END); + // BUILD_BUG_ON(VMALLOC_START != MODULES_END); kasan_populate_early_shadow((void *)vmalloc_shadow_end, (void *)KASAN_SHADOW_END); } else { diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 01761486fd6e7..17a38dffb73b2 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -24,7 +24,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/pbha.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -697,7 +697,7 @@ static void __init populate_trampoline_mappings(void) /* Copy trampoline mappings in replicated tables */ for_each_memory_node(nid) { - memcpy(per_node_pgd(&init_mm, nid) - (PTRS_PER_PGD * 2), + memcpy(per_node_pgd(&init_mm, nid) - (PAGE_SIZE * 2 / sizeof(pgd_t)), tramp_pg_dir, PGD_SIZE); } /* Be sure that replicated page table can be observed properly */ diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 56e8047485a5e..d5f1651e2a058 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -1,4 +1,4 @@ -// SPDX-License-Identifier: GPL-2.0-only + // SPDX-License-Identifier: GPL-2.0-only /* * PGD allocation/freeing * @@ -10,7 +10,7 @@ #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/slab.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -19,23 +19,17 @@ static struct kmem_cache *pgd_cache __ro_after_init; #ifdef CONFIG_KERNEL_REPLICATION -pgd_t *pgd_alloc(struct mm_struct *mm) + +static pgd_t *page_pgd_alloc(struct mm_struct *mm) { int nid; gfp_t gfp = GFP_PGTABLE_USER | __GFP_THISNODE; - pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); - - if (!pgd_numa) - goto pgd_numa_fail; - - mm->pgd_numa = pgd_numa; /* * Kernel replication is not supproted in case of non-page size pgd, * in general we can support it, but maybe later, due to we need to * update page tables allocation 
significantly, so, let's panic here. */ - BUG_ON(PGD_SIZE != PAGE_SIZE); for_each_memory_node(nid) { struct page *page; @@ -43,6 +37,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!page) goto fail; + SetPageReplicated(page); + per_node_pgd(mm, nid) = (pgd_t *)page_address(page); } @@ -51,16 +47,28 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = per_node_pgd(mm, numa_get_memory_node(0)); + build_pgd_chain(mm->pgd_numa); + return mm->pgd; fail: pgd_free(mm, mm->pgd); -pgd_numa_fail: - kfree(pgd_numa); - return NULL; } + +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + + if (!pgd_numa) + return NULL; + + mm->pgd_numa = pgd_numa; + + return page_pgd_alloc(mm); +} + #else pgd_t *pgd_alloc(struct mm_struct *mm) { @@ -74,7 +82,8 @@ pgd_t *pgd_alloc(struct mm_struct *mm) #endif /* CONFIG_KERNEL_REPLICATION */ #ifdef CONFIG_KERNEL_REPLICATION -void pgd_free(struct mm_struct *mm, pgd_t *pgd) + +static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd) { int nid; /* @@ -82,17 +91,30 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) * in general we can support it, but maybe later, due to we need to * update page tables allocation significantly, so, let's panic here. */ - BUG_ON(PGD_SIZE != PAGE_SIZE); + + if (per_node_pgd(mm, first_memory_node) == NULL) + return; + + clear_pgtable_list(virt_to_page(per_node_pgd(mm, first_memory_node))); for_each_memory_node(nid) { if (per_node_pgd(mm, nid) == NULL) break; + ClearPageReplicated(virt_to_page(per_node_pgd(mm, nid))); + free_page((unsigned long)per_node_pgd(mm, nid)); } for_each_online_node(nid) per_node_pgd(mm, nid) = NULL; +} + +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + page_pgd_free(mm, pgd); + kfree(mm->pgd_numa); } + #else void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index a1e6a4ecde4eb..2694dadf9f30b 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -18,7 +18,7 @@ #include <linux/ptdump.h> #include <linux/sched.h> #include <linux/seq_file.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/fixmap.h> #include <asm/kasan.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index cad100b5a9c01..ba8af38490d06 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1238,7 +1238,7 @@ static void __init register_page_bootmem_info(void) * Only the level which needs to be synchronized between all page-tables is * allocated because the synchronization can be expensive. */ -static void __init preallocate_vmalloc_pages(void) +void __init preallocate_vmalloc_pages(void) { unsigned long addr; const char *lvl; diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 563a82c941092..d13d1406a6252 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -19,7 +19,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/pgtable.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <asm/cacheflush.h> #include <asm/efi.h> diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 0ad824b36f2e2..f2dfc46551ac3 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -312,7 +312,12 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, * TLB usage. 
Due to this fact for now we map kernel by huge pages even * in case of KASLR enabled. Ugly but works. */ +#ifdef CONFIG_ARM64_4K_PAGES u64 min_kimg_align = HPAGE_SIZE; +#else + u64 min_kimg_align = CONT_PTE_SIZE; +#endif + #else u64 min_kimg_align = efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; #endif diff --git a/fs/dax.c b/fs/dax.c index d87b31b70b678..d70c809202fa2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -25,6 +25,7 @@ #include <linux/sizes.h> #include <linux/mmu_notifier.h> #include <linux/iomap.h> +#include <linux/numa_user_replication.h> #include <asm/pgalloc.h> #define CREATE_TRACE_POINTS @@ -862,10 +863,10 @@ static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, goto unlock_pte; flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); + pte = ptep_clear_flush_replicated(vma, address, ptep); pte = pte_wrprotect(pte); pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); + set_pte_at_replicated(vma->vm_mm, address, ptep, pte); unlock_pte: pte_unmap_unlock(ptep, ptl); } diff --git a/fs/exec.c b/fs/exec.c index e328d435be648..1c1ec4893f189 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -65,6 +65,7 @@ #include <linux/compat.h> #include <linux/vmalloc.h> #include <linux/io_uring.h> +#include <linux/numa_user_replication.h> #ifndef __GENKSYMS__ #include <linux/ksm.h> #endif @@ -275,11 +276,19 @@ static int __bprm_mm_init(struct linux_binprm *bprm) vma->vm_flags = VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); +#ifdef CONFIG_USER_REPLICATION + if (get_table_replication_policy(vma->vm_mm) == TABLE_REPLICATION_ALL) { + vma->vm_flags |= VM_REPLICA_INIT; + } +#endif /* CONFIG_USER_REPLICATION */ + + err = insert_vm_struct(mm, vma); if (err) goto err; mm->stack_vm = mm->total_vm = 1; + mmap_write_unlock(mm); bprm->p = vma->vm_end - sizeof(void *); return 0; diff --git a/fs/proc/base.c b/fs/proc/base.c index 4e0054a37c4c5..59424636f6daa 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -103,6 +103,7 @@ #include <linux/share_pool.h> #include <linux/ksm.h> #include <linux/pbha.h> +#include <linux/numa_user_replication.h> #include <trace/events/oom.h> #include "internal.h" #include "fd.h" @@ -3595,6 +3596,189 @@ static const struct file_operations proc_pid_sg_level_operations = { static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; +#ifdef CONFIG_USER_REPLICATION + +static ssize_t numa_data_replication_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + table_replication_policy_t result; + char buffer[PROC_NUMBUF]; + int len; + + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ESRCH; + + result = get_data_replication_policy(mm); + + mmput(mm); + + len = snprintf(buffer, sizeof(buffer), "%d\n", result); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t numa_data_replication_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + int val = 0; + char buffer[PROC_NUMBUF]; + int err = 0; + + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; 
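+			/* Copying the replication policy value from user space failed: bail out with -EFAULT through the common mmput() exit path. */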
+ goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &val); + if (err) + goto out; + + err = numa_dispatch_data_replication_request(mm, val); + +out: + mmput(mm); + + return err < 0 ? err : count; +} + + +static ssize_t numa_table_replication_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + table_replication_policy_t result; + char buffer[PROC_NUMBUF]; + int len; + + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ESRCH; + + result = get_table_replication_policy(mm); + + mmput(mm); + + len = snprintf(buffer, sizeof(buffer), "%d\n", result); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t numa_table_replication_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + int val = 0; + char buffer[PROC_NUMBUF]; + int err = 0; + + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ESRCH; + + memset(buffer, 0, sizeof(buffer)); + + if (copy_from_user(buffer, buf, count)) { + err = -EFAULT; + goto out; + } + + err = kstrtoint(strstrip(buffer), 0, &val); + if (err) + goto out; + + err = numa_dispatch_table_replication_request(mm, val); + +out: + mmput(mm); + + return err < 0 ? err : count; +} + + +#define REPLICATION_STATS_BUFFER_SIZE 1024 +static ssize_t numa_replication_stats_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file_inode(file)); + struct mm_struct *mm; + unsigned long replicated_data_bytes = 0; + unsigned long replicated_table_bytes = 0; + struct timespec64 uptime; + char buffer[REPLICATION_STATS_BUFFER_SIZE]; + int len; + + if (!task) + return -ESRCH; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ESRCH; + + ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + + replicated_data_bytes = total_replicated_data_bytes_mm(mm); + replicated_table_bytes = total_replicated_table_bytes_mm(mm); + + mmput(mm); + + len = snprintf(buffer, sizeof(buffer), "{\n" + " \"timestamp\": \"%lu.%02lu\",\n" + " \"replicated_data_bytes\": \"%lu\",\n" + " \"replicated_table_bytes\": \"%lu\"\n" + "}\n", + (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), replicated_data_bytes, replicated_table_bytes); + + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +const struct file_operations numa_table_replication_operations = { + .read = numa_table_replication_read, + .write = numa_table_replication_write, +}; + +const struct file_operations numa_data_replication_operations = { + .read = numa_data_replication_read, + .write = numa_data_replication_write, +}; + +const struct file_operations numa_replication_stats_operations = { + .read = numa_replication_stats_read, +}; + +#endif + static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), @@ -3725,6 +3909,11 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif +#ifdef CONFIG_USER_REPLICATION + REG("numa_table_replication", 0600, numa_table_replication_operations), + 
REG("numa_data_replication", 0600, numa_data_replication_operations), + REG("numa_replication_stats", 0400, numa_replication_stats_operations), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6dddac4548e11..7f92d2d8f096b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,6 +21,7 @@ #include <linux/pkeys.h> #include <linux/module.h> #include <linux/pbha.h> +#include <linux/numa_user_replication.h> #include <asm/elf.h> #include <asm/tlb.h> @@ -401,6 +402,20 @@ struct mem_size_stats { u64 pss_locked; u64 swap_pss; bool check_shmem_swap; +#ifdef CONFIG_USER_REPLICATION + KABI_EXTEND(unsigned long repl_page) + KABI_EXTEND(unsigned long repl_thp) + KABI_EXTEND(unsigned long repl_tables) +#endif +}; + +struct smaps_private { + struct mem_size_stats *mss; + struct vm_area_struct *vma; +#ifdef CONFIG_USER_REPLICATION + int _nid; /* Node ID of the page of upper level */ + struct mm_struct *mm; +#endif }; static void smaps_page_accumulate(struct mem_size_stats *mss, @@ -488,7 +503,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, static int smaps_pte_hole(unsigned long addr, unsigned long end, __always_unused int depth, struct mm_walk *walk) { - struct mem_size_stats *mss = walk->private; + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; mss->swap += shmem_partial_swap_usage( walk->vma->vm_file->f_mapping, addr, end); @@ -502,8 +518,9 @@ static int smaps_pte_hole(unsigned long addr, unsigned long end, static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { - struct mem_size_stats *mss = walk->private; - struct vm_area_struct *vma = walk->vma; + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + struct vm_area_struct *vma = priv->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false, young = false, dirty = false; @@ -545,6 +562,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; +#ifdef CONFIG_USER_REPLICATION + if (PageReplicated(page) && priv->_nid == page_to_nid(page)) + mss->repl_page += PAGE_SIZE; +#endif + smaps_account(mss, page, false, young, dirty, locked, migration); } @@ -552,8 +574,9 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, struct mm_walk *walk) { - struct mem_size_stats *mss = walk->private; - struct vm_area_struct *vma = walk->vma; + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + struct vm_area_struct *vma = priv->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; bool migration = false; @@ -571,6 +594,10 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, } if (IS_ERR_OR_NULL(page)) return; +#ifdef CONFIG_USER_REPLICATION + if (PageReplicated(page) && priv->_nid == page_to_nid(page)) + mss->repl_thp += HPAGE_SIZE; +#endif if (PageAnon(page)) mss->anonymous_thp += HPAGE_PMD_SIZE; else if (PageSwapBacked(page)) @@ -711,8 +738,9 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct mem_size_stats *mss = walk->private; - struct vm_area_struct *vma = walk->vma; + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + struct vm_area_struct *vma = priv->vma; struct page *page = NULL; if (pte_present(*pte)) 
{ @@ -737,6 +765,10 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, #define smaps_hugetlb_range NULL #endif /* HUGETLB_PAGE */ +/* + * In case if CONFIG_USER_REPLICATION=y PGD is always replicated, so + * must be accounted on all available nodes. + */ static const struct mm_walk_ops smaps_walk_ops = { .pmd_entry = smaps_pte_range, .hugetlb_entry = smaps_hugetlb_range, @@ -748,6 +780,35 @@ static const struct mm_walk_ops smaps_shmem_walk_ops = { .pte_hole = smaps_pte_hole, }; +static void smaps_walk_vma(struct vm_area_struct *vma, + unsigned long start, + const struct mm_walk_ops *ops, + struct smaps_private *priv) +{ +#ifdef CONFIG_USER_REPLICATION + if (numa_is_vma_replicant(vma)) { + int nid; + + BUG_ON(is_vm_hugetlb_page(vma)); + + for_each_node_state(nid, N_MEMORY) { + priv->_nid = nid; + walk_page_range_novma(vma->vm_mm, + start ? start : vma->vm_start, + vma->vm_end, ops, + vma->vm_mm->pgd_numa[nid], + priv); + } + return; + } +#endif + + if (!start) + walk_page_vma(vma, ops, priv); + else + walk_page_range(vma->vm_mm, start, vma->vm_end, ops, priv); +} + /* * Gather mem stats from @vma with the indicated beginning * address @start, and keep them in @mss. @@ -758,6 +819,10 @@ static void smap_gather_stats(struct vm_area_struct *vma, struct mem_size_stats *mss, unsigned long start) { const struct mm_walk_ops *ops = &smaps_walk_ops; + struct smaps_private priv = { + .mss = mss, + .vma = vma, + }; /* Invalid start */ if (start >= vma->vm_end) @@ -788,11 +853,9 @@ static void smap_gather_stats(struct vm_area_struct *vma, } } #endif + /* mmap_lock is held in m_start */ - if (!start) - walk_page_vma(vma, ops, mss); - else - walk_page_range(vma->vm_mm, start, vma->vm_end, ops, mss); + smaps_walk_vma(vma, start, ops, &priv); } #define SEQ_PUT_DEC(str, val) \ @@ -834,6 +897,11 @@ static void __show_smap(struct seq_file *m, const struct mem_size_stats *mss, mss->swap_pss >> PSS_SHIFT); SEQ_PUT_DEC(" kB\nLocked: ", mss->pss_locked >> PSS_SHIFT); +#ifdef CONFIG_USER_REPLICATION + SEQ_PUT_DEC(" kB\nReplPages: ", mss->repl_page); + SEQ_PUT_DEC(" kB\nReplThpPages: ", mss->repl_thp); + SEQ_PUT_DEC(" kB\nReplTablePages: ", mss->repl_tables); +#endif seq_puts(m, " kB\n"); } @@ -865,6 +933,121 @@ static int show_smap(struct seq_file *m, void *v) return 0; } +#ifdef CONFIG_USER_REPLICATION +static int smaps_tables_pgd_callback(pgd_t *pgd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + p4d_t *p4d; + + if (mm_p4d_folded(priv->mm)) + return 0; + + if (pgd_none_or_clear_bad(pgd)) + return 0; + + p4d = (p4d_t *)pgd_page_vaddr(*pgd); + + if (numa_pgtable_replicated(p4d)) + mss->repl_tables += (replica_count * PAGE_SIZE); + + return 0; +} + +static int smaps_tables_p4d_callback(p4d_t *p4d, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + pud_t *pud; + + if (mm_pud_folded(priv->mm)) + return 0; + + if (p4d_none_or_clear_bad(p4d)) + return 0; + + pud = p4d_pgtable(*p4d); + + if (numa_pgtable_replicated(pud)) + mss->repl_tables += (replica_count * PAGE_SIZE); + return 0; +} + +static int smaps_tables_pud_callback(pud_t *pud, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + pmd_t *pmd; + + if (mm_pmd_folded(priv->mm)) + return 0; + if (pud_none_or_clear_bad(pud)) + 
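/* No PMD-level page table behind this PUD entry, so there is nothing to account. */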
return 0; + + pmd = pud_pgtable(*pud); + + if (numa_pgtable_replicated(pmd)) + mss->repl_tables += (replica_count * PAGE_SIZE); + + return 0; +} + +static int smaps_tables_pmd_callback(pmd_t *pmd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + struct smaps_private *priv = walk->private; + struct mem_size_stats *mss = priv->mss; + + struct page *pte; + + if (pmd_none(*pmd) || is_swap_pmd(*pmd) || pmd_devmap(*pmd) || pmd_trans_huge(*pmd)) + return 0; + + pte = pmd_pgtable(*pmd); + + if (numa_pgtable_replicated(page_to_virt(pte))) + mss->repl_tables += (replica_count * PAGE_SIZE); + + return 0; +} + +const struct mm_walk_ops smaps_tables_ops = { + .pgd_entry = smaps_tables_pgd_callback, + .p4d_entry = smaps_tables_p4d_callback, + .pud_entry = smaps_tables_pud_callback, + .pmd_entry = smaps_tables_pmd_callback, +}; + +static struct vm_area_struct *find_last_vma(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma->vm_next) { + vma = vma->vm_next; + } + return vma; +} + +static void smaps_gather_tables(struct mm_struct *mm, struct mem_size_stats *mss) +{ + const struct mm_walk_ops *ops = &smaps_tables_ops; + struct smaps_private priv = { + .mss = mss, + .mm = mm + }; + + walk_page_range_novma(mm, mm->mmap->vm_start, find_last_vma(mm)->vm_end, ops, NULL, &priv); + mss->repl_tables += (PAGE_SIZE * replica_count); +} +#endif + static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; @@ -961,6 +1144,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) vma = vma->vm_next; } +#ifdef CONFIG_USER_REPLICATION + smaps_gather_tables(priv->mm, &mss); +#endif + show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index ec28f86ea36dd..6a24d610043ac 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -71,6 +71,22 @@ static inline pgtable_t __pte_alloc_one(struct mm_struct *mm, gfp_t gfp) return pte; } +static inline pgtable_t __pte_alloc_one_node(unsigned int nid, + struct mm_struct *mm, gfp_t gfp) +{ + struct page *pte; + + pte = alloc_pages_node(nid, gfp, 0); + if (!pte) + return NULL; + if (!pgtable_pte_page_ctor(pte)) { + __free_page(pte); + return NULL; + } + + return pte; +} + #ifndef __HAVE_ARCH_PTE_ALLOC_ONE /** * pte_alloc_one - allocate a page for PTE-level user page table @@ -84,6 +100,12 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) { return __pte_alloc_one(mm, GFP_PGTABLE_USER); } + +static inline pgtable_t pte_alloc_one_node(unsigned int nid, struct mm_struct *mm) +{ + return __pte_alloc_one_node(nid, mm, GFP_PGTABLE_USER | __GFP_THISNODE); +} + #endif /* @@ -98,6 +120,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm) */ static inline void pte_free(struct mm_struct *mm, struct page *pte_page) { + ClearPageReplicated(pte_page); pgtable_pte_page_dtor(pte_page); __free_page(pte_page); } diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index 10789cf51d160..5b02f40b1837a 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -58,6 +58,11 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) * inside the pud, so has no extra memory associated with it. 
*/ #define pmd_alloc_one(mm, address) NULL + +#ifdef CONFIG_USER_REPLICATION +#define pmd_alloc_one_node(nid, mm, address) NULL +#endif + static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { } diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index eb70c6d7ceff2..1eb202d8474e1 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -56,6 +56,11 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address) * inside the p4d, so has no extra memory associated with it. */ #define pud_alloc_one(mm, address) NULL + +#ifdef CONFIG_USER_REPLICATION +#define pud_alloc_one_node(nid, mm, address) NULL +#endif + #define pud_free(mm, x) do { } while (0) #define pud_free_tlb(tlb, x, a) do { } while (0) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f40c9534f20be..8872e578adb4c 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -223,7 +223,11 @@ static inline void tlb_remove_table_sync_one(void) { } * If we can't allocate a page to make a big batch of page pointers * to work on, then just handle a few from the on-stack structure. */ +#ifdef CONFIG_USER_REPLICATION +#define MMU_GATHER_BUNDLE MAX_NUMNODES +#else #define MMU_GATHER_BUNDLE 8 +#endif struct mmu_gather_batch { struct mmu_gather_batch *next; @@ -245,6 +249,8 @@ struct mmu_gather_batch { extern bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size); +extern bool __tlb_remove_replica_pages_size(struct mmu_gather *tlb, struct page **pages, + int page_size); #endif /* @@ -442,6 +448,11 @@ static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } +static inline bool __tlb_remove_replica_pages(struct mmu_gather *tlb, struct page **pages) +{ + return __tlb_remove_replica_pages_size(tlb, pages, PAGE_SIZE); +} + /* tlb_remove_page * Similar to __tlb_remove_page but will call tlb_flush_mmu() itself when * required. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b7ffef090ea7e..f7a63251814b6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -138,6 +138,7 @@ int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); +bool cgroup_has_tasks(struct cgroup *cgrp); /* * Iteration helpers and macros. diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6b63b39cc24de..61c3a1f5e55c9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -250,6 +250,16 @@ struct swap_device { int type; }; +struct memcg_replication_ctl { + fork_policy_t fork_policy; + table_replication_policy_t table_policy; + data_replication_policy_t data_policy; + unsigned long __percpu *pcp_replicated_pages; + unsigned long __percpu *pcp_dereplicated_pages; + unsigned long __percpu *pcp_replicated_tables; + unsigned long __percpu *pcp_dereplicated_tables; +}; + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. 
We would eventually like to provide @@ -406,7 +416,12 @@ struct mem_cgroup { KABI_RESERVE(3) KABI_RESERVE(4) #endif + +#ifdef CONFIG_USER_REPLICATION + KABI_USE(5, struct memcg_replication_ctl *replication_ctl) +#else KABI_RESERVE(5) +#endif #if defined(CONFIG_DYNAMIC_HUGETLB) && defined(CONFIG_ARM64) KABI_USE(6, struct dhugetlb_pool *hpool) #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 6eb790b220e5f..3528e46ed5183 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -340,12 +340,16 @@ extern unsigned int kobjsize(const void *objp); #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */ +#define VM_HIGH_ARCH_BIT_6 38 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_7 39 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) +#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5) +#define VM_HIGH_ARCH_6 BIT(VM_HIGH_ARCH_BIT_6) #define VM_HIGH_ARCH_7 BIT(VM_HIGH_ARCH_BIT_7) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ @@ -362,6 +366,20 @@ extern unsigned int kobjsize(const void *objp); #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ +#ifdef CONFIG_USER_REPLICATION +# define VM_REPLICA_INIT VM_HIGH_ARCH_6 /* + * Page tables for this vma will be replicated during page faults + * Set and clear this flag in pair with calling numa_mm_store_range + */ +# define VM_REPLICA_COMMIT VM_HIGH_ARCH_7 /* + * Phys memory of this vma migth has replicas + * due to mprotect call or numa_balancer replication. + * Also, this flag is used by numa balancer as a hint, that this memory should + * be replicated. Obviously, that if this flag is set, + * VM_REPLICA_INIT also must be set. + */ +#endif /* CONFIG_USER_REPLICATION */ + #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) @@ -558,6 +576,20 @@ static inline bool fault_flag_allow_retry_first(unsigned int flags) { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } +typedef enum { + /* Switch to default handling */ + REPLICA_NONE, + /* + * User replication stops here, + * already replicated levels need propagation + */ + REPLICA_PROPAGATE, + /* Keep replicating page tables */ + REPLICA_KEEP, + /* Failed to replicate page table level */ + REPLICA_FAIL, +} replica_action_t; + /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask @@ -609,6 +641,17 @@ struct vm_fault { * page table to avoid allocation from * atomic context. 
*/ + + KABI_EXTEND(unsigned long real_address) /* Faulting virtual address - unmasked */ + KABI_EXTEND(unsigned long left_replicant) /* Closest vmas that require replicated tables */ + KABI_EXTEND(unsigned long right_replicant) + KABI_EXTEND(p4d_t *p4d) + KABI_EXTEND(pgd_t *pgd) + KABI_EXTEND(bool pte_replicated :1) + KABI_EXTEND(bool pmd_replicated :1) + KABI_EXTEND(bool pud_replicated :1) + KABI_EXTEND(bool p4d_replicated :1) + KABI_EXTEND(replica_action_t replica_action) /* last action performed with page table */ }; /* page entry size for vm->huge_fault() */ @@ -2296,28 +2339,54 @@ void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); +#ifdef CONFIG_KERNEL_REPLICATION +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + return page->master_table->ptl; +} +#else static inline spinlock_t *ptlock_ptr(struct page *page) { return page->ptl; } +#endif + #else /* ALLOC_SPLIT_PTLOCKS */ static inline void ptlock_cache_init(void) { } +#ifdef CONFIG_KERNEL_REPLICATION static inline bool ptlock_alloc(struct page *page) { + page->master_table = page; return true; } static inline void ptlock_free(struct page *page) { + page->master_table = NULL; } +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + return &page->master_table->ptl; +} +#else +static inline bool ptlock_alloc(struct page *page) +{ + return true; +} + +static inline void ptlock_free(struct page *page) +{ } + static inline spinlock_t *ptlock_ptr(struct page *page) { return &page->ptl; } +#endif + #endif /* ALLOC_SPLIT_PTLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) @@ -2609,6 +2678,7 @@ extern void memmap_init_zone(unsigned long, int, unsigned long, extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); +extern void preallocate_vmalloc_pages(void); extern void __init mmap_init(void); extern void show_mem(unsigned int flags, nodemask_t *nodemask); extern long si_mem_available(void); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 382d018bbc157..0e6c929b50995 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,8 +19,6 @@ #include <asm/mmu.h> -#include <linux/numa.h> - #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif @@ -169,11 +167,46 @@ struct page { struct { /* Page table pages */ unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#ifdef CONFIG_KERNEL_REPLICATION + KABI_REPLACE( + unsigned long _pt_pad_2, + union { + unsigned long _pt_pad_2; /* mapping */ + struct llist_head replica_list_head; /* required for connecting */ + struct llist_node replica_list_node; /* replicated tables into lists */ + } + ) + /* + * master_page is used only for pte and pmd levels, + * If we have, for example, 4 replicated pmd tables, + * we need to use single lock to correctly serialize modifications of this level. + * Previously, lock from first_memory_node was used. + * However, this design does not handle correctly + * replication of existing table. + * So, now this field points to lock from original table. 
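+	 * (i.e. every replica shares the page-table spinlock of the table it was replicated from).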
+ * If tables are not replicated, or table is master, master_lock + * equals to ptl (or &ptl) + * Another usage here - list of deposited ptes for thp + */ + KABI_REPLACE( + union { + struct mm_struct *pt_mm; /* x86 pgds only */ + atomic_t pt_frag_refcount; /* powerpc */ + }, + union { + struct mm_struct *pt_mm; /* x86 pgds only */ + atomic_t pt_frag_refcount; /* powerpc */ + struct page *master_table; + } + ) +#else unsigned long _pt_pad_2; /* mapping */ union { struct mm_struct *pt_mm; /* x86 pgds only */ atomic_t pt_frag_refcount; /* powerpc */ }; +#endif + #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else @@ -410,6 +443,24 @@ struct core_state { struct completion startup; }; +typedef enum { + FORK_DISCARD_REPLICA, + FORK_KEEP_REPLICA +} fork_policy_t; + +typedef enum { + TABLE_REPLICATION_NONE = 0, + TABLE_REPLICATION_MINIMAL = 1, + TABLE_REPLICATION_ALL = 2 +} table_replication_policy_t; + +typedef enum { + DATA_REPLICATION_NONE = 0, + DATA_REPLICATION_ON_DEMAND = 1, + DATA_REPLICATION_ALL_MAPPED_ON_DEMAND = 2, + DATA_REPLICATION_ALL = 3 +} data_replication_policy_t; + struct kioctx_table; #if defined(CONFIG_X86_64) @@ -646,14 +697,21 @@ struct mm_struct { #else KABI_RESERVE(4) #endif + #ifdef CONFIG_KERNEL_REPLICATION KABI_USE(5, pgd_t **pgd_numa) #else KABI_RESERVE(5) #endif KABI_RESERVE(6) +#ifdef CONFIG_USER_REPLICATION + KABI_USE(7, struct numa_replication_control *replication_ctl) + KABI_USE(8, struct numa_context_switch_stat *context_switch_stats) +#else KABI_RESERVE(7) KABI_RESERVE(8) +#endif + #if IS_ENABLED(CONFIG_KVM) && !defined(__GENKSYMS__) struct kvm *kvm; diff --git a/include/linux/mman.h b/include/linux/mman.h index f13546c357e13..e682d62c11ee0 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -182,7 +182,7 @@ static inline void vm_unacct_memory(long pages) */ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr) { - return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM)) == 0; + return (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_REPLICA)) == 0; } #define arch_validate_prot arch_validate_prot #endif diff --git a/include/linux/module.h b/include/linux/module.h index b58fb669a00c4..57b7f1f6bcde9 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -564,6 +564,7 @@ struct module { #else KABI_RESERVE(1) #endif + KABI_USE(2, struct module_layout *mutable_data_layout) KABI_RESERVE(3) KABI_RESERVE(4) @@ -614,15 +615,13 @@ static inline bool within_module_mutable(unsigned long addr, const struct module *mod) { return (unsigned long)mod->mutable_data_layout->base <= addr && - addr < (unsigned long)mod->mutable_data_layout->base + - mod->mutable_data_layout->size; + addr < (unsigned long)mod->mutable_data_layout->base + mod->mutable_data_layout->size; } static inline bool within_module(unsigned long addr, const struct module *mod) { - return within_module_init(addr, mod) || within_module_core(addr, mod) - || within_module_mutable(addr, mod); + return within_module_init(addr, mod) || within_module_core(addr, mod) || within_module_mutable(addr, mod); } /* Search for module by name: must hold module_mutex. */ diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h new file mode 100644 index 0000000000000..b832a342ed656 --- /dev/null +++ b/include/linux/numa_kernel_replication.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_KERNEL_REPLICATION_H +#define _LINUX_NUMA_KERNEL_REPLICATION_H + +#include <linux/kabi.h> + +/* + * Why? 
Because linux is defined to 1 for some reason, + * and linux/mm.h converted to 1/mm.h. Perhaps compiler? + * Do not ask me, I have no idea. + */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mm_types.h>) +#include KABI_HIDE_INCLUDE(<linux/nodemask.h>) +#include KABI_HIDE_INCLUDE(<linux/module.h>) +#include KABI_HIDE_INCLUDE(<linux/mm.h>) +#include KABI_HIDE_INCLUDE(<linux/llist.h>) + +#ifdef CONFIG_KERNEL_REPLICATION +#include KABI_HIDE_INCLUDE(<asm/numa_replication.h>) +#endif + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +typedef enum { + NONE = 0, + PMD_PROPAGATION = 1, + PUD_PROPAGATION = 2, + P4D_PROPAGATION = 3, + PGD_PROPAGATION = 4 +} propagation_level_t; + +extern nodemask_t replica_nodes; + +#define for_each_memory_node(nid) \ + for (nid = first_node(replica_nodes); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes)) + +#ifdef CONFIG_KERNEL_REPLICATION + +#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()]) +#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid]) + + +static inline bool numa_addr_has_replica(const void *addr) +{ + return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && + ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); +} + +static inline void clear_pgtable_list(struct page *head) +{ + struct llist_node *node; + + /* Replica list already have been destroyed */ + if (head->replica_list_node.next == NULL) + return; + + for (node = llist_del_first(&head->replica_list_head); + node != &head->replica_list_node; + node = llist_del_first(&head->replica_list_head)) + node->next = NULL; + head->replica_list_node.next = NULL; +} + +static inline void build_pgd_chain(pgd_t **tables) +{ + int nid; + int prev_node = -1; + + for_each_memory_node(nid) { + virt_to_page(tables[nid])->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&virt_to_page(tables[nid])->replica_list_node, &virt_to_page(tables[prev_node])->replica_list_head); + } else { + /* + * This list is not supposed to be circular, + * but in order to simplify macro implementation, + * we do it anyway. + * God help us + */ + virt_to_page(tables[nid])->replica_list_node.next = &virt_to_page(tables[nid])->replica_list_node; + } + prev_node = nid; + } +} + +static inline bool numa_pgtable_replicated(void *table) +{ + return PageReplicated(virt_to_page(table)); +} + +void __init numa_replication_init(void); +void __init numa_replicate_kernel_text(void); +void numa_replicate_kernel_rodata(void); +void numa_replication_fini(void); + +bool is_text_replicated(void); +propagation_level_t get_propagation_level(void); +void numa_setup_pgd(void); +void __init_or_module *numa_get_replica(void *vaddr, int nid); +int numa_get_memory_node(int nid); +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end); + +static inline unsigned long offset_in_table(void *ptr) +{ + return (unsigned long)ptr & (~PAGE_MASK); +} + +static inline unsigned long get_table_ptr(struct page *table, unsigned long offset) +{ + return ((unsigned long)page_to_virt(table) + offset); +} + +/** + * @pos: struct page* of current replica + * @table: current table entry to write (virtual address) + * @head_table: table entry form 0th node, will not be a part of this loop + * @nid: node id of current pgtable + * @offset: offset of current table entry in table page in bytes [0 .. 
4088] + * @start: boolean value for tmp storage + */ +#define for_each_pgtable(pos, table, head_table, nid, offset, start) \ + for (pos = llist_entry(&virt_to_page(head_table)->replica_list_node, typeof(*pos), replica_list_node), \ + start = true, nid = page_to_nid(pos), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_page(head_table) || start; \ + pos = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset), \ + nid = page_to_nid(pos), start = false) + +/** + * @pos: struct page* of current replica + * @table: current table entry to write (virtual address) + * @head_table: table entry form 0th node, will not be a part of this loop + * @offset: offset of current table entry in table page in bytes [0 .. 4088] + */ +#define for_each_pgtable_replica(pos, table, head_table, offset) \ + for (pos = llist_entry(virt_to_page(head_table)->replica_list_node.next, typeof(*pos), replica_list_node), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_page(head_table); \ + pos = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset)) + +/** Safe against removal pos + * @pos: struct page* of current replica + * @n: tmp storage + * @table: current table entry to write (virtual address) + * @head_table: table entry form 0th node, will not be a part of this loop + * @offset: offset of current table entry in table page in bytes [0 .. 4088] + */ +#define for_each_pgtable_replica_safe(pos, n, table, head_table, offset) \ + for (pos = llist_entry(virt_to_page(head_table)->replica_list_node.next, typeof(*pos), replica_list_node), \ + n = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + offset = offset_in_table(head_table), table = (typeof(table))get_table_ptr(pos, offset); \ + pos != virt_to_page(head_table); \ + pos = n, n = llist_entry((pos)->replica_list_node.next, typeof(*pos), replica_list_node), \ + table = (typeof(table))get_table_ptr(pos, offset)) + +static inline void pgd_populate_replicated(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) +{ + pgd_populate(mm, pgdp, p4dp); + + if (!is_text_replicated()) + return; + + if (numa_pgtable_replicated(pgdp)) { + unsigned long offset; + struct page *curr; + pgd_t *curr_pgd; + + for_each_pgtable_replica(curr, curr_pgd, pgdp, offset) { + pgd_populate(mm, curr_pgd, p4dp); + } + } +} + +static inline void p4d_populate_replicated(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) +{ + p4d_populate(mm, p4dp, pudp); + + if (!is_text_replicated()) + return; + + if (numa_pgtable_replicated(p4dp)) { + unsigned long offset; + struct page *curr; + p4d_t *curr_p4d; + + for_each_pgtable_replica(curr, curr_p4d, p4dp, offset) { + p4d_populate(mm, curr_p4d, pudp); + } + } +} + +static inline void pud_populate_replicated(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) +{ + pud_populate(mm, pudp, pmdp); + + if (!is_text_replicated()) + return; + + if (numa_pgtable_replicated(pudp)) { + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + + for_each_pgtable_replica(curr, curr_pud, pudp, offset) { + pud_populate(mm, curr_pud, pmdp); + } + } +} + +static inline void pmd_populate_replicated(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) +{ + pmd_populate(mm, pmdp, ptep); + + if (!is_text_replicated()) + return; + + if (numa_pgtable_replicated(pmdp)) { + unsigned long 
offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + pmd_populate(mm, curr_pmd, ptep); + } + } +} + +#else +#define this_node_pgd(mm) ((mm)->pgd) +#define per_node_pgd(mm, nid) ((mm)->pgd) + +static inline bool numa_pgtable_replicated(void *table) +{ + return false; +} + +static inline void numa_setup_pgd(void) +{ +} + +static inline void __init numa_replication_init(void) +{ +} + +static inline void __init numa_replicate_kernel_text(void) +{ +} + +static inline void numa_replicate_kernel_rodata(void) +{ +} + +static inline void numa_replication_fini(void) +{ +} + +static inline bool numa_addr_has_replica(const void *addr) +{ + return false; +} + +static inline bool is_text_replicated(void) +{ + return false; +} + +static inline void __init_or_module *numa_get_replica(void *vaddr, int nid) +{ + return lm_alias(vaddr); +} + +static inline void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +#define pgd_populate_replicated pgd_populate +#define p4d_populate_replicated p4d_populate +#define pud_populate_replicated pud_populate +#define pmd_populate_replicated pmd_populate + +#endif /*CONFIG_KERNEL_REPLICATION*/ + +#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/include/linux/numa_replication.h b/include/linux/numa_replication.h deleted file mode 100644 index 1a22b56d9312b..0000000000000 --- a/include/linux/numa_replication.h +++ /dev/null @@ -1,104 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef _LINUX_NUMA_REPLICATION_H -#define _LINUX_NUMA_REPLICATION_H - -#include <linux/kabi.h> - -/* - * Why? Because linux is defined to 1 for some reason, - * and linux/mm.h converted to 1/mm.h. Perhaps compiler? - * Do not ask me, I have no idea. - */ -#if defined(linux) -#define tmp_linux_value linux -#undef linux -#endif - -#include KABI_HIDE_INCLUDE(<linux/mm_types.h>) -#include KABI_HIDE_INCLUDE(<linux/nodemask.h>) -#include KABI_HIDE_INCLUDE(<linux/module.h>) -#include KABI_HIDE_INCLUDE(<linux/mm.h>) - -#ifdef CONFIG_KERNEL_REPLICATION -#include KABI_HIDE_INCLUDE(<asm/numa_replication.h>) -#endif - -#if defined(tmp_linux_value) -#define linux tmp_linux_value -#undef tmp_linux_value -#endif - - -extern nodemask_t replica_nodes; - -#define for_each_memory_node(nid) \ - for (nid = first_node(replica_nodes); \ - nid != MAX_NUMNODES; \ - nid = next_node(nid, replica_nodes)) - -#ifdef CONFIG_KERNEL_REPLICATION -#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()]) -#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid]) - -static inline bool numa_addr_has_replica(const void *addr) -{ - return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && - ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); -} - -void __init numa_replication_init(void); -void __init numa_replicate_kernel_text(void); -void numa_replicate_kernel_rodata(void); -void numa_replication_fini(void); - -bool is_text_replicated(void); -void numa_setup_pgd(void); -void __init_or_module *numa_get_replica(void *vaddr, int nid); -int numa_get_memory_node(int nid); -void dump_mm_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end); -#else -#define this_node_pgd(mm) ((mm)->pgd) -#define per_node_pgd(mm, nid) ((mm)->pgd) - -static inline void numa_setup_pgd(void) -{ -} - -static inline void __init numa_replication_init(void) -{ -} - -static inline void __init numa_replicate_kernel_text(void) -{ -} - -static inline void numa_replicate_kernel_rodata(void) -{ -} - -static inline void numa_replication_fini(void) 
-{ -} - -static inline bool numa_addr_has_replica(const void *addr) -{ - return false; -} - -static inline bool is_text_replicated(void) -{ - return false; -} - -static inline void __init_or_module *numa_get_replica(void *vaddr, int nid) -{ - return lm_alias(vaddr); -} - -static inline void dump_mm_pgtables(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ -} -#endif /*CONFIG_KERNEL_REPLICATION*/ -#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/include/linux/numa_user_replication.h b/include/linux/numa_user_replication.h new file mode 100644 index 0000000000000..9916481eed353 --- /dev/null +++ b/include/linux/numa_user_replication.h @@ -0,0 +1,1472 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_USER_REPLICATION_H +#define _LINUX_NUMA_USER_REPLICATION_H + +#include <linux/kabi.h> +#include <linux/numa_kernel_replication.h> + +/* Same as in numa_kernel_replication.h */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mmu_notifier.h>) +#include KABI_HIDE_INCLUDE(<linux/memcontrol.h>) +#include KABI_HIDE_INCLUDE(<trace/events/kmem.h>) + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +/* + * pgwlk_for_each_replica_page - Iterates over pgwlk->replica_pages. + * + * @walk: the owner of replcia_pages (a.k.a pgtable_walker) + * @page: assigns each page from replica_pages to this variable. + * @nid: assigns each node ID of _memory_ nodes to this variable. + * + * Note that page can be NULL if replica_pages has not been assigned + * yet. Caller must check each page for NULL if needed. + */ +#define pgwlk_for_each_replica_page(zp, page, nid) \ + for (nid = first_node(node_states[N_MEMORY]), (page) = (zp)->replica_pages[first_memory_node]; \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, node_states[N_MEMORY]), (page) = (zp)->replica_pages[nid]) + +#ifdef CONFIG_USER_REPLICATION_DEBUG +#define UREPLICA_DEBUG(code) code; +#else +#define UREPLICA_DEBUG(code) {} +#endif + +#ifdef CONFIG_USER_REPLICATION + +extern unsigned long replica_count; + +struct pgtable_private { + pte_t *pte_numa[MAX_NUMNODES]; + struct page *replica_pages[MAX_NUMNODES]; + bool pte_replicated; +}; + +static inline void pgtable_pte_step(struct pgtable_private *zp, int nr) +{ + int nid; + + if (zp->pte_replicated) + for_each_memory_node(nid) + zp->pte_numa[nid] += nr; +} + +static inline void pgtable_update_pte(struct pgtable_private *zp, pte_t *pte) +{ + + zp->pte_numa[page_to_nid(virt_to_page(pte))] = pte; + zp->pte_replicated = false; + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + int nid; + + zp->pte_replicated = true; + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + nid = page_to_nid(curr); + zp->pte_numa[nid] = curr_pte; + } + } +} + +static inline pmd_t *get_master_pmd(pmd_t *pmd) +{ + return (pmd_t *)get_table_ptr(virt_to_page(pmd)->master_table, offset_in_table(pmd)); +} + +static inline pud_t *get_master_pud(pud_t *pud) +{ + return (pud_t *)get_table_ptr(virt_to_page(pud)->master_table, offset_in_table(pud)); +} + +static inline void set_master_page_for_puds(int allocated_node, pud_t **new) +{ + int nid; + struct page *master_table; + struct page *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = virt_to_page(new[allocated_node]); + + for_each_memory_node(nid) { + curr_table = virt_to_page(new[nid]); + curr_table->master_table = master_table; + } +} + 
+static inline void set_master_page_for_pmds(int allocated_node, pmd_t **new) +{ + int nid; + struct page *master_table; + struct page *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = virt_to_page(new[allocated_node]); + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + curr_table = virt_to_page(new[nid]); + curr_table->master_table = master_table; + } +} + +static inline void set_master_page_for_ptes(int allocated_node, struct page **new) +{ + int nid; + struct page *master_table; + struct page *curr_table; + + if (allocated_node == NUMA_NO_NODE) + allocated_node = first_memory_node; + + master_table = new[allocated_node]; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + curr_table = new[nid]; + curr_table->master_table = master_table; + } +} + +void numa_mm_apply_replication(struct mm_struct *mm); +int numa_clone_pte(struct vm_area_struct *vma, unsigned long start, unsigned long end); +int numa_remove_replicas(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool alloc_new_page); +int phys_duplicate(struct vm_area_struct *vma, unsigned long start, size_t len); +int phys_deduplicate(struct vm_area_struct *vma, unsigned long start, size_t len, bool alloc_new_page); +unsigned long phys_duplicate_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end); +int phys_duplicate_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end); + +static inline int numa_is_vma_replicant(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_REPLICA_INIT) + return 1; + + return 0; +} + +static inline bool vma_has_replicas(struct vm_area_struct *vma) +{ + return vma->vm_flags & VM_REPLICA_COMMIT; +} + +static inline bool vma_might_be_replicated(struct vm_area_struct *vma) +{ + return (vma->vm_file || vma_is_anonymous(vma)) && ((vma->vm_flags & VM_REPLICA_INIT) && vma_is_accessible(vma) && !(vma->vm_flags & (VM_WRITE | VM_SHARED))); +} + +static inline bool __vm_flags_replica_candidate(struct vm_area_struct *vma, unsigned long flags) +{ + return (vma->vm_file || vma_is_anonymous(vma)) && ((flags & VM_ACCESS_FLAGS) && !(flags & (VM_WRITE | VM_SHARED))); +} + +static inline bool vma_replica_candidate(struct vm_area_struct *vma) +{ + return __vm_flags_replica_candidate(vma, vma->vm_flags); +} + +/* + * Arch specific implementation + * TODO: remove these functions from generic header + */ +#ifdef CONFIG_ARM64 +static inline void set_pte_replicated(pte_t *ptep, pte_t pte) +{ + WRITE_ONCE(*ptep, pte); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + WRITE_ONCE(*curr_pte, pte); + } + } + + if (pte_valid_not_user(pte)) { + dsb(ishst); + isb(); + } +} + +static inline void pte_clear_replicated(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + set_pte_replicated(ptep, __pte(0)); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_pte_clear(end - start);) +} + +static inline void set_pte_at_replicated(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, pte_t pte) +{ + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte); + + if (system_supports_mte() && + 
pte_present(pte) && pte_tagged(pte) && !pte_special(pte)) + mte_sync_tags(ptep, pte); + + __check_racy_pte_update(mm, ptep, pte); + + set_pte_replicated(ptep, pte); + + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_set_pte_at(end - start);) + +} + +static inline pte_t ptep_get_and_clear_replicated(struct mm_struct *mm, + unsigned long address, + pte_t *pte) +{ + pte_t pteval; + + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + pteval = ptep_get_and_clear(mm, address, pte); + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + pteval = ptep_get_and_clear(mm, address, curr_pte); + } + } + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_pte_get_and_clear(end - start);) + return pteval; +} + +static inline pte_t ptep_clear_flush_replicated(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + struct mm_struct *mm = (vma)->vm_mm; + pte_t pte; + + pte = ptep_get_and_clear_replicated(mm, address, ptep); + if (pte_accessible(mm, pte)) + flush_tlb_page(vma, address); + return pte; +} + +static inline int ptep_test_and_clear_young_replicated(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + int ret; + + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + ret = ptep_test_and_clear_young(vma, address, pte); + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + ret |= ptep_test_and_clear_young(vma, address, curr_pte); + } + } + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_ptep_test_and_clear_young(end - start);) + return ret; +} + +static inline int ptep_clear_flush_young_replicated(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + int young = ptep_test_and_clear_young_replicated(vma, address, ptep); + + if (young) { + /* + * We can elide the trailing DSB here since the worst that can + * happen is that a CPU continues to use the young entry in its + * TLB and we mistakenly reclaim the associated page. The + * window for such an event is bounded by the next + * context-switch, which provides a DSB to complete the TLB + * invalidation. 
+ */ + flush_tlb_page_nosync(vma, address); + } + + return young; +} + +#ifdef CONFIG_MMU_NOTIFIER +#define set_pte_at_notify_replicated(__mm, __address, __ptep, __pte) \ +({ \ + struct mm_struct *___mm = __mm; \ + unsigned long ___address = __address; \ + pte_t ___pte = __pte; \ + \ + mmu_notifier_change_pte(___mm, ___address, ___pte); \ + set_pte_at_replicated(___mm, ___address, __ptep, ___pte); \ +}) + +#define ptep_clear_flush_young_notify_replicated(__vma, __address, __ptep) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = ptep_clear_flush_young_replicated(___vma, ___address, __ptep);\ + __young |= mmu_notifier_clear_flush_young(___vma->vm_mm, \ + ___address, \ + ___address + \ + PAGE_SIZE); \ + __young; \ +}) + +#define ptep_clear_flush_notify_replicated(__vma, __address, __ptep) \ +({ \ + unsigned long ___addr = __address & PAGE_MASK; \ + struct mm_struct *___mm = (__vma)->vm_mm; \ + pte_t ___pte; \ + \ + ___pte = ptep_clear_flush_replicated(__vma, __address, __ptep); \ + mmu_notifier_invalidate_range(___mm, ___addr, \ + ___addr + PAGE_SIZE); \ + \ + ___pte; \ +}) + +#define ptep_clear_young_notify_replicated(__vma, __address, __ptep) \ +({ \ + int __young; \ + struct vm_area_struct *___vma = __vma; \ + unsigned long ___address = __address; \ + __young = ptep_test_and_clear_young_replicated(___vma, ___address, __ptep); \ + __young |= mmu_notifier_clear_young(___vma->vm_mm, ___address, \ + ___address + PAGE_SIZE); \ + __young; \ +}) + +#else +#define set_pte_at_notify_replicated set_pte_at_replicated +#define ptep_clear_flush_young_notify_replicated ptep_clear_flush_young_replicated +#define ptep_clear_flush_notify_replicated ptep_clear_flush_replicated +#define ptep_clear_young_notify_replicated ptep_test_and_clear_young_replicated +#endif + +static inline pte_t ptep_get_and_clear_full_replicated(struct mm_struct *mm, + unsigned long address, + pte_t *pte, int full) +{ + return ptep_get_and_clear_replicated(mm, address, pte); +} + +static inline void pte_clear_not_present_full_replicated(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear_replicated(mm, address, ptep); +} + +static inline void ptep_set_wrprotect_replicated(struct mm_struct *mm, + unsigned long addr, + pte_t *pte) +{ + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + ptep_set_wrprotect(mm, addr, pte); + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + ptep_set_wrprotect(mm, addr, curr_pte); + } + } + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_ptep_set_wrprotect(end - start);) +} + +static inline int ptep_set_access_flags_nosync(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + pteval_t old_pteval, pteval; + pte_t pte = READ_ONCE(*ptep); + + if (pte_same(pte, entry)) + return 0; + + /* only preserve the access flags and write permission */ + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; + + /* + * Setting the flags must be done atomically to avoid racing with the + * hardware update of the access/dirty state. The PTE_RDONLY bit must + * be set to the most permissive (lowest value) of *ptep and entry + * (calculated as: a & b == ~(~a | ~b)). 
+ */ + pte_val(entry) ^= PTE_RDONLY; + pteval = pte_val(pte); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); + + /* Invalidate a stale read-only entry */ + + return 1; +} + +static inline int ptep_set_access_flags_replicated(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int res; + + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + res = ptep_set_access_flags_nosync(vma, address, ptep, entry, dirty); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + unsigned long flags = pte_val(entry) & (~PTE_ADDR_MASK); + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + pte_t new_entry; + + WARN_ON(!pte_present(*curr_pte)); + + new_entry = __pte((pte_val(*curr_pte) & PTE_ADDR_MASK) | flags); + res |= ptep_set_access_flags_nosync(vma, address, curr_pte, new_entry, dirty); + } + } + + if (dirty) + flush_tlb_page(vma, address); + + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_ptep_set_access_flags(end - start);) + return res; +} + +static inline pte_t ptep_modify_prot_start_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t res; + + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + UREPLICA_DEBUG(start = ktime_get()); + res = ptep_modify_prot_start(vma, addr, ptep); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + ptep_modify_prot_start(vma, addr, curr_pte); + } + } + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_ptep_modify_prot_start(end - start);) + return res; +} + +static inline void ptep_modify_prot_commit_replicated(struct vm_area_struct *vma, + unsigned long addr, + pte_t *ptep, pte_t old_pte, pte_t pte) +{ + set_pte_at_replicated(vma->vm_mm, addr, ptep, pte); +} + +static inline void set_huge_pte_at_replicated(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_huge_pte_at(mm, addr, ptep, pte); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + set_huge_pte_at(mm, addr, curr_pte, pte); + } + } +} + +static inline int huge_ptep_set_access_flags_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t pte, int dirty) +{ + int ret = huge_ptep_set_access_flags(vma, addr, ptep, pte, dirty); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + ret |= huge_ptep_set_access_flags(vma, addr, curr_pte, pte, dirty); + } + } + return ret; +} + +static inline void huge_ptep_clear_flush_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + huge_ptep_clear_flush(vma, addr, ptep); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + huge_ptep_clear_flush(vma, addr, curr_pte); + } + } +} + +static inline void set_huge_swap_pte_at_replicated(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned long sz) +{ + set_huge_swap_pte_at(mm, addr, ptep, pte, sz); + if (numa_pgtable_replicated(ptep)) { + unsigned 
long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + set_huge_swap_pte_at(mm, addr, curr_pte, pte, sz); + } + } +} + +static inline pte_t huge_ptep_modify_prot_start_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t res = huge_ptep_modify_prot_start(vma, addr, ptep); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + res = huge_ptep_modify_prot_start(vma, addr, curr_pte); + } + } + return res; +} + +static inline void huge_ptep_modify_prot_commit_replicated(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t old_pte, pte_t pte) +{ + huge_ptep_modify_prot_commit(vma, addr, ptep, old_pte, pte); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + huge_ptep_modify_prot_commit(vma, addr, curr_pte, old_pte, pte); + } + } +} + +static inline void huge_pte_clear_replicated(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned long sz) +{ + huge_pte_clear(mm, addr, ptep, sz); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + huge_pte_clear(mm, addr, curr_pte, sz); + } + } +} + +static inline pte_t huge_ptep_get_and_clear_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + pte_t ret = huge_ptep_get_and_clear(mm, addr, ptep); + + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + ret = huge_ptep_get_and_clear(mm, addr, curr_pte); + } + } + return ret; +} + +static inline void huge_ptep_set_wrprotect_replicated(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + huge_ptep_set_wrprotect(mm, addr, ptep); + if (numa_pgtable_replicated(ptep)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, ptep, offset) { + huge_ptep_set_wrprotect(mm, addr, curr_pte); + } + } +} + +static inline void pmd_clear_replicated(pmd_t *pmdp) +{ + pmd_clear(pmdp); + + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + pmd_clear(curr_pmd); + } + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_huge_get_and_clear_replicated(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t pmdval; + + pmdval = pmdp_huge_get_and_clear(mm, address, pmdp); + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + pmdval = pmdp_huge_get_and_clear(mm, address, curr_pmd); + } + } + return pmdval; +} + +static inline void pmdp_set_wrprotect_replicated(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmdp_set_wrprotect(mm, address, pmdp); + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + pmdp_set_wrprotect(mm, address, curr_pmd); + } + } +} + +#endif + +static inline void huge_pmd_set_accessed_replicated(struct vm_fault *vmf) +{ + pmd_t entry; + unsigned long haddr; + pmd_t orig_pmd = vmf->orig_pmd; + 
bool write = vmf->flags & FAULT_FLAG_WRITE; + bool update = false; + + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) + goto skip_orig; + + entry = pmd_mkyoung(orig_pmd); + if (write) + entry = pmd_mkdirty(entry); + haddr = vmf->address & HPAGE_PMD_MASK; + update |= pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, write); + +skip_orig: + + if (numa_pgtable_replicated(vmf->pmd)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, vmf->pmd, offset) { + entry = pmd_mkyoung(*curr_pmd); + if (write) + entry = pmd_mkdirty(entry); + update |= pmdp_set_access_flags(vmf->vma, haddr, curr_pmd, entry, write); + } + } + + if (update) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); + spin_unlock(vmf->ptl); +} + +static inline int pmdp_set_access_flags_replicated(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ + int res = pmdp_set_access_flags(vma, address, pmdp, entry, dirty); + + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + res |= pmdp_set_access_flags(vma, address, curr_pmd, entry, dirty); + } + } + return res; +} + +static inline void set_pmd_at_replicated(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + set_pmd_at(mm, addr, pmdp, pmd); + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + set_pmd_at(mm, addr, curr_pmd, pmd); + } + } +} + +static inline pmd_t pmdp_invalidate_replicated(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmdval; + + pmdval = pmdp_invalidate(vma, address, pmdp); + if (numa_pgtable_replicated(pmdp)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmdp, offset) { + pmdval = pmdp_invalidate(vma, address, curr_pmd); + } + } + return pmdval; +} + +static inline void pud_clear_replicated(pud_t *pudp) +{ + pud_clear(pudp); + + if (numa_pgtable_replicated(pudp)) { + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + + for_each_pgtable_replica(curr, curr_pud, pudp, offset) { + pud_clear(curr_pud); + } + } +} +#endif + +static inline void build_pte_chain(struct page **tables) +{ + int nid; + int prev_node = -1; + + for_each_memory_node(nid) { + tables[nid]->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&tables[nid]->replica_list_node, &tables[prev_node]->replica_list_head); + } else { + tables[nid]->replica_list_node.next = &tables[nid]->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_pmd_chain(pmd_t **tables) +{ + int nid; + int prev_node = -1; + + for_each_memory_node(nid) { + virt_to_page(tables[nid])->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&virt_to_page(tables[nid])->replica_list_node, &virt_to_page(tables[prev_node])->replica_list_head); + } else { + virt_to_page(tables[nid])->replica_list_node.next = &virt_to_page(tables[nid])->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_pud_chain(pud_t **tables) +{ + int nid; + int prev_node = -1; + + for_each_memory_node(nid) { + virt_to_page(tables[nid])->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&virt_to_page(tables[nid])->replica_list_node, 
&virt_to_page(tables[prev_node])->replica_list_head); + } else { + virt_to_page(tables[nid])->replica_list_node.next = &virt_to_page(tables[nid])->replica_list_node; + } + prev_node = nid; + } +} + +static inline void build_p4d_chain(p4d_t **tables) +{ + int nid; + int prev_node = -1; + + for_each_memory_node(nid) { + virt_to_page(tables[nid])->replica_list_head.first = NULL; + if (prev_node != -1) { + llist_add(&virt_to_page(tables[nid])->replica_list_node, &virt_to_page(tables[prev_node])->replica_list_head); + } else { + virt_to_page(tables[nid])->replica_list_node.next = &virt_to_page(tables[nid])->replica_list_node; + } + prev_node = nid; + } +} + + +pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address); +p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long address); +pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address); +pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address); +int fault_pte_alloc(struct vm_fault *vmf); +pte_t *huge_pte_alloc_copy_tables(struct mm_struct *dst, struct mm_struct *src, + unsigned long addr, unsigned long sz); +pte_t *huge_pte_alloc_replica(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz); + +pte_t *cpr_alloc_pte_map(struct mm_struct *mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd); +pte_t *cpr_alloc_pte_map_lock(struct mm_struct *mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl); +pmd_t *cpr_alloc_pmd(struct mm_struct *mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud); +pud_t *cpr_alloc_pud(struct mm_struct *mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d); +p4d_t *cpr_alloc_p4d(struct mm_struct *mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd); + +/* + * Copied from rmap.c + * We need to only to decrease compound_mapcount + * and if this was the last thp mapping decrease mapcount + * of tail pages + */ +static inline void dec_compound_mapcount(struct page *head) +{ + int i, nr; + + if (!atomic_add_negative(-1, compound_mapcount_ptr(head))) + return; + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) + return; + if (TestClearPageDoubleMap(head)) { + /* + * Subpages can be mapped with PTEs too. Check how many of + * them are still mapped. + */ + for (i = 0, nr = 0; i < thp_nr_pages(head); i++) { + if (atomic_add_negative(-1, &head[i]._mapcount)) + nr++; + } + + /* + * Queue the page for deferred split if at least one small + * page of the compound page is unmapped, but at least one + * small page is still mapped. 
+ */ + if (nr && nr < thp_nr_pages(head)) { + if (!PageHotreplace(head)) + deferred_split_huge_page(head); + } + } +} + +static inline void cleanup_pte_list(pgtable_t table) +{ + table->replica_list_node.next = NULL; +} + +static inline void cleanup_pmd_list(struct page *table) +{ +#ifndef __PAGETABLE_PMD_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +static inline void cleanup_pud_list(struct page *table) +{ +#ifndef __PAGETABLE_PUD_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +static inline void cleanup_p4d_list(struct page *table) +{ +#ifndef __PAGETABLE_P4D_FOLDED + table->replica_list_node.next = NULL; +#endif +} + +struct numa_context_switch_stat { + unsigned long __percpu *pcp_stats; + unsigned long *last_stats; + unsigned long *total_stats; + spinlock_t lock; +}; + +static inline int mm_init_numa_stats(struct mm_struct *mm) +{ + mm->context_switch_stats = (struct numa_context_switch_stat *)kmalloc(sizeof(struct numa_context_switch_stat), GFP_KERNEL); + if (!mm->context_switch_stats) + goto fail1; + + mm->context_switch_stats->last_stats = (unsigned long *)kmalloc(sizeof(unsigned long) * MAX_NUMNODES, GFP_KERNEL | __GFP_ZERO); + if (!mm->context_switch_stats->last_stats) + goto fail2; + + mm->context_switch_stats->total_stats = (unsigned long *)kmalloc(sizeof(unsigned long) * MAX_NUMNODES, GFP_KERNEL | __GFP_ZERO); + if (!mm->context_switch_stats->total_stats) + goto fail3; + + mm->context_switch_stats->pcp_stats = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!mm->context_switch_stats->pcp_stats) + goto fail4; + + spin_lock_init(&(mm->context_switch_stats->lock)); + + return 0; +fail4: + kfree(mm->context_switch_stats->total_stats); +fail3: + kfree(mm->context_switch_stats->last_stats); +fail2: + kfree(mm->context_switch_stats); + mm->context_switch_stats = NULL; +fail1: + return -ENOMEM; +} + +static inline void mm_free_numa_stats(struct mm_struct *mm) +{ + free_percpu(mm->context_switch_stats->pcp_stats); + kfree(mm->context_switch_stats->total_stats); + kfree(mm->context_switch_stats->last_stats); + kfree(mm->context_switch_stats); + mm->context_switch_stats = NULL; +} +void numa_account_switch(struct mm_struct *mm); + +void numa_accumulate_switches(struct mm_struct *mm); + +struct numa_replication_control { + bool user_replication_active; + fork_policy_t fork_policy; + table_replication_policy_t table_policy; + data_replication_policy_t data_policy; + spinlock_t lock; // serializes modification of numa_replication_control::in_candidate_list + bool in_candidate_list; + struct list_head replication_candidates; + struct mm_struct *owner; + struct rw_semaphore rmap_lock; + unsigned long __percpu *pcp_replicated_pages; + unsigned long __percpu *pcp_dereplicated_pages; + unsigned long __percpu *pcp_replicated_tables; + unsigned long __percpu *pcp_dereplicated_tables; +}; + +static inline bool get_user_replication_policy(struct mm_struct *mm) +{ + return mm->replication_ctl->user_replication_active; +} + +static inline fork_policy_t get_fork_policy(struct mm_struct *mm) +{ + return mm->replication_ctl->fork_policy; +} + +static inline table_replication_policy_t get_table_replication_policy(struct mm_struct *mm) +{ + return mm->replication_ctl->table_policy; +} + +static inline data_replication_policy_t get_data_replication_policy(struct mm_struct *mm) +{ + return mm->replication_ctl->data_policy; +} + +static inline void set_user_replication_policy(struct mm_struct *mm, bool value) +{ + mm->replication_ctl->user_replication_active = 
value; +} + +static inline void set_fork_policy(struct mm_struct *mm, fork_policy_t value) +{ + mm->replication_ctl->fork_policy = value; +} + +static inline void set_table_replication_policy(struct mm_struct *mm, table_replication_policy_t value) +{ + mm->replication_ctl->table_policy = value; +} + +static inline void set_data_replication_policy(struct mm_struct *mm, data_replication_policy_t value) +{ + mm->replication_ctl->data_policy = value; +} + +static inline void __account_replicated_data_size(struct mm_struct *mm, long long size) +{ + if (size == 0) + return; + + if (size > 0) { + this_cpu_add(*(mm->replication_ctl->pcp_replicated_pages), size); + } else { + this_cpu_add(*(mm->replication_ctl->pcp_dereplicated_pages), -size); + } +} + +static inline void __account_replicated_table_size(struct mm_struct *mm, long long size) +{ + if (size == 0) + return; + + if (size > 0) { + this_cpu_add(*(mm->replication_ctl->pcp_replicated_tables), size); + } else { + this_cpu_add(*(mm->replication_ctl->pcp_dereplicated_tables), -size); + } +} + +static inline void account_replicated_page(struct mm_struct *mm) +{ + __account_replicated_data_size(mm, PAGE_SIZE * replica_count); +} + +static inline void account_replicated_hugepage(struct mm_struct *mm) +{ + __account_replicated_data_size(mm, PMD_SIZE * replica_count); +} + +static inline void account_dereplicated_page(struct mm_struct *mm) +{ + __account_replicated_data_size(mm, -PAGE_SIZE * replica_count); +} + +static inline void account_dereplicated_hugepage(struct mm_struct *mm) +{ + __account_replicated_data_size(mm, -PMD_SIZE * replica_count); +} + +static inline void account_replicated_table(struct mm_struct *mm) +{ + __account_replicated_table_size(mm, PAGE_SIZE * replica_count); +} + +static inline void account_dereplicated_table(struct mm_struct *mm) +{ + __account_replicated_table_size(mm, -PAGE_SIZE * replica_count); +} + +static inline void __memcg_account_replicated_data_size(struct mem_cgroup *memcg, long long size) +{ + if (size == 0) + return; + + css_get(&memcg->css); + if (size > 0) { + this_cpu_add(*(memcg->replication_ctl->pcp_replicated_pages), size); + } else { + this_cpu_add(*(memcg->replication_ctl->pcp_dereplicated_pages), -size); + } + css_put(&memcg->css); +} + +static inline void memcg_account_replicated_pages(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + __memcg_account_replicated_data_size(memcg, PAGE_SIZE * nr_pages); +} + +static inline void memcg_account_dereplicated_pages(struct mem_cgroup *memcg, unsigned int nr_pages) +{ + __memcg_account_replicated_data_size(memcg, -PAGE_SIZE * nr_pages); +} + +static inline void memcg_account_replicated_pgtable_page(struct mm_struct *mm, void *pgtable) +{ + struct page *page = virt_to_page(pgtable); + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + css_get(&memcg->css); + BUG_ON(page->memcg_data); + + page->memcg_data = (unsigned long)memcg; + + + this_cpu_add(*(memcg->replication_ctl->pcp_replicated_tables), PAGE_SIZE); + + css_put(&memcg->css); + +} + +static inline void memcg_account_dereplicated_pgtable_page(void *pgtable) +{ + struct page *page = virt_to_page(pgtable); + struct mem_cgroup *memcg = __page_memcg(page); + + css_get(&memcg->css); + this_cpu_add(*(memcg->replication_ctl->pcp_replicated_tables), -PAGE_SIZE); + page->memcg_data = 0; + css_put(&memcg->css); +} + +static inline void memcg_account_replicated_p4d_page(struct mm_struct *mm, void *pgtable) +{ +#ifndef __PAGETABLE_P4D_FOLDED + memcg_account_replicated_pgtable_page(mm, pgtable); 
+#endif +} + +static inline void memcg_account_dereplicated_p4d_page(void *pgtable) +{ +#ifndef __PAGETABLE_P4D_FOLDED + memcg_account_dereplicated_pgtable_page(pgtable); +#endif +} + +static inline void memcg_account_replicated_pud_page(struct mm_struct *mm, void *pgtable) +{ +#ifndef __PAGETABLE_PUD_FOLDED + memcg_account_replicated_pgtable_page(mm, pgtable); +#endif +} + +static inline void memcg_account_dereplicated_pud_page(void *pgtable) +{ +#ifndef __PAGETABLE_PUD_FOLDED + memcg_account_dereplicated_pgtable_page(pgtable); +#endif +} + +static inline void memcg_account_replicated_pmd_page(struct mm_struct *mm, void *pgtable) +{ +#ifndef __PAGETABLE_PMD_FOLDED + memcg_account_replicated_pgtable_page(mm, pgtable); +#endif +} + +static inline void memcg_account_dereplicated_pmd_page(void *pgtable) +{ +#ifndef __PAGETABLE_PMD_FOLDED + memcg_account_dereplicated_pgtable_page(pgtable); +#endif +} + +static inline void memcg_account_replicated_pte_page(struct mm_struct *mm, void *pgtable) +{ + memcg_account_replicated_pgtable_page(mm, pgtable); +} + +static inline void memcg_account_dereplicated_pte_page(void *pgtable) +{ + memcg_account_dereplicated_pgtable_page(pgtable); +} + +static inline long long __total_replicated_data(unsigned long __percpu *pcp_replicated, + unsigned long __percpu *pcp_dereplicated) +{ + long long result = 0; + int cpu; + + for_each_possible_cpu(cpu) { + unsigned long *ptr = per_cpu_ptr(pcp_replicated, cpu); + + result += *ptr; + ptr = per_cpu_ptr(pcp_dereplicated, cpu); + result -= *ptr; + } + return result; +} + +static inline unsigned long total_replicated_data_bytes_mm(struct mm_struct *mm) +{ + return __total_replicated_data(mm->replication_ctl->pcp_replicated_pages, mm->replication_ctl->pcp_dereplicated_pages); +} + +static inline long long total_replicated_data_bytes_memecg(struct mem_cgroup *memcg) +{ + return __total_replicated_data(memcg->replication_ctl->pcp_replicated_pages, memcg->replication_ctl->pcp_dereplicated_pages); +} + +static inline unsigned long total_replicated_table_bytes_mm(struct mm_struct *mm) +{ + return __total_replicated_data(mm->replication_ctl->pcp_replicated_tables, mm->replication_ctl->pcp_dereplicated_tables); +} + +static inline long long total_replicated_table_bytes_memecg(struct mem_cgroup *memcg) +{ + return __total_replicated_data(memcg->replication_ctl->pcp_replicated_tables, memcg->replication_ctl->pcp_dereplicated_tables); +} + +static inline int alloc_numa_replication_ctl(struct mm_struct *mm) +{ + mm->replication_ctl = kmalloc(sizeof(struct numa_replication_control), GFP_KERNEL); + if (!mm->replication_ctl) + return -ENOMEM; + + mm->replication_ctl->owner = mm; + mm->replication_ctl->in_candidate_list = false; + mm->replication_ctl->user_replication_active = get_mem_cgroup_from_mm(mm)->replication_ctl->table_policy != TABLE_REPLICATION_NONE; + mm->replication_ctl->fork_policy = get_mem_cgroup_from_mm(mm)->replication_ctl->fork_policy; + mm->replication_ctl->table_policy = get_mem_cgroup_from_mm(mm)->replication_ctl->table_policy; + mm->replication_ctl->data_policy = get_mem_cgroup_from_mm(mm)->replication_ctl->data_policy; + spin_lock_init(&mm->replication_ctl->lock); + init_rwsem(&mm->replication_ctl->rmap_lock); + + mm->replication_ctl->pcp_replicated_pages = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!mm->replication_ctl->pcp_replicated_pages) + goto fail1; + + mm->replication_ctl->pcp_dereplicated_pages = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if 
(!mm->replication_ctl->pcp_dereplicated_pages) + goto fail2; + + mm->replication_ctl->pcp_replicated_tables = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!mm->replication_ctl->pcp_replicated_tables) + goto fail3; + + + mm->replication_ctl->pcp_dereplicated_tables = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!mm->replication_ctl->pcp_dereplicated_tables) + goto fail4; + + return 0; + +fail4: + free_percpu(mm->replication_ctl->pcp_replicated_tables); +fail3: + free_percpu(mm->replication_ctl->pcp_dereplicated_pages); +fail2: + free_percpu(mm->replication_ctl->pcp_replicated_pages); +fail1: + kfree(mm->replication_ctl); + mm->replication_ctl = NULL; + + return -ENOMEM; + +} + +void free_numa_replication_ctl(struct mm_struct *mm); + +int numa_replication_init_sysfs(void); + +void numa_replication_add_candidate(struct mm_struct *mm); + +int numa_replicate_pgtables_vma(struct vm_area_struct *vma); + +int numa_dispatch_table_replication_request(struct mm_struct *mm, int cmd); +int numa_dispatch_data_replication_request(struct mm_struct *mm, int cmd); + +void numa_replication_post_mprotect(struct vm_area_struct *vma); + +static inline bool vma_want_table_replica(struct vm_area_struct *vma) +{ + struct mm_struct *mm = vma->vm_mm; + table_replication_policy_t policy = get_table_replication_policy(mm); + + if (policy == TABLE_REPLICATION_ALL) + return true; + + if (policy == TABLE_REPLICATION_MINIMAL && vma_replica_candidate(vma)) + return true; + + return false; +} + +static inline void numa_mprotect_vm_flags_modify(unsigned long *newflags, struct vm_area_struct *vma) +{ + table_replication_policy_t table_policy = get_table_replication_policy(vma->vm_mm); + data_replication_policy_t data_policy = get_data_replication_policy(vma->vm_mm); + + switch (table_policy) { + case TABLE_REPLICATION_MINIMAL: { + if (__vm_flags_replica_candidate(vma, *newflags)) + *newflags |= VM_REPLICA_INIT; + break; + } + case TABLE_REPLICATION_ALL: { + *newflags |= VM_REPLICA_INIT; + break; + } + case TABLE_REPLICATION_NONE: { + return; + } + default: + BUG(); + } + + switch (data_policy) { + case DATA_REPLICATION_NONE: { + return; + } + case DATA_REPLICATION_ON_DEMAND: + case DATA_REPLICATION_ALL_MAPPED_ON_DEMAND: + case DATA_REPLICATION_ALL: { + if (__vm_flags_replica_candidate(vma, *newflags)) + *newflags |= VM_REPLICA_COMMIT; + break; + } + default: + BUG(); + } + + return; +} + +#else /* !CONFIG_USER_REPLICATION */ + +struct pgtable_private { + pte_t **pte_numa; + struct page **replica_pages; +}; + +static inline void pgtable_pte_step(struct pgtable_private *zp, int nr) { } +static inline void pgtable_update_pte(struct pgtable_private *zp, pte_t *pte) { } + +static inline pmd_t *get_master_pmd(pmd_t *pmd) +{ + return pmd; +} + +static inline int numa_is_vma_replicant(struct vm_area_struct *vma) +{ + return 0; +} + +static inline bool vma_has_replicas(struct vm_area_struct *vma) +{ + return 0; +} + +static inline bool vma_might_be_replicated(struct vm_area_struct *vma) +{ + return 0; +} + +static inline int numa_replication_init_sysfs(void) +{ + return 0; +} + +static inline void mm_free_numa_stats(struct mm_struct *mm) { } + +static inline int mm_init_numa_stats(struct mm_struct *mm) +{ + return 0; +} + +static inline int alloc_numa_replication_ctl(struct mm_struct *mm) +{ + return 0; +} + +static inline data_replication_policy_t get_data_replication_policy(struct mm_struct *mm) +{ + return DATA_REPLICATION_NONE; +} + +#define pte_clear_replicated pte_clear +#define 
set_pte_at_notify_replicated set_pte_at_notify +#define set_pte_at_replicated set_pte_at +#define ptep_clear_flush_replicated ptep_clear_flush +#define ptep_clear_flush_young_notify_replicated ptep_clear_flush_young_notify +#define ptep_clear_flush_notify_replicated ptep_clear_flush_notify +#define ptep_clear_young_notify_replicated ptep_clear_young_notify +#define ptep_get_and_clear_replicated ptep_get_and_clear +#define ptep_get_and_clear_full_replicated ptep_get_and_clear_full +#define pte_clear_not_present_full_replicated pte_clear_not_present_full +#define ptep_set_wrprotect_replicated ptep_set_wrprotect +#define ptep_modify_prot_start_replicated ptep_modify_prot_start +#define ptep_modify_prot_commit_replicated ptep_modify_prot_commit +#define set_huge_pte_at_replicated set_huge_pte_at +#define huge_ptep_set_access_flags_replicated huge_ptep_set_access_flags +#define huge_ptep_clear_flush_replicated huge_ptep_clear_flush +#define set_huge_swap_pte_at_replicated set_huge_swap_pte_at +#define huge_ptep_modify_prot_start_replicated huge_ptep_modify_prot_start +#define huge_ptep_modify_prot_commit_replicated huge_ptep_modify_prot_commit +#define huge_pte_clear_replicated huge_pte_clear +#define huge_ptep_get_and_clear_replicated huge_ptep_get_and_clear +#define huge_ptep_set_wrprotect_replicated huge_ptep_set_wrprotect +#define ptep_set_access_flags_replicated ptep_set_access_flags +#define pmd_clear_replicated pmd_clear +#define huge_pte_alloc_copy_tables(dst, src, addr, sz) huge_pte_alloc(dst, addr, sz) +#define huge_pte_alloc_replica(mm, vma, addr, sz) huge_pte_alloc(mm, addr, sz) + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define pmdp_huge_get_and_clear_replicated pmdp_huge_get_and_clear +#define pmdp_set_wrprotect_replicated pmdp_set_wrprotect +#endif + +#define pmdp_set_access_flags_replicated pmdp_set_access_flags +#define huge_pmd_set_accessed_replicated huge_pmd_set_accessed +#define set_pmd_at_replicated set_pmd_at +#define pmdp_invalidate_replicated pmdp_invalidate +#define pud_clear_replicated pud_clear + +#endif /* CONFIG_USER_REPLICATION */ + +#endif /* _LINUX_NUMA_USER_REPLICATION_H */ \ No newline at end of file diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3ba66946e3732..4cb268d6c3913 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -155,7 +155,6 @@ enum pageflags { PG_reserve_pgflag_0, PG_reserve_pgflag_1, #endif - __NR_PAGEFLAGS, /* Filesystems */ @@ -191,6 +190,10 @@ enum pageflags { /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, + +#ifdef CONFIG_KERNEL_REPLICATION + PG_replicated = PG_reserve_pgflag_0, +#endif }; #ifndef __GENERATING_BOUNDS_H @@ -545,6 +548,13 @@ PAGEFLAG(Idle, idle, PF_ANY) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) +#ifdef CONFIG_KERNEL_REPLICATION +PAGEFLAG(Replicated, replicated, PF_ANY) +#else +PAGEFLAG_FALSE(Replicated) +#endif + + /* * PagePool() is used to track page allocated from hpool. */ @@ -919,6 +929,12 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) #define __PG_MLOCKED 0 #endif +#ifdef CONFIG_KERNEL_REPLICATION +#define __PG_REPLICATED (1UL << PG_replicated) +#else +#define __PG_REPLICATED 0 +#endif + /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. 
@@ -928,7 +944,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_unevictable | __PG_MLOCKED) + 1UL << PG_unevictable | __PG_REPLICATED | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f924468d84ec4..8ca678c7e8619 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1264,6 +1264,17 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +/* + * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. + * If we check pmd_trans_unstable() first we will trip the bad_pmd() check + * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly + * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. + */ +static inline int pmd_devmap_trans_unstable(pmd_t *pmd) +{ + return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); +} + #ifndef CONFIG_NUMA_BALANCING /* * Technically a PTE can be PROTNONE even when not doing NUMA balancing but diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 9a3e5baaa47b9..47ce322edca20 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -77,7 +77,6 @@ struct vm_struct { KABI_EXTEND(int node) KABI_EXTEND(bool replicated) #endif - }; struct vmap_area { @@ -152,8 +151,8 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); #ifdef CONFIG_KERNEL_REPLICATION /* - * DO NOT USE this function if you don't understand what it is doing - */ + * DO NOT USE this function if you don't understand what it is doing + */ int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags); #ifdef CONFIG_ARM64 diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 81209f667a63f..dd07c6a02172b 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -317,6 +317,105 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->change_ownership) ); +DECLARE_EVENT_CLASS(mm_ureplica_cost, + TP_PROTO(ktime_t time), + TP_ARGS(time), + + TP_STRUCT__entry( + __field(ktime_t, time) + ), + + TP_fast_assign( + __entry->time = ktime_to_ns(time); + ), + + TP_printk("time=%lldns", __entry->time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_handle_mm_fault, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_set_pte_at, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_pte_clear, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_pte_get_and_clear, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_ptep_test_and_clear_young, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_ptep_set_wrprotect, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_ptep_set_access_flags, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_ptep_modify_prot_start, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_handle_pte_fault, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_do_anonymous_page, + TP_PROTO(ktime_t time), + 
TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_do_fault, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_do_swap_page, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_do_numa_page, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_do_wp_page, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_fault_p4d_alloc, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_fault_pud_alloc, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); + +DEFINE_EVENT(mm_ureplica_cost, mm_ureplica_cost_fault_pmd_alloc, + TP_PROTO(ktime_t time), + TP_ARGS(time) +); /* * Required for uniquely and securely identifying mm in rss_stat tracepoint. */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index f32e6b3775ea1..e7d07d42b0098 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -18,6 +18,8 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ +#define PROT_REPLICA 0x200000 /* VM_REPLICA_COMMIT make replicated pte entries to point to copied numa-local physical pages */ + /* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ @@ -30,6 +32,7 @@ #define MAP_HUGETLB 0x040000 /* create a huge page mapping */ #define MAP_SYNC 0x080000 /* perform synchronous page faults for the mapping */ #define MAP_FIXED_NOREPLACE 0x100000 /* MAP_FIXED which doesn't unmap underlying mapping */ +#define MAP_REPLICA 0x200000 /* VM_REPLICA_INIT */ #define MAP_UNINITIALIZED 0x4000000 /* For anonymous mmap, memory could be * uninitialized */ diff --git a/init/main.c b/init/main.c index a51e0c1dc54f7..dc6a7ff1c6c1e 100644 --- a/init/main.c +++ b/init/main.c @@ -99,7 +99,7 @@ #include <linux/kcsan.h> #include <linux/init_syscalls.h> #include <linux/randomize_kstack.h> -#include <linux/numa_replication.h> +#include <linux/numa_user_replication.h> #include <asm/io.h> #include <asm/setup.h> @@ -819,6 +819,7 @@ static void __init report_meminit(void) pr_info("mem auto-init: clearing system memory may take some time...\n"); } +void __weak preallocate_vmalloc_pages(void) { } /* * Set up kernel memory allocators */ @@ -839,6 +840,7 @@ static void __init mm_init(void) kmemleak_init(); pgtable_init(); debug_objects_mem_init(); + preallocate_vmalloc_pages(); vmalloc_init(); /* Should be run before the first non-init thread is created */ init_espfix_bsp(); @@ -1461,6 +1463,7 @@ static int __ref kernel_init(void *unused) */ numa_replicate_kernel_rodata(); numa_replication_fini(); + numa_replication_init_sysfs(); /* * Kernel mappings are now finalized - update the userspace page-table * to finalize PTI. 
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 22d39f82a0aa6..f7fc01d092620 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -333,7 +333,7 @@ static void cgroup_idr_remove(struct idr *idr, int id) spin_unlock_bh(&cgroup_idr_lock); } -static bool cgroup_has_tasks(struct cgroup *cgrp) +bool cgroup_has_tasks(struct cgroup *cgrp) { return cgrp->nr_populated_csets; } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 8a6c5ee003c2e..91ef0deb997cf 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -27,6 +27,7 @@ #include <linux/task_work.h> #include <linux/shmem_fs.h> #include <linux/khugepaged.h> +#include <linux/numa_user_replication.h> #include <linux/uprobes.h> @@ -197,10 +198,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, reliable_page_counter(old_page, mm, -1); flush_cache_page(vma, addr, pte_pfn(*pvmw.pte)); - ptep_clear_flush_notify(vma, addr, pvmw.pte); + ptep_clear_flush_notify_replicated(vma, addr, pvmw.pte); if (new_page) - set_pte_at_notify(mm, addr, pvmw.pte, - mk_pte(new_page, vma->vm_page_prot)); + set_pte_at_notify_replicated(mm, addr, pvmw.pte, + mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, false); if (!page_mapped(old_page)) diff --git a/kernel/fork.c b/kernel/fork.c index 9b1ea79deaa52..53983a0722996 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -101,6 +101,8 @@ #ifdef CONFIG_QOS_SCHED_SMART_GRID #include <linux/sched/grid_qos.h> #endif +#include <linux/numa_user_replication.h> + #include <linux/share_pool.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> @@ -607,6 +609,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_parent = &tmp->vm_rb; mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); @@ -697,6 +700,10 @@ void __mmdrop(struct mm_struct *mm) BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); WARN_ON_ONCE(mm == current->active_mm); +#ifdef CONFIG_USER_REPLICATION + free_numa_replication_ctl(mm); +#endif + mm_free_numa_stats(mm); mm_free_pgd(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); @@ -1097,6 +1104,21 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_uprobes_state(mm); hugetlb_count_init(mm); +#ifdef CONFIG_USER_REPLICATION + /* + * Hack, to prevent use after free in case of ENOMEM + */ + mm->replication_ctl = NULL; + mm->context_switch_stats = NULL; + mm->pgd = NULL; + mm->pgd_numa = NULL; +#endif + + if (mm_init_numa_stats(mm)) + goto fail_nopgd; + if (alloc_numa_replication_ctl(mm)) + goto fail_nopgd; + if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; diff --git a/kernel/module.c b/kernel/module.c index 0e1b8e91a45ea..d55f4afd87bad 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2238,7 +2238,9 @@ void __weak module_arch_freeing_init(struct module *mod) /* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { - struct module_layout *mut_layout = mod->mutable_data_layout; + /* This memory must be freed after module structure is freed */ + struct module_layout *mut_data = mod->mutable_data_layout; + trace_module_free(mod); mod_sysfs_teardown(mod); @@ -2287,7 +2289,7 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); module_memfree(mod->mutable_data_layout->base); - kfree(mut_layout); + kfree(mut_data); } void *__symbol_get(const char *symbol) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5fc8d6a25b9ad..d9d67314df2f1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -32,6 +32,8 @@ #include <linux/bpf_sched.h> #include <linux/mem_sampling.h> +#include <linux/numa_user_replication.h> + /* * Targeted preemption latency for CPU-bound tasks: * @@ -2849,7 +2851,9 @@ static void task_numa_work(struct callback_head *work) if (!pages) return; - +#ifdef CONFIG_USER_REPLICATION + numa_accumulate_switches(mm); +#endif if (!mmap_read_trylock(mm)) return; vma = find_vma(mm, start); @@ -2859,9 +2863,8 @@ static void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(vma) || - is_vm_hugetlb_page(vma) || is_cdm_vma(vma) || - (vma->vm_flags & VM_MIXEDMAP)) { + if (!vma_migratable(vma) || (!vma_policy_mof(vma) && !(vma_has_replicas(vma) && (get_data_replication_policy(vma->vm_mm) != DATA_REPLICATION_NONE))) || + is_vm_hugetlb_page(vma) || is_cdm_vma(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; } @@ -2871,8 +2874,9 @@ static void task_numa_work(struct callback_head *work) * hinting faults in read-only file-backed mappings or the vdso * as migrating the pages will be of marginal benefit. */ - if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + if ((!vma->vm_mm || + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) && + !(vma->vm_mm && (get_data_replication_policy(vma->vm_mm) != DATA_REPLICATION_NONE))) continue; /* diff --git a/mm/Kconfig b/mm/Kconfig index 25bd538856a5e..b1666b6d90bd1 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1058,6 +1058,29 @@ config KERNEL_REPLICATION If unsure, say "n". +config USER_REPLICATION + bool "Infrastructure for userspace replication between NUMA nodes" + default n + depends on KERNEL_REPLICATION + select ARCH_USES_HIGH_VMA_FLAGS + + help + Provide interfaces for per-NUMA-node replication of selected userspace mappings. + It is primarily intended for text, ro-data and ro-after-init data. This feature + does not support THP yet, so even with madvise(2) replicated pages should not be + the target for THP. + + If unsure, say "n". 
+ +config USER_REPLICATION_DEBUG + bool "Add tracepoints to measure replication costs" + default n + depends on USER_REPLICATION + + help + Add tracepoints to measure replication costs + + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 259b312f717a4..ae94d8f91a83f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -134,5 +134,5 @@ obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o -obj-$(CONFIG_KERNEL_REPLICATION) += numa_replication.o - +obj-$(CONFIG_KERNEL_REPLICATION) += numa_kernel_replication.o +obj-$(CONFIG_USER_REPLICATION) += numa_user_replication.o diff --git a/mm/gup.c b/mm/gup.c index 328e38bb1d1f3..8a9bf9cc3a836 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -17,6 +17,7 @@ #include <linux/migrate.h> #include <linux/mm_inline.h> #include <linux/sched/mm.h> +#include <linux/numa_user_replication.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> @@ -453,7 +454,7 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, entry = pte_mkyoung(entry); if (!pte_same(*pte, entry)) { - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at_replicated(vma->vm_mm, address, pte, entry); update_mmu_cache(vma, address, pte); } } @@ -530,6 +531,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, } page = vm_normal_page(vma, address, pte); + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN @@ -602,7 +604,10 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, /* Do not mlock pte-mapped THP */ if (PageTransCompound(page)) goto out; - + if (page && PageReplicated(compound_head(page))) { + page = ERR_PTR(-EEXIST); + goto out; + } /* * The preliminary mapping check is mainly to avoid the * pointless overhead of lock_page on the ZERO_PAGE @@ -881,6 +886,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct page *page; page = follow_page_mask(vma, address, foll_flags, &ctx); + + if (foll_flags & FOLL_MLOCK) + BUG_ON(page && !IS_ERR(page) && PageReplicated(compound_head(page))); + if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); return page; @@ -1147,10 +1156,20 @@ static long __get_user_pages(struct mm_struct *mm, goto next_page; } - if (!vma || check_vma_flags(vma, gup_flags)) { + if (!vma) { ret = -EFAULT; goto out; } + + /* + * TODO: It seems to me that we cannot (and should not) pin replicated memory. + * Add a check in vma if memory has already been replicated. 
+ */ + + ret = check_vma_flags(vma, gup_flags); + if (ret) + goto out; + if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, &start, &nr_pages, i, @@ -1180,6 +1199,11 @@ static long __get_user_pages(struct mm_struct *mm, cond_resched(); page = follow_page_mask(vma, start, foll_flags, &ctx); + + if ((foll_flags & FOLL_MLOCK) && page && !IS_ERR(page) && PageReplicated(compound_head(page))) { + pr_warn("FOLL_MLOCK requested on a replicated page\n"); + BUG_ON(page && !IS_ERR(page) && PageReplicated(compound_head(page))); + } if (!page) { ret = faultin_page(vma, start, &foll_flags, locked); switch (ret) { @@ -1567,6 +1591,11 @@ int do_mm_populate(struct mm_struct *mm, unsigned long start, unsigned long len, } break; } +#ifdef CONFIG_USER_REPLICATION + if (get_data_replication_policy(mm) == DATA_REPLICATION_ALL && vma_might_be_replicated(vma)) { + phys_duplicate(vma, nstart, nstart - nend); + } +#endif nend = nstart + ret * PAGE_SIZE; ret = 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2825c5390fe91..695662cf6046f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -34,6 +34,7 @@ #include <linux/numa.h> #include <linux/page_owner.h> #include <linux/dynamic_hugetlb.h> +#include <linux/numa_user_replication.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -682,8 +683,8 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr, true); lru_cache_add_inactive_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, get_master_pmd(vmf->pmd), pgtable); + set_pmd_at_replicated(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); reliable_page_counter(page, vma->vm_mm, HPAGE_PMD_NR); mm_inc_nr_ptes(vma->vm_mm); @@ -712,7 +713,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, * available * never: never stall for any thp allocation */ -static inline gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) +gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma) { const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE); @@ -749,8 +750,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, haddr, pmd, entry); + pgtable_trans_huge_deposit(mm, get_master_pmd(pmd), pgtable); + set_pmd_at_replicated(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); return true; } @@ -1057,6 +1058,185 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, return page; } +#ifdef CONFIG_USER_REPLICATION + +/* + * A copy-paste of the generic copy_huge_pmd(), which is better than 50 ifdefs inside a single function + */ + +int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) +{ + spinlock_t *dst_ptl, *src_ptl; + struct page *src_page; + pmd_t pmd; + pgtable_t pgtable = NULL; + int ret = -ENOMEM; + bool page_replicated = false; + struct page *src_page_numa[MAX_NUMNODES]; + pgtable_t pgtable_numa[MAX_NUMNODES]; + unsigned long offset; + bool start; + struct page *curr; + pmd_t *curr_pmd; + int nid; + + /* Skip if it can be re-filled on fault */ + if (!vma_is_anonymous(dst_vma)) + return 0; + + for_each_memory_node(nid) { 
pgtable_numa[nid] = pte_alloc_one_node(nid, dst_mm); + if (unlikely(!pgtable_numa[nid])) + goto out; + } + + pgtable = pgtable_numa[first_memory_node]; + + dst_ptl = pmd_lock(dst_mm, dst_pmd); + src_ptl = pmd_lockptr(src_mm, src_pmd); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + + ret = -EAGAIN; + pmd = *src_pmd; + +#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION + if (unlikely(is_swap_pmd(pmd))) { + swp_entry_t entry = pmd_to_swp_entry(pmd); + + VM_BUG_ON(!is_pmd_migration_entry(pmd)); + if (is_write_migration_entry(entry)) { + make_migration_entry_read(&entry); + pmd = swp_entry_to_pmd(entry); + if (pmd_swp_soft_dirty(*src_pmd)) + pmd = pmd_swp_mksoft_dirty(pmd); + if (pmd_swp_uffd_wp(*src_pmd)) + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at_replicated(src_mm, addr, src_pmd, pmd); + } + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, get_master_pmd(dst_pmd), pgtable); + pgtable_numa[first_memory_node] = NULL; + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_swp_clear_uffd_wp(pmd); + set_pmd_at_replicated(dst_mm, addr, dst_pmd, pmd); + ret = 0; + goto out_unlock; + } +#endif + + if (unlikely(!pmd_trans_huge(pmd))) { + goto out_unlock; + } + /* + * When page table lock is held, the huge zero pmd should not be + * under splitting since we don't split the page itself, only pmd to + * a page table. + */ + if (is_huge_zero_pmd(pmd)) { + /* + * get_huge_zero_page() will never allocate a new page here, + * since we already have a zero page to copy. It just takes a + * reference. + */ + mm_get_huge_zero_page(dst_mm); + goto out_zero_page; + } + + src_page = pmd_page(pmd); + VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + page_replicated = PageReplicated(src_page); + + if (page_replicated) { + for_each_pgtable(curr, curr_pmd, src_pmd, nid, offset, start) { + src_page_numa[nid] = pmd_page(*curr_pmd); + } + } + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size. Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. 
+ */ + if (unlikely(is_cow_mapping(src_vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + for_each_memory_node(nid) { + if (pgtable_numa[nid] != NULL) { + pte_free(dst_mm, pgtable_numa[nid]); + pgtable_numa[nid] = NULL; + } + } + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(src_vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + + if (page_replicated) { + for_each_memory_node(nid) { + get_page(src_page_numa[nid]); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + atomic_inc(compound_mapcount_ptr(src_page_numa[nid])); + } + } else { + get_page(src_page); + page_dup_rmap(src_page, true); + add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); + } +out_zero_page: + + if (page_replicated) { + for_each_pgtable(curr, curr_pmd, dst_pmd, nid, offset, start) { + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, curr_pmd, pgtable_numa[nid]); + pgtable_numa[nid] = NULL; + } + } else { + mm_inc_nr_ptes(dst_mm); + pgtable_trans_huge_deposit(dst_mm, get_master_pmd(dst_pmd), pgtable); + pgtable_numa[first_memory_node] = NULL; + } + + pmdp_set_wrprotect_replicated(src_mm, addr, src_pmd); + + if (page_replicated) { + for_each_pgtable(curr, curr_pmd, dst_pmd, nid, offset, start) { + pmd_t pmde = mk_huge_pmd(src_page_numa[nid], dst_vma->vm_page_prot); + + if (!userfaultfd_wp(dst_vma)) + pmde = pmd_clear_uffd_wp(pmde); + pmde = pmd_mkold(pmd_wrprotect(pmde)); + set_pmd_at(dst_mm, addr, curr_pmd, pmde); + } + } else { + if (!userfaultfd_wp(dst_vma)) + pmd = pmd_clear_uffd_wp(pmd); + pmd = pmd_mkold(pmd_wrprotect(pmd)); + set_pmd_at_replicated(dst_mm, addr, dst_pmd, pmd); + } + + ret = 0; +out_unlock: + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + + for_each_memory_node(nid) { + if (pgtable_numa[nid] != NULL) { + pte_free(dst_mm, pgtable_numa[nid]); + pgtable_numa[nid] = NULL; + } + } +out: + return ret; +} + +#else /* !CONFIG_USER_REPLICATION */ + int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) @@ -1167,6 +1347,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, return ret; } +#endif /* CONFIG_USER_REPLICATION */ + #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD static void touch_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags) @@ -1363,7 +1545,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + if (pmdp_set_access_flags_replicated(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); spin_unlock(vmf->ptl); @@ -1438,7 +1620,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * for file pages we set it in page_add_file_rmap(), which * requires page to be locked. 
*/ - + if (PageReplicated(compound_head(page))) { + page = ERR_PTR(-EEXIST); + goto out; + } if (PageAnon(page) && compound_mapcount(page) != 1) goto skip_mlock; if (PageDoubleMap(page) || !page->mapping) @@ -1457,6 +1642,41 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return page; } +#ifdef CONFIG_USER_REPLICATION + +static int numa_replicate_hugepage(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned long start = vmf->address & HPAGE_PMD_MASK; + int error = 0; + + mmap_assert_locked(mm); + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_REPLICA_COMMIT))) + goto out; + + /* + * This should not be possible, + * because we have just handled page fault up to pmd level, + * so pmd tables must exist and be replicated. + */ + BUG_ON(!numa_pgtable_replicated(vmf->pmd)); + + if (phys_duplicate_huge_pmd(vma, vmf->pmd, start, start + HPAGE_SIZE)) { + error = -ENOMEM; + goto out; + } + pr_info("Successfully replicated THP on balancer -- start:%zx; len:%zx PID: %d NAME: %s\n", + start, HPAGE_SIZE, vma->vm_mm->owner->pid, vma->vm_mm->owner->comm); + flush_tlb_range(vma, start, start + HPAGE_SIZE); + +out: + return error; +} + +#endif + /* NUMA hinting page fault entry point for trans huge pmds */ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) { @@ -1471,6 +1691,16 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) bool was_writable = pmd_savedwrite(oldpmd); int flags = 0; +#ifdef CONFIG_USER_REPLICATION + if (get_data_replication_policy(vma->vm_mm) != DATA_REPLICATION_NONE) { + if (vma_might_be_replicated(vma)) { + if (!numa_replicate_hugepage(vmf)) { + return 0; + } + } + } +#endif + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) { spin_unlock(vmf->ptl); @@ -1498,7 +1728,9 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) spin_unlock(vmf->ptl); + BUG_ON(page && PageReplicated(compound_head(page))); migrated = migrate_misplaced_page(page, vma, target_nid); + if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1525,7 +1757,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf) pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + set_pmd_at_replicated(vma->vm_mm, haddr, vmf->pmd, pmd); update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); spin_unlock(vmf->ptl); goto out; @@ -1561,6 +1793,10 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } page = pmd_page(orig_pmd); + + if (PageReplicated(page)) + goto out; + /* * If other processes are mapping this page, we couldn't discard * the page unless they all do MADV_FREE so let's skip the page. 
@@ -1589,11 +1825,11 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, unlock_page(page); if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { - pmdp_invalidate(vma, addr, pmd); + pmdp_invalidate_replicated(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); orig_pmd = pmd_mkclean(orig_pmd); - set_pmd_at(mm, addr, pmd, orig_pmd); + set_pmd_at_replicated(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } @@ -1605,11 +1841,17 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } -static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, pmd); + +#ifdef CONFIG_USER_REPLICATION + /* For now, clean this list from everything */ + pgtable->replica_list_head.first = NULL; +#endif + pte_free(mm, pgtable); mm_dec_nr_ptes(mm); } @@ -1618,8 +1860,20 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { pmd_t orig_pmd; +#ifdef CONFIG_USER_REPLICATION + pmd_t orig_pmd_numa[MAX_NUMNODES]; + int nid; + struct page *curr; + pmd_t *curr_pmd; + unsigned long offset; + bool start; + bool pmd_replicated = numa_pgtable_replicated(pmd); + bool page_replicated = false; +#endif spinlock_t *ptl; + + tlb_change_page_size(tlb, HPAGE_PMD_SIZE); ptl = __pmd_trans_huge_lock(pmd, vma); @@ -1631,17 +1885,26 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, * pgtable_trans_huge_withdraw after finishing pmdp related * operations. */ - orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, +#ifdef CONFIG_USER_REPLICATION + if (pmd_replicated) { + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + orig_pmd_numa[nid] = pmdp_huge_get_and_clear_full(vma, addr, curr_pmd, + tlb->fullmm); + } + orig_pmd = orig_pmd_numa[first_memory_node]; + } else +#endif + orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd, tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_special_huge(vma)) { if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); + zap_deposited_table(tlb->mm, get_master_pmd(pmd)); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { - zap_deposited_table(tlb->mm, pmd); + zap_deposited_table(tlb->mm, get_master_pmd(pmd)); spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { @@ -1650,8 +1913,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (pmd_present(orig_pmd)) { page = pmd_page(orig_pmd); - reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); - page_remove_rmap(page, true); + +#ifdef CONFIG_USER_REPLICATION + page_replicated = PageReplicated(page); + if (page_replicated) { + for_each_memory_node(nid) { + dec_compound_mapcount(pmd_page(orig_pmd_numa[nid])); + reliable_page_counter(pmd_page(orig_pmd_numa[nid]), tlb->mm, -HPAGE_PMD_NR); + } + } else +#endif + { + reliable_page_counter(page, tlb->mm, -HPAGE_PMD_NR); + page_remove_rmap(page, true); + } + VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); } else if (thp_migration_supported()) { @@ -1664,18 +1940,34 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, } else WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!"); +#ifdef CONFIG_USER_REPLICATION + if (page_replicated) { + for_each_pgtable(curr, 
curr_pmd, pmd, nid, offset, start) { + zap_deposited_table(tlb->mm, curr_pmd); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + } + } else +#endif if (PageAnon(page)) { - zap_deposited_table(tlb->mm, pmd); + zap_deposited_table(tlb->mm, get_master_pmd(pmd)); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) - zap_deposited_table(tlb->mm, pmd); + zap_deposited_table(tlb->mm, get_master_pmd(pmd)); add_mm_counter(tlb->mm, mm_counter_file(page), -HPAGE_PMD_NR); } spin_unlock(ptl); - if (flush_needed) - tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + if (flush_needed) { +#ifdef CONFIG_USER_REPLICATION + if (page_replicated) { + for_each_memory_node(nid) + tlb_remove_page_size(tlb, pmd_page(orig_pmd_numa[nid]), HPAGE_PMD_SIZE); + } else +#endif + tlb_remove_page_size(tlb, page, HPAGE_PMD_SIZE); + + } } return 1; } @@ -1713,7 +2005,12 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, pmd_t pmd; struct mm_struct *mm = vma->vm_mm; bool force_flush = false; - +#ifdef CONFIG_USER_REPLICATION + bool old_pmd_replicated = numa_pgtable_replicated(old_pmd); + bool new_pmd_replicated = numa_pgtable_replicated(new_pmd); + int nid; + pgtable_t deposit_ptes[MAX_NUMNODES] = {}; +#endif /* * The destination pmd shouldn't be established, free_pgtables() * should have release it. @@ -1723,6 +2020,17 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, return false; } +#ifdef CONFIG_USER_REPLICATION + /* + * Do it here to avoid alloc_pages under spinlock + */ + if (!old_pmd_replicated && new_pmd_replicated) { + for_each_memory_node(nid) { + deposit_ptes[nid] = pte_alloc_one_node(nid, vma->vm_mm); + } + } +#endif + /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -1732,28 +2040,116 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); - pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd); + + pmd = pmdp_huge_get_and_clear_replicated(mm, old_addr, old_pmd); + if (pmd_present(pmd)) force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; +#ifdef CONFIG_USER_REPLICATION + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + bool start; + /* + * Why do we need all of this? + * If transparent_huge_pmd is used the PMD level page contains + * preallocated pte level tables, which are used in case if we want to split hugepage. + * Because of this, we need to carefully zap/alloc pte tables + * if src and dst pagetables don't have similar structure in terms of replicated levels + * TODO think how to do it better later... 
+ */ + + if (old_pmd_replicated && new_pmd_replicated) { + for_each_pgtable(curr, curr_pmd, old_pmd, nid, offset, start) { + deposit_ptes[nid] = pgtable_trans_huge_withdraw(mm, curr_pmd); + } + for_each_pgtable(curr, curr_pmd, new_pmd, nid, offset, start) { + pgtable_trans_huge_deposit(mm, curr_pmd, deposit_ptes[nid]); + deposit_ptes[nid] = NULL; + } + } else if (!old_pmd_replicated && new_pmd_replicated) { + zap_deposited_table(mm, old_pmd); + + for_each_pgtable(curr, curr_pmd, new_pmd, nid, offset, start) { + pgtable_trans_huge_deposit(mm, curr_pmd, deposit_ptes[nid]); + deposit_ptes[nid] = NULL; + } + } else if (old_pmd_replicated && !new_pmd_replicated) { + + for_each_pgtable(curr, curr_pmd, old_pmd, nid, offset, start) { + if (nid == first_memory_node) { + pgtable = pgtable_trans_huge_withdraw(mm, get_master_pmd(curr_pmd)); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } else { + zap_deposited_table(mm, curr_pmd); + } + } + + } else if (!old_pmd_replicated && !new_pmd_replicated) { + pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); + pgtable_trans_huge_deposit(mm, new_pmd, pgtable); + } + +#else pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); +#endif } pmd = move_soft_dirty_pmd(pmd); - set_pmd_at(mm, new_addr, new_pmd, pmd); + + set_pmd_at_replicated(mm, new_addr, new_pmd, pmd); + if (force_flush) flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); +#ifdef CONFIG_USER_REPLICATION + for_each_memory_node(nid) { + if (deposit_ptes[nid]) + pte_free(mm, deposit_ptes[nid]); + } +#endif return true; } +#ifdef CONFIG_USER_REPLICATION + for_each_memory_node(nid) { + if (deposit_ptes[nid]) + pte_free(mm, deposit_ptes[nid]); + } +#endif return false; } +static void change_huge_pmd_entry(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, + pgprot_t newprot, bool preserve_write, bool uffd_wp, bool uffd_wp_resolve) +{ + pmd_t entry = pmdp_invalidate(vma, addr, pmd); + struct mm_struct *mm = vma->vm_mm; + + entry = pmd_modify(entry, newprot); + if (preserve_write) + entry = pmd_mk_savedwrite(entry); + if (uffd_wp) { + entry = pmd_wrprotect(entry); + entry = pmd_mkuffd_wp(entry); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled by PF interrupt + * handler, then things like COW could be properly + * handled. + */ + entry = pmd_clear_uffd_wp(entry); + } + + set_pmd_at(mm, addr, pmd, entry); + BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); +} + /* * Returns * - 0 if PMD could not be locked @@ -1766,7 +2162,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, { struct mm_struct *mm = vma->vm_mm; spinlock_t *ptl; - pmd_t entry; + bool preserve_write; int ret; bool prot_numa = cp_flags & MM_CP_PROT_NUMA; @@ -1800,7 +2196,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, newpmd = pmd_swp_mksoft_dirty(newpmd); if (pmd_swp_uffd_wp(*pmd)) newpmd = pmd_swp_mkuffd_wp(newpmd); - set_pmd_at(mm, addr, pmd, newpmd); + set_pmd_at_replicated(mm, addr, pmd, newpmd); } goto unlock; } @@ -1817,6 +2213,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (prot_numa && pmd_protnone(*pmd)) goto unlock; + if (prot_numa && PageReplicated(pmd_page(*pmd))) + goto unlock; + /* * In case prot_numa, we are under mmap_read_lock(mm). 
It's critical * to not clear pmd intermittently to avoid race with MADV_DONTNEED @@ -1838,25 +2237,21 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = pmdp_invalidate(vma, addr, pmd); + change_huge_pmd_entry(vma, addr, pmd, newprot, preserve_write, uffd_wp, uffd_wp_resolve); - entry = pmd_modify(entry, newprot); - if (preserve_write) - entry = pmd_mk_savedwrite(entry); - if (uffd_wp) { - entry = pmd_wrprotect(entry); - entry = pmd_mkuffd_wp(entry); - } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled by PF interrupt - * handler, then things like COW could be properly - * handled. - */ - entry = pmd_clear_uffd_wp(entry); +#ifdef CONFIG_USER_REPLICATION + if (numa_pgtable_replicated(pmd)) { + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + change_huge_pmd_entry(vma, addr, curr_pmd, newprot, preserve_write, uffd_wp, uffd_wp_resolve); + } } +#endif + ret = HPAGE_PMD_NR; - set_pmd_at(mm, addr, pmd, entry); - BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry)); unlock: spin_unlock(ptl); return ret; @@ -1980,7 +2375,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, */ old_pmd = pmdp_huge_clear_flush(vma, haddr, pmd); - pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pgtable = pgtable_trans_huge_withdraw(mm, get_master_pmd(pmd)); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1995,10 +2390,144 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pte_unmap(pte); } smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); + pmd_populate_replicated(mm, pmd, pgtable); } -static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + +#ifdef CONFIG_USER_REPLICATION +static void __split_huge_pmd_locked_replicated_page(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + pmd_t old_pmd; + bool young, write, soft_dirty, uffd_wp = false; + unsigned long addr; + int i; + unsigned long offset; + bool start; + struct page *curr; + pmd_t *curr_pmd; + int nid; + pgtable_t pgtable_numa[MAX_NUMNODES]; + + + + VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); + VM_BUG_ON_VMA(vma->vm_start > haddr, vma); + VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); + VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) + && !pmd_devmap(*pmd)); + + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + pgtable_numa[nid] = pgtable_trans_huge_withdraw(mm, curr_pmd); + SetPageReplicated(pgtable_numa[nid]); + } + build_pte_chain(pgtable_numa); + set_master_page_for_ptes(NUMA_NO_NODE, pgtable_numa); + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum + * 383 on page 105. 
Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). + * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + pmd_t _pmd; + + + count_vm_event(THP_SPLIT_PMD); + + old_pmd = pmdp_invalidate(vma, haddr, curr_pmd); + + page = pmd_page(old_pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + BUG_ON(write); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); + uffd_wp = pmd_uffd_wp(old_pmd); + + VM_BUG_ON_PAGE(!page_count(page), page); + page_ref_add(page, HPAGE_PMD_NR - 1); + + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ + + pmd_populate(mm, &_pmd, pgtable_numa[nid]); + + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry, *pte; + /* + * Note that NUMA hinting access restrictions are not + * transferred to avoid any possibility of altering + * permissions across VMAs. + */ + + entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); + entry = maybe_mkwrite(entry, vma); + if (!write) + entry = pte_wrprotect(entry); + if (!young) + entry = pte_mkold(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + pte = pte_offset_map(&_pmd, addr); + + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, addr, pte, entry); + + pte_unmap(pte); + } + + /* + * Set PG_double_map before dropping compound_mapcount to avoid + * false-negative page_mapped(). + */ + if (compound_mapcount(page) > 1 && + !TestSetPageDoubleMap(page)) { + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_inc(&page[i]._mapcount); + } + lock_page_memcg(page); + if (atomic_add_negative(-1, compound_mapcount_ptr(page))) { + /* Last compound_mapcount is gone. */ + __dec_lruvec_page_state(page, NR_ANON_THPS); + if (TestClearPageDoubleMap(page)) { + /* No need in mapcount reference anymore */ + for (i = 0; i < HPAGE_PMD_NR; i++) + atomic_dec(&page[i]._mapcount); + } + } + unlock_page_memcg(page); + + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, curr_pmd, pgtable_numa[nid]); + } +} + +#endif + +static void __split_huge_pmd_locked_normal_page(struct vm_area_struct *vma, pmd_t *pmd, unsigned long haddr, bool freeze) { struct mm_struct *mm = vma->vm_mm; @@ -2024,7 +2553,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * just go ahead and zap it */ if (arch_needs_pgtable_deposit()) - zap_deposited_table(mm, pmd); + zap_deposited_table(mm, get_master_pmd(pmd)); if (vma_is_special_huge(vma)) return; if (unlikely(is_pmd_migration_entry(old_pmd))) { @@ -2079,7 +2608,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * for this pmd), then we flush the SMP TLB and finally we write the * non-huge version of the pmd entry with pmd_populate. 
*/ - old_pmd = pmdp_invalidate(vma, haddr, pmd); + old_pmd = pmdp_invalidate_replicated(vma, haddr, pmd); pmd_migration = is_pmd_migration_entry(old_pmd); if (unlikely(pmd_migration)) { @@ -2107,7 +2636,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * Withdraw the table only after we mark the pmd entry invalid. * This's critical for some architectures (Power). */ - pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pgtable = pgtable_trans_huge_withdraw(mm, get_master_pmd(pmd)); pmd_populate(mm, &_pmd, pgtable); for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { @@ -2170,7 +2699,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - pmd_populate(mm, pmd, pgtable); + pmd_populate_replicated(mm, pmd, pgtable); if (freeze) { for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -2181,6 +2710,21 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } } +static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long haddr, bool freeze) +{ +#ifdef CONFIG_USER_REPLICATION + if (vma_is_anonymous(vma) && !is_huge_zero_pmd(*pmd) && !is_pmd_migration_entry(*pmd) && PageReplicated(pmd_page(*pmd))) { + /* + * We do not need to worry about this freeze shenanigans here, + * because replicated pages can not be reclaimed or migrated + */ + __split_huge_pmd_locked_replicated_page(vma, pmd, haddr); + } else +#endif + __split_huge_pmd_locked_normal_page(vma, pmd, haddr, freeze); +} + void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) { @@ -2966,14 +3510,15 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw, return; flush_cache_range(vma, address, address + HPAGE_PMD_SIZE); - pmdval = pmdp_invalidate(vma, address, pvmw->pmd); + pmdval = pmdp_invalidate_replicated(vma, address, pvmw->pmd); if (pmd_dirty(pmdval)) set_page_dirty(page); entry = make_migration_entry(page, pmd_write(pmdval)); pmdswp = swp_entry_to_pmd(entry); if (pmd_soft_dirty(pmdval)) pmdswp = pmd_swp_mksoft_dirty(pmdswp); - set_pmd_at(mm, address, pvmw->pmd, pmdswp); + + set_pmd_at_replicated(mm, address, pvmw->pmd, pmdswp); reliable_page_counter(page, mm, -HPAGE_PMD_NR); page_remove_rmap(page, true); put_page(page); @@ -3007,7 +3552,7 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new) page_add_anon_rmap(new, vma, mmun_start, true); else page_add_file_rmap(new, true); - set_pmd_at(mm, mmun_start, pvmw->pmd, pmde); + set_pmd_at_replicated(mm, mmun_start, pvmw->pmd, pmde); if ((vma->vm_flags & VM_LOCKED) && !PageDoubleMap(new)) mlock_vma_page(new); update_mmu_cache_pmd(vma, address, pvmw->pmd); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e0730bb22931d..0ec74e16e3a98 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -32,6 +32,7 @@ #include <linux/cma.h> #include <linux/mman.h> #include <linux/share_pool.h> +#include <linux/numa_user_replication.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -4249,7 +4250,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = huge_pte_mkwrite(huge_pte_mkdirty(huge_ptep_get(ptep))); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) + if (huge_ptep_set_access_flags_replicated(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); } @@ -4314,7 +4315,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, src_pte = huge_pte_offset(src, addr, sz); if 
(!src_pte) continue; - dst_pte = huge_pte_alloc(dst, addr, sz); + dst_pte = huge_pte_alloc_copy_tables(dst, src, addr, sz); if (!dst_pte) { ret = -ENOMEM; break; @@ -4356,10 +4357,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, */ make_migration_entry_read(&swp_entry); entry = swp_entry_to_pte(swp_entry); - set_huge_swap_pte_at(src, addr, src_pte, + set_huge_swap_pte_at_replicated(src, addr, src_pte, entry, sz); } - set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); + set_huge_swap_pte_at_replicated(dst, addr, dst_pte, entry, sz); } else { if (cow) { /* @@ -4369,13 +4370,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, * * See Documentation/vm/mmu_notifier.rst */ - huge_ptep_set_wrprotect(src, addr, src_pte); + huge_ptep_set_wrprotect_replicated(src, addr, src_pte); } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); page_dup_rmap(ptepage, true); - set_huge_pte_at(dst, addr, dst_pte, entry); + set_huge_pte_at_replicated(dst, addr, dst_pte, entry); hugetlb_count_add(pages_per_huge_page(h), dst); } spin_unlock(src_ptl); @@ -4448,7 +4449,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { - huge_pte_clear(mm, address, ptep, sz); + huge_pte_clear_replicated(mm, address, ptep, sz); spin_unlock(ptl); continue; } @@ -4472,7 +4473,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, set_vma_resv_flags(vma, HPAGE_RESV_UNMAPPED); } - pte = huge_ptep_get_and_clear(mm, address, ptep); + pte = huge_ptep_get_and_clear_replicated(mm, address, ptep); tlb_remove_huge_tlb_entry(h, tlb, ptep, address); /* sharepool k2u mapped pages are marked special */ @@ -4749,9 +4750,9 @@ static vm_fault_t hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, ClearHPageRestoreReserve(new_page); /* Break COW */ - huge_ptep_clear_flush(vma, haddr, ptep); + huge_ptep_clear_flush_replicated(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - set_huge_pte_at(mm, haddr, ptep, + set_huge_pte_at_replicated(mm, haddr, ptep, make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); @@ -4978,7 +4979,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, haddr, ptep, new_pte); + set_huge_pte_at_replicated(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { @@ -5079,7 +5080,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ mapping = vma->vm_file->f_mapping; i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); + ptep = huge_pte_alloc_replica(mm, vma, haddr, huge_page_size(h)); if (!ptep) { i_mmap_unlock_read(mapping); return VM_FAULT_OOM; @@ -5169,7 +5170,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, + if (huge_ptep_set_access_flags_replicated(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, haddr, ptep); out_put_page: @@ -5314,10 +5315,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, _dst_pte = 
huge_pte_mkdirty(_dst_pte); _dst_pte = pte_mkyoung(_dst_pte); - set_huge_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_huge_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); - (void)huge_ptep_set_access_flags(dst_vma, dst_addr, dst_pte, _dst_pte, - dst_vma->vm_flags & VM_WRITE); + (void)huge_ptep_set_access_flags_replicated(dst_vma, dst_addr, dst_pte, _dst_pte, + dst_vma->vm_flags & VM_WRITE); hugetlb_count_add(pages_per_huge_page(h), dst_mm); /* No need to invalidate - it was non-present before */ @@ -5576,8 +5577,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, make_migration_entry_read(&entry); newpte = swp_entry_to_pte(entry); - set_huge_swap_pte_at(mm, address, ptep, - newpte, huge_page_size(h)); + set_huge_swap_pte_at_replicated(mm, address, ptep, + newpte, huge_page_size(h)); pages++; } spin_unlock(ptl); @@ -5586,10 +5587,10 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, if (!huge_pte_none(pte)) { pte_t old_pte; - old_pte = huge_ptep_modify_prot_start(vma, address, ptep); + old_pte = huge_ptep_modify_prot_start_replicated(vma, address, ptep); pte = pte_mkhuge(huge_pte_modify(old_pte, newprot)); pte = arch_make_huge_pte(pte, vma, NULL, 0); - huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte); + huge_ptep_modify_prot_commit_replicated(vma, address, ptep, old_pte, pte); pages++; } spin_unlock(ptl); @@ -5926,6 +5927,10 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spte = huge_pte_offset(svma->vm_mm, saddr, vma_mmu_pagesize(svma)); if (spte) { + if (PageReplicated(virt_to_page(spte))) { + spte = NULL; + continue; + } get_page(virt_to_page(spte)); break; } @@ -5937,8 +5942,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) spin_lock(&mm->page_table_lock); if (pud_none(*pud)) { - pud_populate(mm, pud, - (pmd_t *)((unsigned long)spte & PAGE_MASK)); + pud_populate_replicated(mm, pud, + (pmd_t *)((unsigned long)spte & PAGE_MASK)); mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); @@ -5973,7 +5978,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, if (page_count(virt_to_page(ptep)) == 1) return 0; - pud_clear(pud); + pud_clear_replicated(pud); put_page(virt_to_page(ptep)); mm_dec_nr_pmds(mm); /* @@ -6426,7 +6431,7 @@ static int __hugetlb_insert_hugepage(struct mm_struct *mm, unsigned long addr, ptl = huge_pte_lockptr(h, mm, ptep); spin_lock(ptl); - set_huge_pte_at(mm, addr, ptep, entry); + set_huge_pte_at_replicated(mm, addr, ptep, entry); spin_unlock(ptl); return ret; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index a3f45bca187b1..d014ff2e7f370 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -18,6 +18,7 @@ #include <linux/page_idle.h> #include <linux/swapops.h> #include <linux/shmem_fs.h> +#include <linux/numa_user_replication.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -467,6 +468,8 @@ static bool hugepage_vma_check(struct vm_area_struct *vma, return false; if (vma_is_temporary_stack(vma)) return false; + if (vma_has_replicas(vma)) + return false; return !(vm_flags & VM_NO_KHUGEPAGED); } @@ -631,6 +634,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_PAGE_NULL; goto out; } + BUG_ON(PageReplicated(compound_head(page))); VM_BUG_ON_PAGE(!PageAnon(page), page); @@ -762,7 +766,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * paravirt calls inside pte_clear here are * superfluous. 
*/ - pte_clear(vma->vm_mm, address, _pte); + pte_clear_replicated(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -780,7 +784,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * paravirt calls inside pte_clear here are * superfluous. */ - pte_clear(vma->vm_mm, address, _pte); + + pte_clear_replicated(vma->vm_mm, address, _pte); reliable_page_counter(src_page, vma->vm_mm, -1); page_remove_rmap(src_page, false); spin_unlock(ptl); @@ -1076,7 +1081,13 @@ static void collapse_huge_page(struct mm_struct *mm, struct vm_area_struct *vma; struct mmu_notifier_range range; gfp_t gfp; - +#ifdef CONFIG_USER_REPLICATION + pmd_t _pmd_numa[MAX_NUMNODES]; + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + bool pmd_replicated = false; +#endif VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ @@ -1162,6 +1173,16 @@ static void collapse_huge_page(struct mm_struct *mm, * it detects PMD is changed. */ _pmd = pmdp_collapse_flush(vma, address, pmd); +#ifdef CONFIG_USER_REPLICATION + pmd_replicated = numa_pgtable_replicated(pmd); + if (pmd_replicated) { + _pmd_numa[first_memory_node] = _pmd; + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + _pmd_numa[page_to_nid(curr)] = pmdp_collapse_flush(vma, address, curr_pmd); + } + } +#endif + spin_unlock(pmd_ptl); mmu_notifier_invalidate_range_end(&range); tlb_remove_table_sync_one(); @@ -1181,6 +1202,13 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); +#ifdef CONFIG_USER_REPLICATION + if (pmd_replicated) { + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + pmd_populate(mm, curr_pmd, pmd_pgtable(_pmd_numa[page_to_nid(curr)])); + } + } +#endif spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); result = SCAN_FAIL; @@ -1214,8 +1242,29 @@ static void collapse_huge_page(struct mm_struct *mm, reliable_page_counter(new_page, vma->vm_mm, HPAGE_PMD_NR); page_add_new_anon_rmap(new_page, vma, address, true); lru_cache_add_inactive_or_unevictable(new_page, vma); - pgtable_trans_huge_deposit(mm, pmd, pgtable); - set_pmd_at(mm, address, pmd, _pmd); + +#ifdef CONFIG_USER_REPLICATION + if (numa_pgtable_replicated(page_to_virt(pgtable))) { + int nid; + + pgtable->master_table = pgtable; + for_each_memory_node(nid) { + pgtable_t curr_pgtable = pmd_pgtable(_pmd_numa[nid]); + + curr_pgtable->replica_list_head.first = NULL; + ClearPageReplicated(curr_pgtable); + memcg_account_dereplicated_pgtable_page(page_to_virt(curr_pgtable)); + if (nid != first_memory_node) { + pte_free(mm, curr_pgtable); + mm_dec_nr_ptes(mm); + } + } + account_dereplicated_table(mm); + } +#endif + + pgtable_trans_huge_deposit(mm, get_master_pmd(pmd), pgtable); + set_pmd_at_replicated(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); diff --git a/mm/ksm.c b/mm/ksm.c index b2cdbe6caa712..f851499313ade 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -38,6 +38,7 @@ #include <linux/freezer.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/numa_user_replication.h> #include <linux/mempolicy.h> #include <asm/tlbflush.h> @@ -1115,13 +1116,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * * See Documentation/vm/mmu_notifier.rst */ - entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush_replicated(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page */ if 
(page_mapcount(page) + 1 + swapped != page_count(page)) { - set_pte_at(mm, pvmw.address, pvmw.pte, entry); + set_pte_at_replicated(mm, pvmw.address, pvmw.pte, entry); goto out_unlock; } if (pte_dirty(entry)) @@ -1131,7 +1132,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, entry = pte_mkclean(pte_clear_savedwrite(entry)); else entry = pte_mkclean(pte_wrprotect(entry)); - set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); + set_pte_at_notify_replicated(mm, pvmw.address, pvmw.pte, entry); } *orig_pte = *pvmw.pte; err = 0; @@ -1211,8 +1212,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * * See Documentation/vm/mmu_notifier.rst */ - ptep_clear_flush(vma, addr, ptep); - set_pte_at_notify(mm, addr, ptep, newpte); + ptep_clear_flush_replicated(vma, addr, ptep); + set_pte_at_notify_replicated(mm, addr, ptep, newpte); reliable_page_counter(page, mm, -1); page_remove_rmap(page, false); @@ -2339,7 +2340,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) continue; if (ksm_scan.address < vma->vm_start) ksm_scan.address = vma->vm_start; - if (!vma->anon_vma) + if (!vma->anon_vma || vma_has_replicas(vma)) ksm_scan.address = vma->vm_end; while (ksm_scan.address < vma->vm_end) { @@ -2796,7 +2797,26 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) */ if ((rmap_item->mm == vma->vm_mm) == search_new_forks) continue; +#ifdef CONFIG_USER_REPLICATION + down_read(&vma->vm_mm->replication_ctl->rmap_lock); + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + continue; + } + + if (!rwc->rmap_one(page, vma, addr, rwc->arg)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + anon_vma_unlock_read(anon_vma); + return; + } + if (rwc->done && rwc->done(page)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + anon_vma_unlock_read(anon_vma); + return; + } + up_read(&vma->vm_mm->replication_ctl->rmap_lock); +#else if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; @@ -2808,6 +2828,7 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) anon_vma_unlock_read(anon_vma); return; } +#endif } anon_vma_unlock_read(anon_vma); } diff --git a/mm/madvise.c b/mm/madvise.c index 926bf4523befc..28198bd03806b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -29,6 +29,7 @@ #include <linux/swapops.h> #include <linux/shmem_fs.h> #include <linux/mmu_notifier.h> +#include <linux/numa_user_replication.h> #include <asm/tlb.h> @@ -370,6 +371,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, page = pmd_page(orig_pmd); + /* Do not interfere with replicated pages */ + if (PageReplicated(page)) + goto huge_unlock; + /* Do not interfere with other mappings of this page */ if (page_mapcount(page) != 1) goto huge_unlock; @@ -389,10 +394,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, } if (pmd_young(orig_pmd)) { - pmdp_invalidate(vma, addr, pmd); + pmdp_invalidate_replicated(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); - set_pmd_at(mm, addr, pmd, orig_pmd); + set_pmd_at_replicated(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } @@ -434,7 +439,12 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, page = vm_normal_page(vma, addr, ptent); if (!page) continue; - + /* + * Again, we do not care about replicated pages here, + * they are unevictable and invisible for reclaim anyway + */ + if (PageReplicated(compound_head(page))) + continue; /* * Creating a THP page is expensive so split 
it only if we * are sure it's worth. Split it if we are only owner. @@ -472,10 +482,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, VM_BUG_ON_PAGE(PageTransCompound(page), page); if (pte_young(ptent)) { - ptent = ptep_get_and_clear_full(mm, addr, pte, + ptent = ptep_get_and_clear_full_replicated(mm, addr, pte, tlb->fullmm); ptent = pte_mkold(ptent); - set_pte_at(mm, addr, pte, ptent); + set_pte_at_replicated(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } @@ -644,7 +654,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, continue; nr_swap--; free_swap_and_cache(entry); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + pte_clear_not_present_full_replicated(mm, addr, pte, tlb->fullmm); continue; } @@ -652,6 +662,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (!page) continue; + if (PageReplicated(compound_head(page))) + continue; + /* * If pmd isn't transhuge but the page is THP and * is owned by only this process, split it and @@ -710,12 +723,12 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, * the portability, remap the pte with old|clean * after pte clearing. */ - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + ptent = ptep_get_and_clear_full_replicated(mm, addr, pte, + tlb->fullmm); ptent = pte_mkold(ptent); ptent = pte_mkclean(ptent); - set_pte_at(mm, addr, pte, ptent); + set_pte_at_replicated(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } mark_page_lazyfree(page); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d9467b38e2f71..90ba413c2d336 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -63,6 +63,7 @@ #include <linux/psi.h> #include <linux/seq_buf.h> #include <linux/memcg_memfs_info.h> +#include <linux/numa_user_replication.h> #include "internal.h" #include <net/sock.h> #include <net/ip.h> @@ -73,6 +74,7 @@ #include <trace/events/vmscan.h> #ifndef __GENKSYMS__ #include <linux/ksm.h> +#include <linux/time_namespace.h> #endif struct cgroup_subsys memory_cgrp_subsys __read_mostly; @@ -3904,6 +3906,10 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, if (val & ~MOVE_MASK) return -EINVAL; +#ifdef CONFIG_USER_REPLICATION + if (memcg->replication_ctl->table_policy != TABLE_REPLICATION_NONE) + return -EINVAL; +#endif /* * No kind of locking is needed in here, because ->can_attach() will * check this value once in the beginning of the process, and then carry @@ -6036,6 +6042,135 @@ static ssize_t wb_blkio_write(struct kernfs_open_file *of, char *buf, static int memory_stat_show(struct seq_file *m, void *v); +#ifdef CONFIG_USER_REPLICATION +static int memory_numa_table_replication_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", memcg->replication_ctl->table_policy); + + return 0; +} + +static ssize_t memory_numa_table_replication_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup *cgrp = of_css(of)->cgroup; + unsigned long long result; + + buf = strstrip(buf); + + if (cgroup_has_tasks(cgrp)) + return -EINVAL; + + if (kstrtoull(buf, 0, &result)) + return -EINVAL; + + if (result == TABLE_REPLICATION_NONE && memcg->replication_ctl->data_policy != DATA_REPLICATION_NONE) + return -EINVAL; + + if (result != TABLE_REPLICATION_NONE) + WRITE_ONCE(memcg->move_charge_at_immigrate, MOVE_MASK); + + switch (result) { + case TABLE_REPLICATION_NONE: { + 
memcg->replication_ctl->table_policy = TABLE_REPLICATION_NONE; + break; + } + case TABLE_REPLICATION_MINIMAL: { + memcg->replication_ctl->table_policy = TABLE_REPLICATION_MINIMAL; + break; + } + case TABLE_REPLICATION_ALL: { + memcg->replication_ctl->table_policy = TABLE_REPLICATION_ALL; + break; + } + default: { + return -EINVAL; + } + } + + return nbytes; +} + +static int memory_numa_data_replication_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + + seq_printf(m, "%d\n", memcg->replication_ctl->data_policy); + + return 0; +} + +static ssize_t memory_numa_data_replication_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct cgroup *cgrp = of_css(of)->cgroup; + unsigned long long result; + + buf = strstrip(buf); + + if (cgroup_has_tasks(cgrp)) + return -EINVAL; + + if (kstrtoull(buf, 0, &result)) + return -EINVAL; + + if (result != DATA_REPLICATION_NONE && memcg->replication_ctl->table_policy == TABLE_REPLICATION_NONE) + return -EINVAL; + + switch (result) { + case DATA_REPLICATION_NONE: { + memcg->replication_ctl->data_policy = DATA_REPLICATION_NONE; + break; + } + case DATA_REPLICATION_ON_DEMAND: { + memcg->replication_ctl->data_policy = DATA_REPLICATION_ON_DEMAND; + break; + } + case DATA_REPLICATION_ALL_MAPPED_ON_DEMAND: { + memcg->replication_ctl->data_policy = DATA_REPLICATION_ALL_MAPPED_ON_DEMAND; + break; + } + case DATA_REPLICATION_ALL: { + memcg->replication_ctl->data_policy = DATA_REPLICATION_ALL; + break; + } + default: { + return -EINVAL; + } + } + + return nbytes; +} + +static int memory_numa_replication_stats_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + long long replicated_data_bytes = 0; + long long replicated_table_bytes = 0; + struct timespec64 uptime; + + ktime_get_boottime_ts64(&uptime); + timens_add_boottime(&uptime); + + replicated_data_bytes = total_replicated_data_bytes_memecg(memcg); + replicated_table_bytes = total_replicated_table_bytes_memecg(memcg); + + seq_printf(m, "{\n" + " \"timestamp\": \"%lu.%02lu\",\n" + " \"replicated_data_bytes\": \"%lld\",\n" + " \"replicated_table_bytes\": \"%lld\"\n" + "}\n", (unsigned long) uptime.tv_sec, + (uptime.tv_nsec / (NSEC_PER_SEC / 100)), replicated_data_bytes, replicated_table_bytes); + + return 0; +} + +#endif + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -6281,6 +6416,24 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memory_ksm_write, .seq_show = memory_ksm_show, }, +#endif +#ifdef CONFIG_USER_REPLICATION + { + .name = "numa_table_replication", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_numa_table_replication_show, + .write = memory_numa_table_replication_write, + }, + { + .name = "numa_data_replication", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_numa_data_replication_show, + .write = memory_numa_data_replication_write, + }, + { + .name = "numa_replication_stats", + .seq_show = memory_numa_replication_stats_show + }, #endif { }, /* terminate */ }; @@ -6404,6 +6557,20 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) kfree(pn); } +#ifdef CONFIG_USER_REPLICATION +static void memcg_free_replication_ctl(struct mem_cgroup *memcg) +{ + free_percpu(memcg->replication_ctl->pcp_dereplicated_tables); + free_percpu(memcg->replication_ctl->pcp_replicated_tables); + free_percpu(memcg->replication_ctl->pcp_dereplicated_pages); + 
free_percpu(memcg->replication_ctl->pcp_replicated_pages); + + kfree(memcg->replication_ctl); +} +#else +static void memcg_free_replication_ctl(struct mem_cgroup *memcg) { } +#endif + static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; @@ -6412,6 +6579,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); memcg_free_swap_device(memcg); + memcg_free_replication_ctl(memcg); kfree(memcg); } @@ -6491,6 +6659,53 @@ static struct mem_cgroup *mem_cgroup_alloc(void) return ERR_PTR(error); } +#ifdef CONFIG_USER_REPLICATION +static int memcg_init_replication_ctl(struct mem_cgroup *memcg) +{ + memcg->replication_ctl = kmalloc(sizeof(struct memcg_replication_ctl), GFP_KERNEL); + if (!memcg->replication_ctl) + return -ENOMEM; + memcg->replication_ctl->fork_policy = FORK_KEEP_REPLICA; + memcg->replication_ctl->table_policy = TABLE_REPLICATION_NONE; + memcg->replication_ctl->data_policy = DATA_REPLICATION_NONE; + + memcg->replication_ctl->pcp_replicated_pages = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!memcg->replication_ctl->pcp_replicated_pages) + goto fail1; + + memcg->replication_ctl->pcp_dereplicated_pages = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!memcg->replication_ctl->pcp_dereplicated_pages) + goto fail2; + + memcg->replication_ctl->pcp_replicated_tables = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!memcg->replication_ctl->pcp_replicated_tables) + goto fail3; + + memcg->replication_ctl->pcp_dereplicated_tables = alloc_percpu_gfp(unsigned long, GFP_KERNEL | __GFP_ZERO); + if (!memcg->replication_ctl->pcp_dereplicated_tables) + goto fail4; + + return 0; + +fail4: + free_percpu(memcg->replication_ctl->pcp_replicated_tables); +fail3: + free_percpu(memcg->replication_ctl->pcp_dereplicated_pages); +fail2: + free_percpu(memcg->replication_ctl->pcp_replicated_pages); +fail1: + kfree(memcg->replication_ctl); + memcg->replication_ctl = NULL; + + return -ENOMEM; +} +#else +static int memcg_init_replication_ctl(struct mem_cgroup *memcg) +{ + return 0; +} +#endif + static struct cgroup_subsys_state * __ref mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -6541,6 +6756,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent != root_mem_cgroup) memory_cgrp_subsys.broken_hierarchy = true; } + + if (memcg_init_replication_ctl(memcg)) + goto fail; + /* The following stuff does not apply to the root */ if (!parent) { root_mem_cgroup = memcg; @@ -7281,6 +7500,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) VM_BUG_ON(from == memcg); +#ifdef CONFIG_USER_REPLICATION + /* + * We can not remove process from cgroup if replication is enabled + */ + if (from->replication_ctl->table_policy != TABLE_REPLICATION_NONE) + return 1; +#endif + mm = get_task_mm(p); if (!mm) return 0; @@ -7472,9 +7699,24 @@ static void mem_cgroup_attach(struct cgroup_taskset *tset) memcg_attach_ksm(tset); } + +#ifdef CONFIG_USER_REPLICATION + +static void mem_cgroup_handle_replication(void) +{ + set_fork_policy(mc.mm, mc.to->replication_ctl->fork_policy); + numa_dispatch_table_replication_request(mc.mm, mc.to->replication_ctl->table_policy); + numa_dispatch_data_replication_request(mc.mm, mc.to->replication_ctl->data_policy); +} + +#else +static void mem_cgroup_handle_replication(void) { } +#endif + static void mem_cgroup_move_task(void) { if (mc.to) { + mem_cgroup_handle_replication(); mem_cgroup_move_charge(); 
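For reference, a minimal userspace sketch of driving the two knobs added above. The cgroup path and the numeric policy values are assumptions (the enums are taken in declaration order, NONE = 0), so verify them against the headers this series installs. The write handlers enforce an order: both files return -EINVAL once the cgroup has tasks, and memory.numa_data_replication refuses any value while the table policy is still TABLE_REPLICATION_NONE.

/* Hypothetical usage sketch: the cgroup path and the policy integers are
 * assumptions, not taken from this patch. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_knob(const char *dir, const char *file, const char *val)
{
	char path[256];
	int fd;

	snprintf(path, sizeof(path), "%s/%s", dir, file);
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0) {
		perror(path);
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	/* Assumed: an already created, still empty memory cgroup. */
	const char *grp = "/sys/fs/cgroup/memory/repl_test";

	/* Table replication first: the data policy write is rejected while
	 * table_policy is TABLE_REPLICATION_NONE. */
	if (write_knob(grp, "memory.numa_table_replication", "2"))	/* assumed TABLE_REPLICATION_ALL */
		return 1;
	if (write_knob(grp, "memory.numa_data_replication", "1"))	/* assumed DATA_REPLICATION_ON_DEMAND */
		return 1;

	/* Tasks moved into the cgroup afterwards pick these policies up via
	 * mem_cgroup_handle_replication() on attach. */
	return 0;
}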
mem_cgroup_clear_mc(); } @@ -7988,6 +8230,11 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) css_get(&memcg->css); commit_charge(page, memcg); +#ifdef CONFIG_USER_REPLICATION + if (PageReplicated(page)) + memcg_account_replicated_pages(memcg, nr_pages); +#endif + local_irq_disable(); mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); @@ -8062,7 +8309,9 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) unsigned long nr_pages; struct mem_cgroup *memcg; struct obj_cgroup *objcg; - +#ifdef CONFIG_USER_REPLICATION + bool replicated = false; +#endif VM_BUG_ON_PAGE(PageLRU(page), page); /* @@ -8080,7 +8329,11 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } else { memcg = __page_memcg(page); } - +#ifdef CONFIG_USER_REPLICATION + replicated = PageReplicated(page); + if (replicated) + ClearPageReplicated(page); +#endif if (!memcg) return; @@ -8097,7 +8350,10 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } nr_pages = compound_nr(page); - +#ifdef CONFIG_USER_REPLICATION + if (replicated) + memcg_account_dereplicated_pages(memcg, nr_pages); +#endif if (PageMemcgKmem(page)) { ug->nr_memory += nr_pages; ug->nr_kmem += nr_pages; diff --git a/mm/memory.c b/mm/memory.c index 42a50af36a4ca..27d06d5c1293d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -75,7 +75,8 @@ #include <linux/vmalloc.h> #include <linux/userswap.h> #include <linux/pbha.h> -#include <linux/numa_replication.h> +#include <linux/numa_user_replication.h> + #include <trace/events/kmem.h> @@ -211,6 +212,223 @@ static void check_sync_rss_stat(struct task_struct *task) #endif /* SPLIT_RSS_COUNTING */ +#ifdef CONFIG_KERNEL_REPLICATION +#ifdef CONFIG_USER_REPLICATION + +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + pmd_t *curr_pmd; + pte_t *curr_pte; + pgtable_t token = pmd_pgtable(*pmd); + bool pmd_replicated = numa_pgtable_replicated(pmd); + bool pte_replicated = numa_pgtable_replicated(page_to_virt(token)); + + pmd_clear(pmd); + + if (pmd_replicated) + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + pmd_clear(curr_pmd); + } + + if (pte_replicated) { + memcg_account_dereplicated_pte_page(page_to_virt(token)); + for_each_pgtable_replica_safe(curr, tmp, curr_pte, page_to_virt(token), offset) { + memcg_account_dereplicated_pte_page(curr_pte); + cleanup_pte_list(curr); + pte_free_tlb(tlb, curr, addr); + mm_dec_nr_ptes(tlb->mm); + } + account_dereplicated_table(tlb->mm); + } + cleanup_pte_list(token); + pte_free_tlb(tlb, token, addr); + mm_dec_nr_ptes(tlb->mm); +} + +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + pud_t *curr_pud; + pmd_t *curr_pmd; + pmd_t *pmd = pmd_offset(pud, addr); + bool pud_replicated = numa_pgtable_replicated(pud); + bool pmd_replicated = numa_pgtable_replicated(pmd); + + pud_clear(pud); + + if (pud_replicated) + for_each_pgtable_replica(curr, curr_pud, pud, offset) { + pud_clear(curr_pud); + } + + if (pmd_replicated) { + memcg_account_dereplicated_pmd_page(pmd); + for_each_pgtable_replica_safe(curr, tmp, curr_pmd, pmd, offset) { + memcg_account_dereplicated_pmd_page(curr_pmd); + cleanup_pmd_list(curr); + pmd_free_tlb(tlb, curr_pmd, addr); + mm_dec_nr_pmds(tlb->mm); + } + account_dereplicated_table(tlb->mm); + } + cleanup_pmd_list(virt_to_page(pmd)); + pmd_free_tlb(tlb, pmd, 
addr); + mm_dec_nr_pmds(tlb->mm); +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + p4d_t *curr_p4d; + pud_t *curr_pud; + pud_t *pud = pud_offset(p4d, addr); + bool p4d_replicated = numa_pgtable_replicated(p4d); + bool pud_replicated = numa_pgtable_replicated(pud); + + p4d_clear(p4d); + + if (p4d_replicated) + for_each_pgtable_replica(curr, curr_p4d, p4d, offset) { + p4d_clear(curr_p4d); + } + + if (pud_replicated) { + memcg_account_dereplicated_pud_page(pud); + for_each_pgtable_replica_safe(curr, tmp, curr_pud, pud, offset) { + memcg_account_dereplicated_pud_page(curr_pud); + cleanup_pud_list(curr); + pud_free_tlb(tlb, curr_pud, addr); + mm_dec_nr_puds(tlb->mm); + } + account_dereplicated_table(tlb->mm); + } + cleanup_pud_list(virt_to_page(pud)); + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + pgd_t *curr_pgd; + p4d_t *curr_p4d; + p4d_t *p4d = p4d_offset(pgd, addr); + bool pgd_replicated = numa_pgtable_replicated(pgd); + bool p4d_replicated = numa_pgtable_replicated(p4d); + + pgd_clear(pgd); + + if (pgd_replicated) + for_each_pgtable_replica(curr, curr_pgd, pgd, offset) + pgd_clear(curr_pgd); + + if (p4d_replicated) { + for_each_pgtable_replica_safe(curr, tmp, curr_p4d, p4d, offset) { + cleanup_p4d_list(curr); + p4d_free_tlb(tlb, curr_p4d, addr); + } +#ifndef __PAGETABLE_P4D_FOLDED + account_dereplicated_table(tlb->mm); +#endif + } + cleanup_p4d_list(virt_to_page(p4d)); + p4d_free_tlb(tlb, p4d, addr); +} + +#else + +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + pgtable_t token = pmd_pgtable(*pmd); + + pmd_clear(pmd); + + if (get_propagation_level() == PMD_PROPAGATION) { + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + pmd_clear(curr_pmd); + } + } + + pte_free_tlb(tlb, token, addr); + mm_dec_nr_ptes(tlb->mm); +} + +static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + pmd_t *pmd = pmd_offset(pud, addr); + + pud_clear(pud); + + if (get_propagation_level() == PUD_PROPAGATION) { + for_each_pgtable_replica(curr, curr_pud, pud, offset) { + pud_clear(curr_pud); + } + } + + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + unsigned long offset; + struct page *curr; + p4d_t *curr_p4d; + pud_t *pud = pud_offset(p4d, addr); + + p4d_clear(p4d); + + if (get_propagation_level() == P4D_PROPAGATION) { + for_each_pgtable_replica(curr, curr_p4d, p4d, offset) { + p4d_clear(curr_p4d); + } + } + + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + unsigned long offset; + struct page *curr; + pgd_t *curr_pgd; + p4d_t *p4d = p4d_offset(pgd, addr); + + pgd_clear(pgd); + + if (get_propagation_level() == PGD_PROPAGATION) { + for_each_pgtable_replica(curr, curr_pgd, pgd, offset) { + pgd_clear(curr_pgd); + } + } + p4d_free_tlb(tlb, p4d, addr); +} + + +#endif /*CONFIG_USER_REPLICATION*/ + +#else /*!CONFIG_KERNEL_REPLICATION*/ + /* * Note: this doesn't free the actual pages themselves. 
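Stripped of the memcg and mm-counter accounting, every level of the CONFIG_USER_REPLICATION teardown above repeats the same pattern. A condensed sketch of the PTE level (not a drop-in; the iteration helpers come from numa_user_replication.h):

/* Condensed restatement of free_pte_range() above; memcg accounting and the
 * mm_dec_nr_ptes() calls are omitted for brevity. */
static void free_pte_range_sketch(struct mmu_gather *tlb, pmd_t *pmd,
				  unsigned long addr)
{
	pgtable_t token = pmd_pgtable(*pmd);
	struct page *curr, *tmp;
	pmd_t *curr_pmd;
	pte_t *curr_pte;
	unsigned long offset;

	/* 1. Clear the master PMD entry and every per-node replica of it. */
	pmd_clear(pmd);
	if (numa_pgtable_replicated(pmd))
		for_each_pgtable_replica(curr, curr_pmd, pmd, offset)
			pmd_clear(curr_pmd);

	/* 2. Free the per-node replicas of the PTE table ... */
	if (numa_pgtable_replicated(page_to_virt(token)))
		for_each_pgtable_replica_safe(curr, tmp, curr_pte,
					      page_to_virt(token), offset) {
			cleanup_pte_list(curr);
			pte_free_tlb(tlb, curr, addr);
		}

	/* 3. ... and the master table last. */
	cleanup_pte_list(token);
	pte_free_tlb(tlb, token, addr);
}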
That * has been handled earlier when unmapping all the memory regions. @@ -224,6 +442,40 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, mm_dec_nr_ptes(tlb->mm); } +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + pmd_t *pmd = pmd_offset(pud, addr); + + pud_clear(pud); + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); + (void)pmd; +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + pud_t *pud = pud_offset(p4d, addr); + + p4d_clear(p4d); + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); + (void)pud; +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + + pgd_clear(pgd); + p4d_free_tlb(tlb, p4d, addr); + (void)p4d; +} + +#endif /*CONFIG_KERNEL_REPLICATION*/ + static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) @@ -252,28 +504,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, if (end - 1 > ceiling - 1) return; - pmd = pmd_offset(pud, start); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start); - mm_dec_nr_pmds(tlb->mm); -} - -static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d) -{ -#ifdef CONFIG_KERNEL_REPLICATION - int nid; - int offset; - - if (mm_p4d_folded(tlb->mm)) { - offset = p4d - (p4d_t *)tlb->mm->pgd; - for_each_memory_node(nid) - p4d_clear((p4d_t *)tlb->mm->pgd_numa[nid] + offset); - } else { - p4d_clear(p4d); - } -#else - p4d_clear(p4d); -#endif /* CONFIG_KERNEL_REPLICATION */ + __free_pmd_range(tlb, pud, start); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, @@ -304,28 +535,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, if (end - 1 > ceiling - 1) return; - pud = pud_offset(p4d, start); - - __free_pud_range(tlb, p4d); - - pud_free_tlb(tlb, pud, start); - mm_dec_nr_puds(tlb->mm); -} - -static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd) -{ -#ifdef CONFIG_KERNEL_REPLICATION - int nid; - int offset; - - if (!mm_p4d_folded(tlb->mm)) { - offset = pgd - (pgd_t *)tlb->mm->pgd; - for_each_memory_node(nid) - pgd_clear(tlb->mm->pgd_numa[nid] + offset); - } -#else - pgd_clear(pgd); -#endif /* CONFIG_KERNEL_REPLICATION */ + __free_pud_range(tlb, p4d, start); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -356,11 +566,7 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - p4d = p4d_offset(pgd, start); - - __free_p4d_range(tlb, pgd); - - p4d_free_tlb(tlb, p4d, start); + __free_p4d_range(tlb, pgd, start); } /* @@ -448,9 +654,15 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } else { /* * Optimization: gather nearby vmas into one call down + * We are able to optimize into one call only if all of them are replicated, + * or all of them are not + * + * Disable this optimization for replicated vmas for now, + * because maple tree (i think) can't erase multiple regions in single call */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE - && !is_vm_hugetlb_page(next)) { + && !is_vm_hugetlb_page(next) + && !numa_is_vma_replicant(vma) && !numa_is_vma_replicant(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); @@ -488,7 +700,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has 
another populated it ? */ mm_inc_nr_ptes(mm); - pmd_populate(mm, pmd, new); + pmd_populate_replicated(mm, pmd, new); new = NULL; } spin_unlock(ptl); @@ -775,7 +987,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); + set_pte_at_replicated(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = device_private_entry_to_page(entry); @@ -806,12 +1018,12 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); + set_pte_at_replicated(src_mm, addr, src_pte, pte); } } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); - set_pte_at(dst_mm, addr, dst_pte, pte); + set_pte_at_replicated(dst_mm, addr, dst_pte, pte); return 0; } @@ -841,9 +1053,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma struct page **prealloc, pte_t pte, struct page *page) { struct mm_struct *src_mm = src_vma->vm_mm; +#ifdef CONFIG_USER_REPLICATION + bool discard_replica = PageReplicated(compound_head(page)) && + (get_fork_policy(dst_vma->vm_mm) == FORK_DISCARD_REPLICA); +#else + bool discard_replica = false; +#endif struct page *new_page; - if (!is_cow_mapping(src_vma->vm_flags)) + if (!is_cow_mapping(src_vma->vm_flags) && !discard_replica) return 1; /* @@ -859,9 +1077,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * the page count. That might give false positives for * for pinning, but it will work correctly. */ - if (likely(!atomic_read(&src_mm->has_pinned))) + if (likely(!atomic_read(&src_mm->has_pinned) && !discard_replica)) return 1; - if (likely(!page_maybe_dma_pinned(page))) + if (likely(!page_maybe_dma_pinned(page) && !discard_replica)) return 1; /* @@ -900,7 +1118,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_wrprotect(pte_mkuffd_wp(pte)); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_pte_at_replicated(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } @@ -917,20 +1135,45 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; - +#ifdef CONFIG_USER_REPLICATION + pte_t pte_numa[MAX_NUMNODES]; + unsigned long offset; + bool start; + struct page *curr; + pte_t *curr_pte; + int nid; + bool page_replicated = false; +#endif page = vm_normal_page(src_vma, addr, pte); + if (page) { int retval; - retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, pte, page); if (retval <= 0) return retval; - - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - reliable_page_counter(page, dst_vma->vm_mm, 1); +#ifdef CONFIG_USER_REPLICATION + page_replicated = PageReplicated(compound_head(page)); + if (page_replicated) { + BUG_ON(get_fork_policy(dst_vma->vm_mm) != FORK_KEEP_REPLICA); + + for_each_pgtable(curr, curr_pte, src_pte, nid, offset, start) { + struct page *curr_page = vm_normal_page(src_vma, addr, *curr_pte); + + pte_numa[nid] = *curr_pte; + get_page(curr_page); + rss[MM_ANONPAGES]++; + reliable_page_counter(curr_page, dst_vma->vm_mm, 1); + } + account_replicated_page(dst_vma->vm_mm); + } else 
+#endif + { + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + reliable_page_counter(page, dst_vma->vm_mm, 1); + } } /* @@ -938,8 +1181,15 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, * in the parent and the child */ if (is_cow_mapping(vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); + ptep_set_wrprotect_replicated(src_mm, addr, src_pte); +#ifdef CONFIG_USER_REPLICATION + if (page_replicated) { + for_each_memory_node(nid) + pte_numa[nid] = pte_wrprotect(pte_numa[nid]); + } else +#endif + pte = pte_wrprotect(pte); + } /* @@ -948,12 +1198,26 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, */ if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); - pte = pte_mkold(pte); + +#ifdef CONFIG_USER_REPLICATION + if (page_replicated) { + for_each_memory_node(nid) + pte_numa[nid] = pte_mkold(pte_numa[nid]); + } else +#endif + pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +#ifdef CONFIG_USER_REPLICATION + if (page_replicated) { + for_each_pgtable(curr, curr_pte, dst_pte, nid, offset, start) + set_pte_at(dst_vma->vm_mm, addr, curr_pte, pte_numa[nid]); + } else +#endif + set_pte_at_replicated(dst_vma->vm_mm, addr, dst_pte, pte); + return 0; } @@ -976,6 +1240,34 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, return new_page; } +#ifndef CONFIG_USER_REPLICATION + +static pte_t *cpr_alloc_pte_map_lock(struct mm_struct *mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl) +{ + return pte_alloc_map_lock(mm, dst_pmd, addr, ptl); +} + +static pmd_t *cpr_alloc_pmd(struct mm_struct *mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud) +{ + return pmd_alloc(mm, dst_pud, addr); +} + +static pud_t *cpr_alloc_pud(struct mm_struct *mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d) +{ + return pud_alloc(mm, dst_p4d, addr); +} + +static p4d_t *cpr_alloc_p4d(struct mm_struct *mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd) +{ + return p4d_alloc(mm, dst_pgd, addr); +} + +#endif + static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, @@ -995,7 +1287,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, progress = 0; init_rss_vec(rss); - dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); + dst_pte = cpr_alloc_pte_map_lock(dst_mm, addr, src_pmd, dst_pmd, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; @@ -1096,7 +1388,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *src_pmd, *dst_pmd; unsigned long next; - dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); + dst_pmd = cpr_alloc_pmd(dst_mm, addr, src_pud, dst_pud); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); @@ -1133,7 +1425,7 @@ copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *src_pud, *dst_pud; unsigned long next; - dst_pud = pud_alloc(dst_mm, dst_p4d, addr); + dst_pud = cpr_alloc_pud(dst_mm, addr, src_p4d, dst_p4d); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); @@ -1169,7 +1461,7 @@ copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *src_p4d, *dst_p4d; unsigned long next; - dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); + dst_p4d = cpr_alloc_p4d(dst_mm, addr, src_pgd, dst_pgd); if (!dst_p4d) 
return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); @@ -1281,24 +1573,34 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct zap_details *details) { struct mm_struct *mm = tlb->mm; + struct pgtable_private zp; int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; swp_entry_t entry; - + int nid = 0; + int res = 0; + bool pte_replicated = false; + bool has_replicas = false; tlb_change_page_size(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; + pgtable_update_pte(&zp, pte); + pte_replicated = numa_pgtable_replicated(pte); + flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; + pte_t numa_ptent[MAX_NUMNODES]; + + has_replicas = false; if (pte_none(ptent)) - continue; + goto next; if (need_resched()) break; @@ -1306,7 +1608,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_present(ptent)) { struct page *page; + if (pte_replicated) { + for_each_memory_node(nid) + numa_ptent[nid] = *zp.pte_numa[nid]; + } + page = vm_normal_page(vma, addr, ptent); + if (likely(page)) + has_replicas = PageReplicated(compound_head(page)); + if (pte_replicated && has_replicas) { + for_each_memory_node(nid) + zp.replica_pages[nid] = vm_normal_page(vma, addr, numa_ptent[nid]); + } if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -1317,35 +1630,74 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, details->check_mapping != page_rmapping(page)) continue; } - ptent = ptep_get_and_clear_full(mm, addr, pte, - tlb->fullmm); + + if (pte_replicated) + for_each_memory_node(nid) + numa_ptent[nid] = ptep_get_and_clear_full(mm, addr, zp.pte_numa[nid], + tlb->fullmm); + else + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) - continue; + goto next; if (!PageAnon(page)) { - if (pte_dirty(ptent)) { - force_flush = 1; - set_page_dirty(page); + if (pte_replicated) { + int nid; + + for_each_memory_node(nid) { + if (pte_dirty(numa_ptent[nid])) { + force_flush = 1; + set_page_dirty(page); + } + if (pte_young(numa_ptent[nid]) && + likely(!(vma->vm_flags & VM_SEQ_READ))) + mark_page_accessed(page); + } + } else { + if (pte_dirty(ptent)) { + force_flush = 1; + set_page_dirty(page); + } + if (pte_young(ptent) && + likely(!(vma->vm_flags & VM_SEQ_READ))) + mark_page_accessed(page); } - if (pte_young(ptent) && - likely(!(vma->vm_flags & VM_SEQ_READ))) - mark_page_accessed(page); } - rss[mm_counter(page)]--; - reliable_page_counter(page, mm, -1); - page_remove_rmap(page, false); + + if (pte_replicated && has_replicas) { + for_each_memory_node(nid) { + rss[MM_ANONPAGES]--; + reliable_page_counter(zp.replica_pages[nid], mm, -1); + } + } else { + reliable_page_counter(page, mm, -1); + rss[mm_counter(page)]--; + page_remove_rmap(page, false); + } + if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); - if (unlikely(__tlb_remove_page(tlb, page))) { + + if (pte_replicated && has_replicas) + res = __tlb_remove_replica_pages(tlb, zp.replica_pages); + else + res = __tlb_remove_page(tlb, page); + + if (unlikely(res)) { force_flush = 1; addr += PAGE_SIZE; break; } - continue; + goto next; } entry = pte_to_swp_entry(ptent); + + // We can't end up here with replicated memory + BUG_ON(has_replicas); + if (is_device_private_entry(entry)) { struct page *page = device_private_entry_to_page(entry); @@ -1357,7 +1709,7 @@ static unsigned long 
zap_pte_range(struct mmu_gather *tlb, */ if (details->check_mapping != page_rmapping(page)) - continue; + goto next; } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); @@ -1365,13 +1717,13 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, rss[mm_counter(page)]--; page_remove_rmap(page, false); put_page(page); - continue; + goto next; } if (!non_swap_entry(entry)) { /* Genuine swap entry, hence a private anon page */ if (!should_zap_cows(details)) - continue; + goto next; rss[MM_SWAPENTS]--; } else if (is_migration_entry(entry)) { struct page *page; @@ -1379,12 +1731,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, page = migration_entry_to_page(entry); if (details && details->check_mapping && details->check_mapping != page_rmapping(page)) - continue; + goto next; rss[mm_counter(page)]--; } + if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); - pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); + + if (pte_replicated) + for_each_memory_node(nid) + pte_clear_not_present_full(mm, addr, zp.pte_numa[nid], tlb->fullmm); +next: + pgtable_pte_step(&zp, 1); } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); @@ -1684,7 +2042,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (address < vma->vm_start || address + size > vma->vm_end || - !(vma->vm_flags & VM_PFNMAP)) + !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); @@ -1741,7 +2099,7 @@ static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, inc_mm_counter_fast(mm, mm_counter_file(page)); reliable_page_counter(page, mm, 1); page_add_file_rmap(page, false); - set_pte_at(mm, addr, pte, mk_pte(page, prot)); + set_pte_at_replicated(mm, addr, pte, mk_pte(page, prot)); return 0; } @@ -2044,7 +2402,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, } entry = pte_mkyoung(*pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, addr, pte, entry, 1)) + if (ptep_set_access_flags_replicated(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; @@ -2061,7 +2419,7 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, entry = maybe_mkwrite(pte_mkdirty(entry), vma); } - set_pte_at(mm, addr, pte, entry); + set_pte_at_replicated(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? 
*/ out_unlock: @@ -2280,7 +2638,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, err = -EACCES; break; } - set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); + set_pte_at_replicated(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -2787,7 +3145,7 @@ static inline int cow_user_page(struct page *dst, struct page *src, } entry = pte_mkyoung(vmf->orig_pte); - if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) + if (ptep_set_access_flags_replicated(vma, addr, vmf->pte, entry, 0)) update_mmu_cache(vma, addr, vmf->pte); } @@ -2961,7 +3319,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf) flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + if (ptep_set_access_flags_replicated(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); @@ -3063,7 +3421,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + ptep_clear_flush_notify_replicated(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(new_page, vma); /* @@ -3071,7 +3429,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + set_pte_at_notify_replicated(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* @@ -3675,7 +4033,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + set_pte_at_replicated(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; @@ -3781,7 +4139,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && - !mm_forbids_zeropage(vma->vm_mm)) { + !mm_forbids_zeropage(vma->vm_mm) && !numa_is_vma_replicant(vma)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, @@ -3861,7 +4219,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + set_pte_at_replicated(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); @@ -3939,17 +4297,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) return ret; } -/* - * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. - * If we check pmd_trans_unstable() first we will trip the bad_pmd() check - * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly - * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. 
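Most of the remaining memory.c changes are mechanical conversions of set_pte_at()/ptep_*() calls to their *_replicated counterparts, whose definitions live in numa_user_replication.h and are not part of this hunk. Judging from the open-coded variants elsewhere in the patch (copy_present_pte(), zap_pte_range()), the PTE setter is assumed to behave roughly as below for the common case where the data page itself is not replicated, i.e. every node maps the same pfn:

/* Assumed behaviour only; the real helper is defined in
 * include/linux/numa_user_replication.h, which is not shown here. */
static inline void set_pte_at_replicated_sketch(struct mm_struct *mm,
		unsigned long addr, pte_t *ptep, pte_t pte)
{
	struct page *curr;
	pte_t *curr_pte;
	unsigned long offset;
	bool start;
	int nid;

	if (!numa_pgtable_replicated(ptep)) {
		set_pte_at(mm, addr, ptep, pte);
		return;
	}

	/* Mirror the update into the copy of this PTE table on every node,
	 * so a page-table walk gives the same answer no matter which node's
	 * replica it started from. */
	for_each_pgtable(curr, curr_pte, ptep, nid, offset, start)
		set_pte_at(mm, addr, curr_pte, pte);
}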
- */ -static int pmd_devmap_trans_unstable(pmd_t *pmd) -{ - return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); -} - static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -3964,7 +4311,7 @@ static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) } mm_inc_nr_ptes(vma->vm_mm); - pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + pmd_populate_replicated(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { @@ -4004,7 +4351,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, get_master_pmd(vmf->pmd), vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. @@ -4060,7 +4407,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + set_pmd_at_replicated(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); @@ -4101,6 +4448,8 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) pte_t entry; vm_fault_t ret; + BUG_ON(PageReplicated(compound_head(page))); + if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) @@ -4134,8 +4483,10 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } + entry = maybe_mk_pbha_bit0(entry, vma); - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); + + set_pte_at_replicated(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, vmf->address, vmf->pte); @@ -4476,6 +4827,41 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } +#ifdef CONFIG_USER_REPLICATION + +static int numa_replicate_page(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned long start = vmf->address & PAGE_MASK; + int error = 0; + + mmap_assert_locked(mm); + + if (WARN_ON_ONCE(!(vma->vm_flags & VM_REPLICA_COMMIT))) + goto out; + + /* + * This should not be possible, + * because we have just handled page fault up to pmd level, + * so pmd tables must exist and be replicated. + * In fact, event pte level tables must be replicated at this point. + */ + BUG_ON(pmd_none(*vmf->pmd) || !numa_pgtable_replicated(vmf->pmd)); + + if (phys_duplicate_pte_range(vma, vmf->pmd, start, start + PAGE_SIZE) != start + PAGE_SIZE) { + error = -ENOMEM; + goto out; + } + + flush_tlb_page(vma, start); + +out: + return error; +} + +#endif + static vm_fault_t do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -4488,6 +4874,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) bool was_writable = pte_savedwrite(vmf->orig_pte); int flags = 0; + /* * The "pte" at this point cannot be used safely without * validation through pte_unmap_same(). It's of NUMA type but @@ -4504,12 +4891,12 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) * Make it present again, Depending on how arch implementes non * accessible ptes, some can allow access by kernel mode. 
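numa_replicate_page() above is driven from the NUMA hint fault path that follows. A simplified view of that decision, using only helpers introduced by this series (locking, statistics and the fallback to migration are omitted):

/* Simplified view of the do_numa_page() hook below; not a drop-in. */
static bool try_replicate_on_numa_fault(struct vm_fault *vmf)
{
	if (get_data_replication_policy(vmf->vma->vm_mm) == DATA_REPLICATION_NONE)
		return false;
	if (!vma_might_be_replicated(vmf->vma))
		return false;
	if (numa_replicate_page(vmf))
		return false;	/* -ENOMEM: fall back to normal handling */

	/* The fault is now satisfied from a node-local copy of the page
	 * (reported as TNF_FAULT_LOCAL) instead of migrating it towards the
	 * faulting node. */
	return true;
}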
*/ - old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte); + old_pte = ptep_modify_prot_start_replicated(vma, vmf->address, vmf->pte); pte = pte_modify(old_pte, vma->vm_page_prot); pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte); + ptep_modify_prot_commit_replicated(vma, vmf->address, vmf->pte, old_pte, pte); update_mmu_cache(vma, vmf->address, vmf->pte); page = vm_normal_page(vma, vmf->address, pte); @@ -4518,6 +4905,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) return 0; } + BUG_ON(page && PageReplicated(compound_head(page))); + /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -4547,6 +4936,27 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf) target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); pte_unmap_unlock(vmf->pte, vmf->ptl); + +#ifdef CONFIG_USER_REPLICATION + if (get_data_replication_policy(vma->vm_mm) != DATA_REPLICATION_NONE) { + if (vma_might_be_replicated(vma)) { + if (!numa_replicate_page(vmf)) { + vmf->replica_action = REPLICA_NONE; + put_page(page); + flags |= TNF_FAULT_LOCAL; + if (target_nid != NUMA_NO_NODE) + page_nid = target_nid; + goto out; + } + } + + if (vma->vm_file && ((vma->vm_flags & (VM_READ|VM_WRITE)) == VM_READ)) { + put_page(page); + return 0; + } + } +#endif + if (target_nid == NUMA_NO_NODE) { put_page(page); goto out; @@ -4647,7 +5057,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud) static vm_fault_t handle_pte_fault(struct vm_fault *vmf) { pte_t entry; + vm_fault_t ret; bool is_write = vmf->flags & FAULT_FLAG_WRITE; + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) if (unlikely(pmd_none(*vmf->pmd))) { /* @@ -4686,17 +5099,35 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) } if (!vmf->pte) { - if (vma_is_anonymous(vmf->vma)) - return do_anonymous_page(vmf); - else - return do_fault(vmf); + if (vma_is_anonymous(vmf->vma)) { + UREPLICA_DEBUG(start = ktime_get()); + ret = do_anonymous_page(vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_do_anonymous_page(end - start);) + } else { + UREPLICA_DEBUG(start = ktime_get()); + ret = do_fault(vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_do_fault(end - start);) + } + goto replication; } - if (!pte_present(vmf->orig_pte)) - return do_swap_page(vmf); + if (!pte_present(vmf->orig_pte)) { + UREPLICA_DEBUG(start = ktime_get()); + ret = do_swap_page(vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_do_swap_page(end - start);) + return ret; + } - if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) - return do_numa_page(vmf); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) { + UREPLICA_DEBUG(start = ktime_get()); + ret = do_numa_page(vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_do_numa_page(end - start);) + return ret; + } vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); @@ -4706,12 +5137,17 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) goto unlock; } if (is_write) { - if (!pte_write(entry)) - return do_wp_page(vmf); + if (!pte_write(entry)) { + UREPLICA_DEBUG(start = ktime_get()); + ret = do_wp_page(vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_do_wp_page(end - start);) + return ret; + } entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + if 
(ptep_set_access_flags_replicated(vmf->vma, vmf->address, vmf->pte, entry, is_write)) { update_mmu_cache(vmf->vma, vmf->address, vmf->pte); if (is_write) @@ -4733,8 +5169,54 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; + +replication: + +#ifdef CONFIG_USER_REPLICATION +/* + * How about a little bit of replication ?? + */ + if (get_data_replication_policy(vmf->vma->vm_mm) == DATA_REPLICATION_ALL && vma_might_be_replicated(vmf->vma)) { + unsigned long start = vmf->address & PAGE_MASK; + + if (WARN_ON_ONCE(pmd_none(*vmf->pmd) || !numa_pgtable_replicated(vmf->pmd))) + return ret; + if (phys_duplicate_pte_range(vmf->vma, vmf->pmd, start, start + PAGE_SIZE) == start + PAGE_SIZE) + flush_tlb_page(vmf->vma, start); + } +#endif + + return ret; +} +#ifndef CONFIG_USER_REPLICATION + +static pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address) +{ + return pgd_offset(vmf->vma->vm_mm, address); +} + +static p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return p4d_alloc(mm, pgd, address); } +static pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address) +{ + return pud_alloc(mm, p4d, address); +} + +static pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return pmd_alloc(mm, pud, address); +} + +static int fault_pte_alloc(struct vm_fault *vmf) +{ + return 0; +} + +#endif /* CONFIG_USER_REPLICATION */ + /* * By the time we get here, we already hold the mm semaphore * @@ -4747,6 +5229,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, struct vm_fault vmf = { .vma = vma, .address = address & PAGE_MASK, + .real_address = address, .flags = flags, .pgoff = linear_page_index(vma, address), .gfp_mask = __get_fault_gfp_mask(vma), @@ -4756,13 +5239,20 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, pgd_t *pgd; p4d_t *p4d; vm_fault_t ret; - - pgd = pgd_offset(mm, address); - p4d = p4d_alloc(mm, pgd, address); + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) + + pgd = fault_pgd_offset(&vmf, address); + UREPLICA_DEBUG(start = ktime_get()); + p4d = fault_p4d_alloc(&vmf, mm, pgd, address); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_fault_p4d_alloc(end - start);) if (!p4d) return VM_FAULT_OOM; - - vmf.pud = pud_alloc(mm, p4d, address); + UREPLICA_DEBUG(start = ktime_get()); + vmf.pud = fault_pud_alloc(&vmf, mm, p4d, address); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_fault_pud_alloc(end - start);) if (!vmf.pud) return VM_FAULT_OOM; retry_pud: @@ -4775,7 +5265,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, barrier(); if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) { - /* NUMA case for anonymous PUDs would go here */ if (dirty && !pud_write(orig_pud)) { @@ -4788,8 +5277,10 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, } } } - - vmf.pmd = pmd_alloc(mm, vmf.pud, address); + UREPLICA_DEBUG(start = ktime_get()); + vmf.pmd = fault_pmd_alloc(&vmf, mm, vmf.pud, address); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_fault_pmd_alloc(end - start);) if (!vmf.pmd) return VM_FAULT_OOM; @@ -4821,13 +5312,20 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&vmf); + huge_pmd_set_accessed_replicated(&vmf); return 0; } } } - return handle_pte_fault(&vmf); + if 
(fault_pte_alloc(&vmf)) + return VM_FAULT_OOM; + + UREPLICA_DEBUG(start = ktime_get()); + ret = handle_pte_fault(&vmf); + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_handle_pte_fault(end - start);) + return ret; } /** @@ -4901,6 +5399,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { vm_fault_t ret; + UREPLICA_DEBUG(ktime_t start; + ktime_t end;) __set_current_state(TASK_RUNNING); @@ -4924,9 +5424,12 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, if (unlikely(is_vm_hugetlb_page(vma))) ret = hugetlb_fault(vma->vm_mm, vma, address, flags); - else + else { + UREPLICA_DEBUG(start = ktime_get()); ret = __handle_mm_fault(vma, address, flags); - + UREPLICA_DEBUG(end = ktime_get(); + trace_mm_ureplica_cost_handle_mm_fault(end - start);) + } if (flags & FAULT_FLAG_USER) { mem_cgroup_exit_user_fault(); /* @@ -4948,24 +5451,6 @@ EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_P4D_FOLDED #ifdef CONFIG_KERNEL_REPLICATION -static void __p4d_populate_to_replicas(struct mm_struct *mm, - p4d_t *p4d, - unsigned long address) -{ - int nid; - pgd_t *pgd; - - if (mm_p4d_folded(mm) || !is_text_replicated()) - return; - - for_each_memory_node(nid) { - pgd = pgd_offset_pgd(mm->pgd_numa[nid], address); - if (pgd_present(*pgd)) - continue; - pgd_populate(mm, pgd, p4d); - } -} - int __p4d_alloc_node(unsigned int nid, struct mm_struct *mm, pgd_t *pgd, unsigned long address) @@ -4984,11 +5469,6 @@ int __p4d_alloc_node(unsigned int nid, spin_unlock(&mm->page_table_lock); return 0; } -#else -static void __p4d_populate_to_replicas(struct mm_struct *mm, - p4d_t *p4d, - unsigned long address) -{ } #endif /* CONFIG_KERNEL_REPLICATION */ /* @@ -5006,10 +5486,8 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ p4d_free(mm, new); - else { - pgd_populate(mm, pgd, new); - __p4d_populate_to_replicas(mm, new, address); - } + else + pgd_populate_replicated(mm, pgd, new); spin_unlock(&mm->page_table_lock); return 0; } @@ -5018,24 +5496,6 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) #ifndef __PAGETABLE_PUD_FOLDED #ifdef CONFIG_KERNEL_REPLICATION -static void __pud_populate_to_replicas(struct mm_struct *mm, - pud_t *pud, - unsigned long address) -{ - int nid; - p4d_t *p4d; - - if (!mm_p4d_folded(mm) || !is_text_replicated()) - return; - - for_each_online_node(nid) { - p4d = (p4d_t *)pgd_offset_pgd(mm->pgd_numa[nid], address); - if (p4d_present(*p4d)) - continue; - p4d_populate(mm, p4d, pud); - } -} - int __pud_alloc_node(unsigned int nid, struct mm_struct *mm, p4d_t *p4d, unsigned long address) @@ -5044,23 +5504,17 @@ int __pud_alloc_node(unsigned int nid, if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); - smp_wmb(); /* See comment in pmd_install() */ p4d_populate(mm, p4d, new); - } else /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); return 0; } -#else -static void __pud_populate_to_replicas(struct mm_struct *mm, - pud_t *pud, - unsigned long address) -{ - return; -} #endif /* CONFIG_KERNEL_REPLICATION */ /* @@ -5078,8 +5532,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); - 
p4d_populate(mm, p4d, new); - __pud_populate_to_replicas(mm, new, address); + p4d_populate_replicated(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); @@ -5104,7 +5557,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); - pud_populate(mm, pud, new); + pud_populate_replicated(mm, pud, new); } else /* Another has populated it */ pmd_free(mm, new); spin_unlock(ptl); @@ -5121,14 +5574,14 @@ int __pmd_alloc_node(unsigned int nid, if (!new) return -ENOMEM; + smp_wmb(); /* See comment in __pte_alloc */ + ptl = pud_lock(mm, pud); if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); - smp_wmb(); /* See comment in pmd_install() */ pud_populate(mm, pud, new); - } else { /* Another has populated it */ + } else /* Another has populated it */ pmd_free(mm, new); - } spin_unlock(ptl); return 0; } @@ -5686,6 +6139,25 @@ void __init ptlock_cache_init(void) SLAB_PANIC, NULL); } +#ifdef CONFIG_KERNEL_REPLICATION +bool ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmem_cache_alloc(page_ptl_cachep, GFP_KERNEL); + if (!ptl) + return false; + page->ptl = ptl; + page->master_table = page; + return true; +} + +void ptlock_free(struct page *page) +{ + kmem_cache_free(page_ptl_cachep, page->ptl); + page->master_table = NULL; +} +#else bool ptlock_alloc(struct page *page) { spinlock_t *ptl; @@ -5703,6 +6175,8 @@ void ptlock_free(struct page *page) } #endif +#endif + #ifdef CONFIG_PIN_MEMORY vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, struct page *page) @@ -5751,7 +6225,7 @@ vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, page_add_new_anon_rmap(page, vma, address, false); lru_cache_add_inactive_or_unevictable(page, vma); - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at_replicated(vma->vm_mm, address, pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, address, pte); @@ -5785,9 +6259,10 @@ struct page *walk_to_page_node(int nid, const void *vmalloc_addr) pte_t *ptep, pte; if (!is_text_replicated()) - nid = 0; + pgd = pgd_offset_pgd(init_mm.pgd, addr); + else + pgd = pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); - pgd = pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); if (pgd_none(*pgd)) return NULL; if (WARN_ON_ONCE(pgd_leaf(*pgd))) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 95167512e0545..269edec067ea3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -103,6 +103,7 @@ #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> +#include <linux/numa_user_replication.h> #include <linux/share_pool_interface.h> @@ -622,6 +623,12 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, continue; if (!queue_pages_required(page, qp)) continue; + /* + * If vma contains replicated memory, we are not going to move these pages. 
+ */ + if (PageReplicated(compound_head(page))) + continue; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ if (!vma_migratable(vma)) { diff --git a/mm/migrate.c b/mm/migrate.c index cf8c05ea821e9..4f3fffab7b871 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include <linux/ptrace.h> #include <linux/oom.h> #include <linux/dynamic_hugetlb.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> @@ -260,7 +261,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, if (PageHuge(new)) { pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, vma, new, 0); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + set_huge_pte_at_replicated(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else @@ -268,7 +269,7 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); + set_pte_at_replicated(vma->vm_mm, pvmw.address, pvmw.pte, pte); reliable_page_counter(new, vma->vm_mm, 1); if (PageAnon(new)) @@ -285,7 +286,6 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, pvmw.address, pvmw.pte); } - return true; } @@ -1694,7 +1694,8 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, mmap_read_lock(mm); err = -EFAULT; vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start || !vma_migratable(vma)) + // if page belongs to fully replicated vma, we don't want to move it here. + if (!vma || addr < vma->vm_start || !vma_migratable(vma) || vma_has_replicas(vma)) goto out; /* FOLL_DUMP to ignore special (like zero) pages */ @@ -1879,7 +1880,10 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma || addr < vma->vm_start) + /* + * Skip fully replicated vmas (done) + */ + if (!vma || addr < vma->vm_start || vma_has_replicas(vma)) goto set_status; /* FOLL_DUMP to ignore special (like zero) pages */ @@ -2168,6 +2172,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, else new = alloc_misplaced_dst_page; + BUG_ON(PageReplicated(page)); + /* * Don't migrate file pages that are mapped in multiple processes * with execute permissions as they are probably shared libraries. @@ -2208,7 +2214,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, return 0; } #endif /* CONFIG_NUMA_BALANCING */ - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DEVICE_PRIVATE @@ -2386,6 +2391,11 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, * in one process. If we can lock the page, then we can safely * set up a special migration page table entry now. 
*/ + + /* + * If last level of the page table is replicated, + * handle this migration later + */ if (trylock_page(page)) { pte_t swp_pte; @@ -2407,7 +2417,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp, if (pte_swp_uffd_wp(pte)) swp_pte = pte_swp_mkuffd_wp(swp_pte); } - set_pte_at(mm, addr, ptep, swp_pte); + set_pte_at_replicated(mm, addr, ptep, swp_pte); /* * This is like regular unmap: we remove the rmap and @@ -2760,6 +2770,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (args->fault_page && !is_device_private_page(args->fault_page)) return -EINVAL; + if (numa_is_vma_replicant(args->vma)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; diff --git a/mm/mlock.c b/mm/mlock.c index 4e47dd274b91a..eaf082f2b64d5 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -23,6 +23,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -441,7 +442,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, page = follow_page(vma, start, FOLL_GET | FOLL_DUMP); if (page && !IS_ERR(page)) { - if (PageTransTail(page)) { + if (PageTransTail(page) || PageReplicated(compound_head(page))) { VM_BUG_ON_PAGE(PageMlocked(page), page); put_page(page); /* follow_page_mask() */ } else if (PageTransHuge(page)) { diff --git a/mm/mmap.c b/mm/mmap.c index 6bf5d219aece1..b40139e3f634a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -50,7 +50,7 @@ #include <linux/swapops.h> #include <linux/share_pool.h> #include <linux/ksm.h> - +#include <linux/numa_user_replication.h> #include <linux/uaccess.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -1173,6 +1173,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; int err; + int adjust_replicas_prev = 0, adjust_replicas_next = 0; /* * We later require that vma->vm_flags == vm_flags, @@ -1218,11 +1219,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, err = __vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL, prev); - } else /* cases 2, 5, 7 */ + adjust_replicas_prev = numa_is_vma_replicant(prev); + } else { /* cases 2, 5, 7 */ err = __vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL, prev); + adjust_replicas_prev = adjust_replicas_next = + numa_is_vma_replicant(prev); + } if (err) return NULL; + khugepaged_enter_vma_merge(prev, vm_flags); return prev; } @@ -1235,12 +1241,15 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx)) { - if (prev && addr < prev->vm_end) /* case 4 */ + if (prev && addr < prev->vm_end) { /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ + adjust_replicas_prev = adjust_replicas_next = + numa_is_vma_replicant(prev); + } else { /* cases 3, 8 */ err = __vma_adjust(area, addr, next->vm_end, next->vm_pgoff - pglen, NULL, next); + adjust_replicas_next = numa_is_vma_replicant(next); /* * In case 3 area is already equal to next and * this is a noop, but in case 8 "area" has @@ -1250,6 +1259,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } if (err) return NULL; + khugepaged_enter_vma_merge(area, vm_flags); return area; } @@ -1606,6 +1616,22 @@ unsigned long __do_mmap_mm(struct mm_struct *mm, struct file *file, if (flags & MAP_CHECKNODE) set_vm_checknode(&vm_flags, flags); +#ifdef CONFIG_USER_REPLICATION + if 
(flags & MAP_REPLICA) { + vm_flags |= VM_REPLICA_INIT; + } + + if (get_table_replication_policy(mm) == TABLE_REPLICATION_ALL) { + vm_flags |= VM_REPLICA_INIT; + } + + if ((get_table_replication_policy(mm) == TABLE_REPLICATION_MINIMAL) && !(vm_flags & VM_WRITE)) { + vm_flags |= VM_REPLICA_INIT; + } + + +#endif /* CONFIG_USER_REPLICATION */ + addr = __mmap_region_ext(mm, file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1823,6 +1849,8 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, /* * Can we just expand an old mapping? + * + * Note: If replicant vmas are merged, interval in mm::replica_ranges will be extended */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); @@ -1925,6 +1953,18 @@ static unsigned long __mmap_region(struct mm_struct *mm, struct file *file, file = vma->vm_file; ksm_add_vma(vma); out: + +#ifdef CONFIG_USER_REPLICATION + if (numa_is_vma_replicant(vma)) { + anon_vma_prepare(vma); + + if ((get_data_replication_policy(mm) != DATA_REPLICATION_NONE) && vma_might_be_replicated(vma)) { + vma->vm_flags |= VM_REPLICA_COMMIT; + } + + } +#endif /* CONFIG_USER_REPLICATION */ + perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -2291,7 +2331,11 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, return addr; } - info.flags = VM_UNMAPPED_AREA_TOPDOWN; + if (flags & MAP_REPLICA) + info.flags = 0; + else + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); @@ -3257,6 +3301,12 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (vma) goto out; +#ifdef CONFIG_USER_REPLICATION + if (get_table_replication_policy(mm) == TABLE_REPLICATION_ALL) { + flags |= VM_REPLICA_INIT; + } +#endif /* CONFIG_USER_REPLICATION */ + /* * create a vma struct for an anonymous mapping */ @@ -3276,6 +3326,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla ksm_add_vma(vma); out: perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; mm->data_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) @@ -3644,6 +3695,17 @@ static struct vm_area_struct *__install_special_mapping( vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); +#ifdef CONFIG_USER_REPLICATION + if (vma_want_table_replica(vma)) { + vma->vm_flags |= VM_REPLICA_INIT; + } + if (numa_is_vma_replicant(vma)) { + if ((get_data_replication_policy(mm) != DATA_REPLICATION_NONE) && vma_might_be_replicated(vma)) { + vma->vm_flags |= VM_REPLICA_COMMIT; + } + } +#endif /* CONFIG_USER_REPLICATION */ + perf_event_mmap(vma); return vma; diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1c66771f088bb..13533c2bb983d 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -7,6 +7,7 @@ #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/swap.h> +#include <linux/numa_user_replication.h> #include <asm/pgalloc.h> #include <asm/tlb.h> @@ -78,7 +79,7 @@ static void tlb_batch_list_free(struct mmu_gather *tlb) bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; - + int mem_nodes = num_node_state(N_MEMORY); VM_BUG_ON(!tlb->end); #ifdef CONFIG_MMU_GATHER_PAGE_SIZE @@ -90,8 +91,11 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ * Add the page and check if we are full. If so * force a flush. 
*/ + /* + * Leave enough entries for fully replicated page + */ batch->pages[batch->nr++] = page; - if (batch->nr == batch->max) { + if (batch->nr + mem_nodes > batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; @@ -101,6 +105,40 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ return false; } +bool __tlb_remove_replica_pages_size(struct mmu_gather *tlb, struct page **pages, int page_size) +{ + struct mmu_gather_batch *batch; + int mem_nodes = num_node_state(N_MEMORY); + int nid; + + VM_BUG_ON(!tlb->end); + +#ifdef CONFIG_MMU_GATHER_PAGE_SIZE + VM_WARN_ON(tlb->page_size != page_size); +#endif + + batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + /* + * Leave enough entries for fully replicated page + */ + for_each_memory_node(nid) { + batch->pages[batch->nr++] = pages[nid]; + } + if (batch->nr + mem_nodes > batch->max) { + if (!tlb_next_batch(tlb)) + return true; + batch = tlb->active; + } + VM_BUG_ON_PAGE(batch->nr > batch->max, pages[0]); + + return false; +} + + #endif /* MMU_GATHER_NO_GATHER */ #ifdef CONFIG_MMU_GATHER_TABLE_FREE diff --git a/mm/mprotect.c b/mm/mprotect.c index 4ca7035495684..7374815925582 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -29,18 +29,17 @@ #include <linux/uaccess.h> #include <linux/mm_inline.h> #include <linux/pgtable.h> +#include <linux/numa_user_replication.h> #include <asm/cacheflush.h> #include <asm/mmu_context.h> #include <asm/tlbflush.h> #include "internal.h" -static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, unsigned long end, pgprot_t newprot, - unsigned long cp_flags) +static unsigned long change_pte_entry(struct vm_area_struct *vma, unsigned long addr, + pte_t *pte, pgprot_t newprot, unsigned long cp_flags) { - pte_t *pte, oldpte; - spinlock_t *ptl; + pte_t oldpte = *pte; unsigned long pages = 0; int target_node = NUMA_NO_NODE; bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT; @@ -48,6 +47,132 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, bool uffd_wp = cp_flags & MM_CP_UFFD_WP; bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + + if (pte_present(oldpte)) { + pte_t ptent; + bool preserve_write = prot_numa && pte_write(oldpte); + + /* + * Avoid trapping faults against the zero or KSM + * pages. See similar comment in change_huge_pmd. + */ + if (prot_numa) { + struct page *page; + + /* Avoid TLB flush if possible */ + if (pte_protnone(oldpte)) + return pages; + + page = vm_normal_page(vma, addr, oldpte); + if (!page || PageKsm(page)) + return pages; + + /* Skip fully replicated memory */ + if (page && PageReplicated(compound_head(page))) + return pages; + + /* Also skip shared copy-on-write pages */ + if ((get_data_replication_policy(vma->vm_mm) == DATA_REPLICATION_NONE) && + is_cow_mapping(vma->vm_flags) && + page_count(page) != 1) + return pages; + + /* + * While migration can move some dirty pages, + * it cannot move them all from MIGRATE_ASYNC + * context. + */ + if (page_is_file_lru(page) && PageDirty(page)) + return pages; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. 
+ */ + if (target_node == page_to_nid(page)) + return pages; + } + + oldpte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_modify(oldpte, newprot); + if (preserve_write) + ptent = pte_mk_savedwrite(ptent); + + if (uffd_wp) { + ptent = pte_wrprotect(ptent); + ptent = pte_mkuffd_wp(ptent); + } else if (uffd_wp_resolve) { + /* + * Leave the write bit to be handled + * by PF interrupt handler, then + * things like COW could be properly + * handled. + */ + ptent = pte_clear_uffd_wp(ptent); + } + + /* Avoid taking write faults for known dirty pages */ + if (dirty_accountable && pte_dirty(ptent) && + (pte_soft_dirty(ptent) || + !(vma->vm_flags & VM_SOFTDIRTY))) { + ptent = pte_mkwrite(ptent); + } + ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); + pages++; + } else if (is_swap_pte(oldpte)) { + swp_entry_t entry = pte_to_swp_entry(oldpte); + pte_t newpte; + + if (is_write_migration_entry(entry)) { + /* + * A protection check is difficult so + * just be safe and disable write + */ + make_migration_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(oldpte)) + newpte = pte_swp_mksoft_dirty(newpte); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else if (is_write_device_private_entry(entry)) { + /* + * We do not preserve soft-dirtiness. See + * copy_one_pte() for explanation. + */ + make_device_private_entry_read(&entry); + newpte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(oldpte)) + newpte = pte_swp_mkuffd_wp(newpte); + } else { + newpte = oldpte; + } + + if (uffd_wp) + newpte = pte_swp_mkuffd_wp(newpte); + else if (uffd_wp_resolve) + newpte = pte_swp_clear_uffd_wp(newpte); + + if (!pte_same(oldpte, newpte)) { + set_pte_at(vma->vm_mm, addr, pte, newpte); + pages++; + } + } + + return pages; +} + +static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, pgprot_t newprot, + unsigned long cp_flags) +{ + pte_t *pte; + spinlock_t *ptl; + unsigned long pages = 0; + /* * Can be called with only the mmap_lock for reading by * prot_numa so we must check the pmd isn't constantly @@ -64,119 +189,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, */ pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - /* Get target node for single threaded private VMAs */ - if (prot_numa && !(vma->vm_flags & VM_SHARED) && - atomic_read(&vma->vm_mm->mm_users) == 1) - target_node = numa_node_id(); - flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); do { - oldpte = *pte; - if (pte_present(oldpte)) { - pte_t ptent; - bool preserve_write = prot_numa && pte_write(oldpte); - - /* - * Avoid trapping faults against the zero or KSM - * pages. See similar comment in change_huge_pmd. - */ - if (prot_numa) { - struct page *page; - - /* Avoid TLB flush if possible */ - if (pte_protnone(oldpte)) - continue; - - page = vm_normal_page(vma, addr, oldpte); - if (!page || PageKsm(page)) - continue; - - /* Also skip shared copy-on-write pages */ - if (is_cow_mapping(vma->vm_flags) && - page_count(page) != 1) - continue; - - /* - * While migration can move some dirty pages, - * it cannot move them all from MIGRATE_ASYNC - * context. - */ - if (page_is_file_lru(page) && PageDirty(page)) - continue; - - /* - * Don't mess with PTEs if page is already on the node - * a single-threaded process is running on. 
- */ - if (target_node == page_to_nid(page)) - continue; - } - - oldpte = ptep_modify_prot_start(vma, addr, pte); - ptent = pte_modify(oldpte, newprot); - if (preserve_write) - ptent = pte_mk_savedwrite(ptent); - - if (uffd_wp) { - ptent = pte_wrprotect(ptent); - ptent = pte_mkuffd_wp(ptent); - } else if (uffd_wp_resolve) { - /* - * Leave the write bit to be handled - * by PF interrupt handler, then - * things like COW could be properly - * handled. - */ - ptent = pte_clear_uffd_wp(ptent); - } - - /* Avoid taking write faults for known dirty pages */ - if (dirty_accountable && pte_dirty(ptent) && - (pte_soft_dirty(ptent) || - !(vma->vm_flags & VM_SOFTDIRTY))) { - ptent = pte_mkwrite(ptent); - } - ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent); - pages++; - } else if (is_swap_pte(oldpte)) { - swp_entry_t entry = pte_to_swp_entry(oldpte); - pte_t newpte; - - if (is_write_migration_entry(entry)) { - /* - * A protection check is difficult so - * just be safe and disable write - */ - make_migration_entry_read(&entry); - newpte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(oldpte)) - newpte = pte_swp_mksoft_dirty(newpte); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else if (is_write_device_private_entry(entry)) { - /* - * We do not preserve soft-dirtiness. See - * copy_one_pte() for explanation. - */ - make_device_private_entry_read(&entry); - newpte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(oldpte)) - newpte = pte_swp_mkuffd_wp(newpte); - } else { - newpte = oldpte; - } - - if (uffd_wp) - newpte = pte_swp_mkuffd_wp(newpte); - else if (uffd_wp_resolve) - newpte = pte_swp_clear_uffd_wp(newpte); - - if (!pte_same(oldpte, newpte)) { - set_pte_at(vma->vm_mm, addr, pte, newpte); - pages++; + pages += change_pte_entry(vma, addr, pte, newprot, cp_flags); +#ifdef CONFIG_USER_REPLICATION + if (numa_pgtable_replicated(pte)) { + unsigned long offset; + struct page *curr; + pte_t *curr_pte; + + for_each_pgtable_replica(curr, curr_pte, pte, offset) { + change_pte_entry(vma, addr, curr_pte, newprot, cp_flags); } } +#endif } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -332,7 +359,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma, unsigned long pages = 0; BUG_ON(addr >= end); - pgd = pgd_offset(mm, addr); + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); flush_cache_range(vma, addr, end); inc_tlb_flush_pending(mm); do { @@ -448,6 +475,16 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, } } +#ifdef CONFIG_USER_REPLICATION + if ((newflags & VM_WRITE) && vma_has_replicas(vma)) { + if (phys_deduplicate(vma, start, end - start, true)) + goto fail; + newflags &= ~VM_REPLICA_COMMIT; + } + + numa_mprotect_vm_flags_modify(&newflags, vma); +#endif + /* * First try to merge with previous and/or next vma. */ @@ -481,12 +518,16 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * held in write mode. */ vma->vm_flags = newflags; + dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot); vma_set_page_prot(vma); change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable ? MM_CP_DIRTY_ACCT : 0); +#ifdef CONFIG_USER_REPLICATION + numa_replication_post_mprotect(vma); +#endif /* * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major * fault on access. 
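Taken together, the mprotect() changes above define the user-visible life cycle of a replicated range: PROT_REPLICA (handled in do_mprotect_pkey() in the next hunk) creates per-node copies through phys_duplicate(), and making the range writable again collapses them through phys_deduplicate() in mprotect_fixup(). The following user-space sketch is illustrative only and not part of the patch; it assumes MAP_REPLICA and PROT_REPLICA are exposed via the patched uapi mman headers and passes MAP_POPULATE explicitly, as the comment in mm/numa_user_replication.c currently expects.

/* Illustrative user-space sketch, not part of the diff. */
#include <stddef.h>
#include <string.h>
#include <sys/mman.h>

static void *make_replicated_ro(size_t len)
{
	/* MAP_REPLICA marks the vma with VM_REPLICA_INIT at mmap() time. */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_REPLICA,
		       -1, 0);
	if (p == MAP_FAILED)
		return NULL;

	memset(p, 0x5a, len);		/* fill while still writable */

	/* phys_duplicate() refuses writable vmas, so drop PROT_WRITE first. */
	mprotect(p, len, PROT_READ);

	/* prot == PROT_REPLICA takes the phys_duplicate() path. */
	mprotect(p, len, PROT_REPLICA);
	return p;
}

/*
 * Making the range writable again collapses the per-node copies
 * through phys_deduplicate() in mprotect_fixup().
 */
static void collapse_replicas(void *p, size_t len)
{
	mprotect(p, len, PROT_READ | PROT_WRITE);
}

Replication is only attempted on non-writable, non-shared mappings, which is why the sketch drops PROT_WRITE before requesting replicas.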
@@ -518,6 +559,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP); const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); + bool arch_valid_prot; start = untagged_addr(start); @@ -533,7 +575,12 @@ static int do_mprotect_pkey(unsigned long start, size_t len, end = start + len; if (end <= start) return -ENOMEM; - if (!arch_validate_prot(prot, start)) + + arch_valid_prot = !arch_validate_prot(prot, start); +#ifdef CONFIG_USER_REPLICATION + arch_valid_prot = arch_valid_prot && (prot != PROT_REPLICA); +#endif + if (arch_valid_prot) return -EINVAL; reqprot = prot; @@ -541,6 +588,27 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (mmap_write_lock_killable(current->mm)) return -EINTR; + vma = find_vma(current->mm, start); + error = -ENOMEM; + if (!vma) + goto out; + prev = vma->vm_prev; + +#ifdef CONFIG_USER_REPLICATION + if (prot == PROT_REPLICA) { + error = -EINVAL; + if (vma->vm_flags & VM_SHARED) + goto out; + + error = phys_duplicate(vma, start, len); + if (error) + pr_info("Failed to replicate memory -- start:%zx; len:%zx PID: %d NAME: %s\n", + start, len, current->pid, current->comm); + + goto out; + } +#endif /* CONFIG_USER_REPLICATION */ + /* * If userspace did not allocate the pkey, do not let * them use it here. @@ -549,11 +617,6 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); - error = -ENOMEM; - if (!vma) - goto out; - prev = vma->vm_prev; if (unlikely(grows & PROT_GROWSDOWN)) { if (vma->vm_start >= end) goto out; diff --git a/mm/mremap.c b/mm/mremap.c index b8b694be40bdc..1f7401fe387c8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -26,6 +26,7 @@ #include <linux/userfaultfd_k.h> #include <linux/share_pool.h> #include <linux/userswap.h> +#include <linux/numa_user_replication.h> #include <asm/cacheflush.h> #include <asm/tlb.h> @@ -183,7 +184,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (pte_none(*old_pte)) continue; - pte = ptep_get_and_clear(mm, old_addr, old_pte); + pte = ptep_get_and_clear_replicated(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the @@ -199,7 +200,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, force_flush = true; pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); pte = move_soft_dirty_pte(pte); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at_replicated(mm, new_addr, new_pte, pte); } arch_leave_lazy_mmu_mode(); @@ -229,6 +230,11 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, spinlock_t *old_ptl, *new_ptl; struct mm_struct *mm = vma->vm_mm; pmd_t pmd; +#ifdef CONFIG_USER_REPLICATION + pmd_t pmd_numa[MAX_NUMNODES]; + bool old_pte_replicated = numa_pgtable_replicated(page_to_virt(pmd_pgtable(*old_pmd))); + bool new_pmd_replicated = numa_pgtable_replicated(new_pmd); +#endif if (!arch_supports_page_table_move()) return false; @@ -258,6 +264,16 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; +#ifdef CONFIG_USER_REPLICATION + /* + * In that case, we need to somehow get rid of page tables replicas of pte level + * I am not sure how to do it properly right now, so fallback to + * slowpath + */ + if (old_pte_replicated && !new_pmd_replicated) + return false; +#endif + /* * We 
don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. @@ -268,12 +284,37 @@ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pmd */ +#ifdef CONFIG_USER_REPLICATION + if (old_pte_replicated) { + int nid; + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + bool start; + + for_each_pgtable(curr, curr_pmd, old_pmd, nid, offset, start) + pmd_numa[nid] = *(curr_pmd); + } +#endif pmd = *old_pmd; - pmd_clear(old_pmd); + pmd_clear_replicated(old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); +#ifdef CONFIG_USER_REPLICATION + if (new_pmd_replicated && old_pte_replicated) { + int nid; + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + bool start; + + for_each_pgtable(curr, curr_pmd, new_pmd, nid, offset, start) + pmd_populate(mm, curr_pmd, pmd_pgtable(pmd_numa[nid])); + } else +#endif + pmd_populate_replicated(mm, new_pmd, pmd_pgtable(pmd)); + flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -297,6 +338,11 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, spinlock_t *old_ptl, *new_ptl; struct mm_struct *mm = vma->vm_mm; pud_t pud; +#ifdef CONFIG_USER_REPLICATION + pud_t pud_numa[MAX_NUMNODES]; + bool old_pmd_replicated = numa_pgtable_replicated(pud_pgtable(*old_pud)); + bool new_pud_replicated = numa_pgtable_replicated(new_pud); +#endif if (!arch_supports_page_table_move()) return false; @@ -307,6 +353,17 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; +#ifdef CONFIG_USER_REPLICATION + /* + * In that case, we would need to get rid of the page table replicas + * at the pte and pmd levels. + * It is not clear yet how to do that properly, so fall back to the + * slow path. + */ + if (old_pmd_replicated && !new_pud_replicated) + return false; +#endif + + /* + * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. 
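The for_each_pgtable() loops added to move_normal_pmd() and move_normal_pud() above follow the same rule as the change_pte_entry() loop in mm/mprotect.c earlier: once a page-table level is replicated, an update made through the current node's table must be mirrored into every other node's copy before the TLB is flushed. Below is a minimal sketch of that rule at the PTE level, for illustration only; it relies on the numa_pgtable_replicated() check and the for_each_pgtable_replica() iterator added by this series, and update_entry() is a hypothetical stand-in for the actual per-PTE modification.

/* Illustrative sketch, not part of the diff. */
static void update_pte_on_all_replicas(struct vm_area_struct *vma,
				       unsigned long addr, pte_t *pte)
{
	unsigned long offset;
	struct page *curr;
	pte_t *curr_pte;

	/* Modify the entry reached through the current node's table. */
	update_entry(vma, addr, pte);

	if (!numa_pgtable_replicated(pte))
		return;

	/* Mirror the same change into every other node's replica. */
	for_each_pgtable_replica(curr, curr_pte, pte, offset)
		update_entry(vma, addr, curr_pte);
}

The *_replicated() helpers used throughout the series (set_pte_at_replicated(), pmd_clear_replicated(), pmd_populate_replicated(), ...) appear to encapsulate exactly this propagation pattern.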
@@ -317,12 +374,39 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ +#ifdef CONFIG_USER_REPLICATION + if (old_pmd_replicated) { + int nid; + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + bool start; + + for_each_pgtable(curr, curr_pud, old_pud, nid, offset, start) + pud_numa[nid] = *(curr_pud); + } +#endif + pud = *old_pud; - pud_clear(old_pud); + + pud_clear_replicated(old_pud); VM_BUG_ON(!pud_none(*new_pud)); - pud_populate(mm, new_pud, pud_pgtable(pud)); +#ifdef CONFIG_USER_REPLICATION + if (new_pud_replicated && old_pmd_replicated) { + int nid; + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + bool start; + + for_each_pgtable(curr, curr_pud, new_pud, nid, offset, start) + pud_populate(mm, curr_pud, pud_pgtable(pud_numa[nid])); + } else +#endif + pud_populate_replicated(mm, new_pud, pud_pgtable(pud)); + flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -340,6 +424,9 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, #endif #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +/* + * Not supported on arm64, do not implement right now + */ static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { @@ -505,7 +592,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * PUD level if possible. */ extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); - old_pud = get_old_pud(vma->vm_mm, old_addr); if (!old_pud) continue; @@ -730,6 +816,12 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, return ERR_PTR(-EINVAL); } + /* + * For simplicity, remap is not supported for replicant vmas right now + */ + if (numa_is_vma_replicant(vma)) + return ERR_PTR(-EINVAL); + if (flags & MREMAP_DONTUNMAP && (!vma_is_anonymous(vma) || vma->vm_flags & VM_SHARED)) return ERR_PTR(-EINVAL); diff --git a/mm/numa_replication.c b/mm/numa_kernel_replication.c similarity index 82% rename from mm/numa_replication.c rename to mm/numa_kernel_replication.c index 4bd5b75188bac..2339765b723fd 100644 --- a/mm/numa_replication.c +++ b/mm/numa_kernel_replication.c @@ -1,15 +1,18 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/kernel.h> #include <linux/pagewalk.h> -#include <linux/numa_replication.h> +#include <linux/numa_kernel_replication.h> #include <linux/memblock.h> #include <linux/pgtable.h> #include <linux/hugetlb.h> #include <linux/kobject.h> #include <linux/debugfs.h> + #include <asm/sections.h> #include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/mmu_context.h> #define KERNEL_TEXT_START ((unsigned long)&_stext) #define KERNEL_TEXT_END ((unsigned long)&_etext) @@ -21,7 +24,7 @@ #define PAGES_PER_PMD (1 << PMD_ALLOC_ORDER) #define replication_log(data, fmt, args...) 
\ -({ \ +({ \ if (data && data->m) \ seq_printf(data->m, fmt, ##args); \ else \ @@ -50,6 +53,7 @@ struct dump_config { }; static bool text_replicated; +static propagation_level_t prop_level = NONE; /* * The first ready NUMA node, used as a source node * for kernel text and rodata replication @@ -66,6 +70,11 @@ static int node_to_memory_node[MAX_NUMNODES]; static bool pgtables_extra; static DEFINE_SPINLOCK(debugfs_lock); +propagation_level_t get_propagation_level(void) +{ + return prop_level; +} + bool is_text_replicated(void) { return text_replicated; @@ -127,8 +136,8 @@ static int p4d_callback(p4d_t *p4d, next = (addr & P4D_MASK) - 1 + P4D_SIZE; replication_log(c->data, - "P4D ADDR: 0x%p P4D VAL: 0x%016lx [%p --- %p]\n", - p4d, val, (void *)addr, (void *)next); + "P4D ADDR: 0x%p (REPL=%d) P4D VAL: 0x%016lx [%p --- %p]\n", + p4d, numa_pgtable_replicated(p4d), val, (void *)addr, (void *)next); if (c->p4d_extra_info) binary_dump(c->data, val); @@ -150,8 +159,8 @@ static int pud_callback(pud_t *pud, next = (addr & PUD_MASK) - 1 + PUD_SIZE; replication_log(c->data, - "PUD ADDR: 0x%p PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", - pud, val, pud_huge(*pud), (void *)addr, (void *)next); + "PUD ADDR: 0x%p (REPL=%d) PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", + pud, numa_pgtable_replicated(pud), val, pud_huge(*pud), (void *)addr, (void *)next); if (c->pud_extra_info) binary_dump(c->data, val); @@ -174,8 +183,8 @@ static int pmd_callback(pmd_t *pmd, next = (addr & PMD_MASK) - 1 + PMD_SIZE; replication_log(c->data, - "PMD ADDR: 0x%p PMD VAL: 0x%016lx huge(%d) [%p --- %p] to %p\n", - pmd, val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); + "PMD ADDR: 0x%p (REPL=%d) PMD VAL: 0x%016lx huge(%d) [%p --- %p] to %p\n", + pmd, numa_pgtable_replicated(pmd), val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); if (c->pmd_extra_info) binary_dump(c->data, val); @@ -198,8 +207,8 @@ static int pte_callback(pte_t *pte, next = (addr & PAGE_MASK) - 1 + PAGE_SIZE; replication_log(c->data, - "PTE ADDR: 0x%p PTE VAL: 0x%016lx [%p --- %p] to %p\n", - pte, val, (void *)addr, (void *)next, (void *)paddr); + "PTE ADDR: 0x%p (REPL=%d) PTE VAL: 0x%016lx [%p --- %p] to %p (REPL=%d)\n", + pte, numa_pgtable_replicated(pte), val, (void *)addr, (void *)next, (void *)paddr, PageReplicated(virt_to_page(phys_to_virt(paddr)))); if (c->pte_extra_info) binary_dump(c->data, val); @@ -207,6 +216,8 @@ static int pte_callback(pte_t *pte, return 0; } + + static int pte_hole_callback(unsigned long addr, unsigned long next, int depth, struct mm_walk *walk) { @@ -223,6 +234,7 @@ static void dump_pgtables(struct mm_struct *mm, { int nid = 0; int extra = pgtables_extra ? 
1 : 0; + bool locked = false; struct dump_config conf = { .pgd_extra_info = extra, .p4d_extra_info = extra, @@ -248,14 +260,22 @@ static void dump_pgtables(struct mm_struct *mm, replication_log(data, "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); - mmap_read_lock(mm); + + if (rwsem_is_locked(&mm->mmap_lock)) + locked = true; + else + mmap_read_lock(mm); + for_each_memory_node(nid) { replication_log(data, "NUMA node id #%d\n", nid); replication_log(data, "PGD: %p PGD phys: %p\n", mm->pgd_numa[nid], (void *)virt_to_phys(mm->pgd_numa[nid])); walk_page_range_novma(mm, start, end, &ops, mm->pgd_numa[nid], &conf); } - mmap_read_unlock(mm); + + if (!locked) + mmap_read_unlock(mm); + replication_log(data, "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); } @@ -388,21 +408,32 @@ static void replicate_memory(void *dst, unsigned long start, unsigned long end, p4d_t *p4d; pud_t *pud; pmd_t *pmd; + pte_t *pte; pgprot_t prot; - unsigned int nr_pmd = 0; + unsigned int offset_in_pages = 0; unsigned long vaddr = start; struct page *pages = virt_to_page(dst); memcpy(dst, lm_alias(start), end - start); - for (; vaddr < end; vaddr += PMD_SIZE, nr_pmd++) { + while (vaddr < end) { pgd = pgd_offset_pgd(node_desc[nid].pgd, vaddr); p4d = p4d_offset(pgd, vaddr); pud = pud_offset(p4d, vaddr); pmd = pmd_offset(pud, vaddr); - prot = pmd_pgprot(*pmd); + if (pmd_leaf(*pmd)) { + prot = pmd_pgprot(*pmd); + set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages += PAGES_PER_PMD; + vaddr += PMD_SIZE; + continue; + } + pte = pte_offset_kernel(pmd, vaddr); + prot = pte_pgprot(*pte); + set_pte(pte, pfn_pte(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages++; + vaddr += PAGE_SIZE; - set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + nr_pmd * PAGES_PER_PMD, prot)); } } @@ -420,6 +451,38 @@ static void replicate_kernel_rodata(int nid) KERNEL_RODATA_START, KERNEL_RODATA_END, nid); } +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pte(pud_t *dst, pud_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PMD_MASK; + unsigned long right = (end & PMD_MASK) - 1 + PMD_SIZE; + unsigned long addr; + + pmd_t *clone_pmd = pmd_offset(dst, left); + pmd_t *orig_pmd = pmd_offset(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += PMD_SIZE) { + pgtable_t new_pte; + + if (pmd_none(*orig_pmd) || pmd_huge(*orig_pmd) || + pmd_val(*orig_pmd) == 0) + goto skip; + + pmd_clear(clone_pmd); + new_pte = pte_alloc_one_node(nid, &init_mm); + pmd_populate(&init_mm, clone_pmd, new_pte); + BUG_ON(new_pte == NULL); + + copy_page(page_to_virt(pmd_pgtable(*clone_pmd)), page_to_virt(pmd_pgtable(*orig_pmd))); +skip: + clone_pmd++; + orig_pmd++; + } +} + //'-1' in next functions have only one purpose - prevent unsgined long overflow static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, unsigned long start, unsigned long end, @@ -436,7 +499,8 @@ static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, (addr >= left && addr < right); addr += PUD_SIZE) { pmd_t *new_pmd; - if (pud_none(*orig_pud) || pud_huge(*orig_pud)) + if (pud_none(*orig_pud) || pud_huge(*orig_pud) || + pud_val(*orig_pud) == 0) goto skip; pud_clear(clone_pud); @@ -444,6 +508,9 @@ static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, BUG_ON(new_pmd == NULL); copy_page(pud_pgtable(*clone_pud), pud_pgtable(*orig_pud)); + + replicate_pgt_pte(clone_pud, orig_pud, max(addr, start), + min(addr - 1 + PUD_SIZE, end), nid); skip: 
clone_pud++; orig_pud++; @@ -465,7 +532,8 @@ static void replicate_pgt_pud(pgd_t *dst, pgd_t *src, (addr >= left && addr < right); addr += P4D_SIZE) { pud_t *new_pud; - if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d)) + if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d) || + p4d_val(*orig_p4d) == 0) goto skip; p4d_clear(clone_p4d); @@ -555,7 +623,6 @@ static void replicate_pgtables(void) for_each_online_node(nid) { int memory_nid = numa_get_memory_node(nid); - init_mm.pgd_numa[nid] = node_desc[memory_nid].pgd; } @@ -588,6 +655,18 @@ void __init numa_replicate_kernel_text(void) } text_replicated = true; + + if (!mm_p4d_folded(&init_mm)) + prop_level = PGD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && !mm_pud_folded(&init_mm)) + prop_level = P4D_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && !mm_pmd_folded(&init_mm)) + prop_level = PUD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && mm_pmd_folded(&init_mm)) + prop_level = PMD_PROPAGATION; + + BUG_ON(prop_level == NONE); + numa_setup_pgd(); } @@ -621,16 +700,23 @@ void __init_or_module *numa_get_replica(void *vaddr, int nid) return node_desc[nid].text_vaddr + offset; } -nodemask_t __read_mostly replica_nodes = { { [0] = 1UL } }; - +extern nodemask_t replica_nodes; +unsigned long __read_mostly replica_count; void __init numa_replication_init(void) { int nid; + unsigned long align = PAGE_SIZE; +#ifdef CONFIG_ARM64_4K_PAGES + align = HPAGE_SIZE; +#else + align = CONT_PTE_SIZE; +#endif nodes_clear(replica_nodes); - + replica_count = 0; for_each_node_state(nid, N_MEMORY) { __node_set(nid, &replica_nodes); + replica_count++; } for_each_memory_node(nid) @@ -647,11 +733,11 @@ void __init numa_replication_init(void) } else { node_desc[nid].text_vaddr = memblock_alloc_try_nid( (KERNEL_TEXT_END - KERNEL_TEXT_START), - HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); node_desc[nid].rodata_vaddr = memblock_alloc_try_nid( (KERNEL_RODATA_END - KERNEL_RODATA_START), - HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); } BUG_ON(node_desc[nid].text_vaddr == NULL); diff --git a/mm/numa_user_replication.c b/mm/numa_user_replication.c new file mode 100644 index 0000000000000..7152e262303aa --- /dev/null +++ b/mm/numa_user_replication.c @@ -0,0 +1,3105 @@ +#include <linux/numa_user_replication.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/rmap.h> +#include <linux/kernel.h> + +#include <asm/tlb.h> + +#include "internal.h" + + +DEFINE_SPINLOCK(replication_candidates_lock); +LIST_HEAD(replication_candidates); + +/* copypaste from mm/memory.c:195 */ +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + + +static int pick_remaining_node(struct page *page, struct vm_area_struct *vma, unsigned long addr) +{ + return mpol_misplaced(page, vma, addr); +} + +static int phys_deduplicate_pte_entry(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long addr, bool alloc_new_page, struct page **new_page) +{ + spinlock_t *ptl; + struct pgtable_private zp; + int nid; + int remaining_node; + int error = 0; + struct page *remaining_page; + pte_t new_entry; + + pte_t *head_pte = 
pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + + if (pte_present(*head_pte)) { + struct page *page = vm_normal_page(vma, addr, *head_pte); + + if (!page) { + pte_unmap_unlock(head_pte, ptl); + return 0; + } + + if (page && !PageReplicated(compound_head(page))) { + pte_unmap_unlock(head_pte, ptl); + return 0; + } + } else { + pte_unmap_unlock(head_pte, ptl); + return 0; + } + + pgtable_update_pte(&zp, head_pte); + + for_each_memory_node(nid) { + zp.replica_pages[nid] = vm_normal_page(vma, addr, *zp.pte_numa[nid]); + } + + if (alloc_new_page) { + remaining_node = NUMA_NO_NODE; + remaining_page = *new_page; + *new_page = NULL; + + inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(remaining_page, vma->vm_mm, 1); + } else { + remaining_node = pick_remaining_node(zp.replica_pages[first_memory_node], vma, addr); + + if (remaining_node == -1) { + remaining_node = first_memory_node; + } + remaining_page = zp.replica_pages[remaining_node]; + } + + new_entry = pfn_pte(page_to_pfn(remaining_page), vma->vm_page_prot); + + if (alloc_new_page) { + void *src_vaddr = page_to_virt(zp.replica_pages[first_memory_node]); + void *dst_vaddr = page_to_virt(remaining_page); + + copy_page(dst_vaddr, src_vaddr); + } else { + ClearPageReplicated(remaining_page); + } + + page_add_new_anon_rmap(remaining_page, vma, addr, false); + lru_cache_add_inactive_or_unevictable(remaining_page, vma); + + for_each_memory_node(nid) { + if (nid == remaining_node) + continue; + pte_clear(vma->vm_mm, addr, zp.pte_numa[nid]); + set_pte_at(vma->vm_mm, addr, zp.pte_numa[nid], new_entry); + tlb_remove_tlb_entry(tlb, zp.pte_numa[nid], addr); + } + + + pte_unmap_unlock(head_pte, ptl); + + for_each_memory_node(nid) { + int res; + + if (nid == remaining_node) + continue; + + dec_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(zp.replica_pages[nid], vma->vm_mm, -1); + res = __tlb_remove_page(tlb, zp.replica_pages[nid]); + + if (unlikely(res)) + tlb_flush_mmu(tlb); + } + account_dereplicated_page(vma->vm_mm); + return error; +} + +static int prealloc_page_for_deduplication(struct vm_area_struct *vma, unsigned long addr, bool alloc_new_page, struct page **page) +{ + if (!alloc_new_page) + return 0; + if (*page) + return 0; + + *page = alloc_zeroed_user_highpage_movable(vma, addr); + + if (!(*page)) + return -ENOMEM; + + return 0; +} + +static int phys_deduplicate_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, bool alloc_new_page) +{ + int error = 0; + struct page *prealloc_page = NULL; + + tlb_change_page_size(tlb, PAGE_SIZE); + flush_tlb_batched_pending(vma->vm_mm); + arch_enter_lazy_mmu_mode(); + do { + error = prealloc_page_for_deduplication(vma, addr, alloc_new_page, &prealloc_page); + if (error) + goto out; + + error = phys_deduplicate_pte_entry(tlb, vma, pmd, addr, alloc_new_page, &prealloc_page); + if (error) + goto out; + + } while (addr += PAGE_SIZE, addr != end); + arch_leave_lazy_mmu_mode(); +out: + if (prealloc_page) + put_page(prealloc_page); + + return error; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +extern void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd); +extern gfp_t alloc_hugepage_direct_gfpmask(struct vm_area_struct *vma); + +static int prealloc_hugepage_for_deduplication(struct vm_area_struct *vma, unsigned long addr, bool alloc_new_page, struct page **page) +{ + gfp_t gfp; + + if (!alloc_new_page) + return 0; + if (*page) + return 0; + + gfp = alloc_hugepage_direct_gfpmask(vma); + *page = 
alloc_hugepage_vma(gfp, vma, addr, HPAGE_PMD_ORDER); + + if (!(*page)) + return -ENOMEM; + + prep_transhuge_page(*page); + return 0; +} + +static void copy_huge_page(void *to, void *from) +{ + int i; + + for (i = 0; i < HPAGE_PMD_NR; i++, to += PAGE_SIZE, from += PAGE_SIZE) { + copy_page(to, from); + } +} + +static int phys_deduplicate_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, bool alloc_new_page, struct page **new_page) +{ + spinlock_t *ptl; + int nid; + struct page *curr; + pmd_t *curr_pmd; + unsigned long offset; + bool start; + struct page *remaining_page; + pmd_t entry; + int remaining_node; + struct page *replica_pages[MAX_NUMNODES]; + + if (prealloc_hugepage_for_deduplication(vma, addr, alloc_new_page, new_page)) + return -ENOMEM; + + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -EAGAIN; + + if (is_huge_zero_pmd(*pmd)) + goto out; + + if (!pmd_present(*pmd)) + goto out; + + if (!PageReplicated(pmd_page(*pmd))) + goto out; + + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + replica_pages[nid] = pmd_page(*curr_pmd); + if (curr_pmd != get_master_pmd(curr_pmd)) + zap_deposited_table(vma->vm_mm, curr_pmd); + } + + if (alloc_new_page) { + remaining_node = NUMA_NO_NODE; + remaining_page = *new_page; + *new_page = NULL; + + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + reliable_page_counter(remaining_page, vma->vm_mm, HPAGE_PMD_NR); + } else { + remaining_node = pick_remaining_node(replica_pages[first_memory_node], vma, addr); + + if (remaining_node == -1) { + remaining_node = first_memory_node; + } + + remaining_page = replica_pages[remaining_node]; + } + + entry = mk_huge_pmd(remaining_page, vma->vm_page_prot); + + if (alloc_new_page) { + void *src_vaddr = page_to_virt(replica_pages[first_memory_node]); + void *dst_vaddr = page_to_virt(remaining_page); + + copy_huge_page(dst_vaddr, src_vaddr); + } else { + ClearPageReplicated(remaining_page); + } + + page_add_new_anon_rmap(remaining_page, vma, addr, true); + lru_cache_add_inactive_or_unevictable(remaining_page, vma); + + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + if (nid == remaining_node) + continue; + pmd_clear(curr_pmd); + atomic_dec(compound_mapcount_ptr(replica_pages[nid])); + set_pmd_at(vma->vm_mm, addr, curr_pmd, entry); + tlb_remove_pmd_tlb_entry(tlb, curr_pmd, addr); + } + + + spin_unlock(ptl); + + for_each_memory_node(nid) { + if (nid == remaining_node) + continue; + + add_mm_counter(vma->vm_mm, MM_ANONPAGES, -HPAGE_PMD_NR); + reliable_page_counter(replica_pages[nid], vma->vm_mm, -HPAGE_PMD_NR); + tlb_remove_page_size(tlb, replica_pages[nid], HPAGE_PMD_SIZE); + } + account_dereplicated_hugepage(vma->vm_mm); + + return 0; +out: + spin_unlock(ptl); + return 0; +} + +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + +static int phys_deduplicate_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, bool alloc_new_page, struct page **new_page) +{ + return 0; +} + +#endif + +static int phys_deduplicate_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, bool alloc_new_page) +{ + pmd_t *pmd; + unsigned long next; + int error = 0; + struct page *prealloc_hugepage = NULL; +retry: + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + + if (pmd_none(*pmd)) + continue; + if (is_swap_pmd(*pmd)) + continue; + if (pmd_devmap(*pmd)) + BUG(); + if (pmd_trans_huge(*pmd)) { + + /* 
Same as in the phys_duplicate */ + BUG_ON(next - addr != HPAGE_PMD_SIZE); + error = phys_deduplicate_huge_pmd(tlb, vma, pmd, addr, next, alloc_new_page, &prealloc_hugepage); + if (error == -EAGAIN) + goto retry; + } else { + error = phys_deduplicate_pte_range(tlb, vma, pmd, addr, next, alloc_new_page); + } + + if (error) + goto out; + + cond_resched(); + } while (pmd++, addr = next, addr != end); +out: + if (prealloc_hugepage) + put_page(prealloc_hugepage); + + return error; +} + +static int phys_deduplicate_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, + unsigned long addr, unsigned long end, bool alloc_new_page) +{ + pud_t *pud; + unsigned long next; + int error = 0; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (pud_none_or_clear_bad(pud)) + continue; + + error = phys_deduplicate_pmd_range(tlb, vma, pud, addr, next, alloc_new_page); + if (error) + goto out; + + } while (pud++, addr = next, addr != end); +out: + return error; +} + +static int phys_deduplicate_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, bool alloc_new_page) +{ + p4d_t *p4d; + unsigned long next; + int error = 0; + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + + if (p4d_none_or_clear_bad(p4d)) + continue; + + error = phys_deduplicate_pud_range(tlb, vma, p4d, addr, next, alloc_new_page); + if (error) + goto out; + + } while (p4d++, addr = next, addr != end); +out: + return error; +} + +static int phys_deduplicate_pgd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, + unsigned long end, bool alloc_new_page) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long start = addr; + int error = 0; + + BUG_ON(addr >= end); + + addr = start; + tlb_start_vma(tlb, vma); + pgd = pgd_offset(mm, addr); + do { + next = pgd_addr_end(addr, end); + + if (pgd_none_or_clear_bad(pgd)) + continue; + + error = phys_deduplicate_p4d_range(tlb, vma, pgd, addr, next, alloc_new_page); + if (error) + goto out; + + } while (pgd++, addr = next, addr != end); + + tlb_end_vma(tlb, vma); +out: + return error; +} + +/* + * Pages inside [addr; end) are 100% populated, + * so we can't skip some checks and simplify code. 
+ */ +static int phys_deduplicate_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, bool alloc_new_page) +{ + struct mmu_gather tlb; + int error = 0; + + if (addr == end) + return 0; + + tlb_gather_mmu(&tlb, vma->vm_mm, addr, end); + + error = phys_deduplicate_pgd_range(&tlb, vma, addr, end, alloc_new_page); + + tlb_finish_mmu(&tlb, addr, end); + + if (!error && printk_ratelimit()) { + pr_info("Deduplicated range: 0x%016lx --- 0x%016lx, mm: 0x%016lx, PID: %d name: %s\n", + addr, end, (unsigned long)(vma->vm_mm), vma->vm_mm->owner->pid, vma->vm_mm->owner->comm); + } + + BUG_ON(error && !alloc_new_page); + + return error; +} + +int numa_remove_replicas(struct vm_area_struct *vma, unsigned long start, unsigned long end, bool alloc_new_page) +{ + int error = 0; + + start = start & PAGE_MASK; + end = end & PAGE_MASK; + + error = phys_deduplicate_range(vma, start, end, alloc_new_page); + + + return error; +} + +int phys_deduplicate(struct vm_area_struct *vma, unsigned long start, size_t len, bool alloc_new_page) +{ + if (!vma) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + if (!(vma->vm_flags & VM_REPLICA_COMMIT)) + return -EINVAL; + + numa_remove_replicas(vma, vma->vm_start, vma->vm_end, alloc_new_page); + + return 0; +} + +int __fixup_fault(struct vm_area_struct *vma, unsigned long addr) +{ + return (handle_mm_fault(vma, addr, FAULT_FLAG_INTERRUPTIBLE | FAULT_FLAG_KILLABLE | + FAULT_FLAG_RETRY_NOWAIT | FAULT_FLAG_ALLOW_RETRY, NULL) & VM_FAULT_ERROR); +} + + +int fixup_fault(struct vm_area_struct *vma, unsigned long addr) +{ + vm_fault_t fault = __fixup_fault(vma, addr); + + if (fault & VM_FAULT_SIGBUS) + return 0; + return !!(fault & VM_FAULT_ERROR); +} + +static int phys_duplicate_pte_entry(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, struct page **replica_page) +{ + struct page *new_page; + struct page *orig_page; + int nid; + char *new_vaddr; + struct page *pg; + pte_t entry; + int pte_nid; + void *src_addr; + int reason = 0; + struct pgtable_private ptes; + spinlock_t *ptl; + pte_t *pte, *head_pte; +retry: + head_pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + + if (!numa_pgtable_replicated(head_pte)) { + vm_fault_t fault; + + pte_unmap_unlock(head_pte, ptl); + fault = __fixup_fault(vma, addr); + + if (fault & VM_FAULT_SIGBUS) + return -EBUSY; + if (fault & VM_FAULT_RETRY) + return -EBUSY; + if (fault) + return -ENOMEM; + + goto retry; + } + + + pgtable_update_pte(&ptes, head_pte); + + /* It could happen on a not yet faulted vaddr. Now we require from user to + * put MAP_POPULATE manually, but add it with MAP_REPLICA silently. + */ + + pte = ptes.pte_numa[first_memory_node]; + + for_each_memory_node(nid) { + /* + * For some unknown reasons, there are cases, when + * pte_level populated only on single node. 
This is not good, + * to avoid this check all ptes now, but this should not happening at all + */ + if (!pte_present(*ptes.pte_numa[nid])) { + pte_unmap_unlock(head_pte, ptl); + + return -EBUSY; + } + } + + if (pte_write(*pte)) { + reason = 2; + goto bug; + } + + + /* We can handle this case only for 0th node table (I hope so), + * because we are under pte_lock, which serializes migration pte modifications + */ + + orig_page = vm_normal_page(vma, addr, *pte); + + if (orig_page && PageReplicated(compound_head(orig_page))) { + pte_unmap_unlock(head_pte, ptl); + return -EBUSY; + } + + for_each_memory_node(nid) { + pte = ptes.pte_numa[nid]; + + pg = pte_page(*pte); + pte_nid = page_to_nid(pg); + + new_page = replica_page[nid]; + replica_page[nid] = NULL; + + new_vaddr = page_to_virt(new_page); + src_addr = page_to_virt(pg); + + copy_page(new_vaddr, src_addr); + + __SetPageUptodate(new_page); + + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); + reliable_page_counter(new_page, vma->vm_mm, 1); + entry = pfn_pte(page_to_pfn(new_page), vma->vm_page_prot); + + pte_clear(vma->vm_mm, addr, pte); + set_pte_at(vma->vm_mm, addr, pte, entry); + + update_mmu_cache(vma, addr, pte); + } + account_replicated_page(vma->vm_mm); + if (orig_page) { + dec_mm_counter_fast(vma->vm_mm, mm_counter(orig_page)); + reliable_page_counter(orig_page, vma->vm_mm, -1); + page_remove_rmap(orig_page, false); + } + + pte_unmap_unlock(head_pte, ptl); + + if (orig_page) { + free_pages_and_swap_cache(&orig_page, 1); + } + + return 0; + +bug: + dump_mm_pgtables(vma->vm_mm, addr, addr + PAGE_SIZE * 4 - 1); + + pr_info("Died because BUG_ON #%d\n", reason); + + BUG(); +} + +static void release_prealloc_pages(struct page **pages) +{ + int nid; + + for_each_memory_node(nid) { + if (pages[nid] != NULL) { + put_page(pages[nid]); + pages[nid] = NULL; + } + } +} + +static int prealloc_pages_for_replicas(struct mm_struct *mm, struct page **pages, int order) +{ + int nid; + gfp_t gfp = (GFP_HIGHUSER | __GFP_THISNODE) & (~__GFP_DIRECT_RECLAIM); + + if (order) + gfp |= __GFP_COMP; + for_each_memory_node(nid) { + /* + * Do not reclaim in case of memory shortage, just fail + * We already don't have enough memory. 
+ * Also, make replica pages unmovable + */ + pages[nid] = alloc_pages_node(nid, gfp, order); + if (pages[nid] == NULL) + goto fail; + SetPageReplicated(pages[nid]); + if (mem_cgroup_charge(pages[nid], mm, GFP_KERNEL)) + goto fail; + } + + for_each_memory_node(nid) { + cgroup_throttle_swaprate(pages[nid], GFP_KERNEL); + } + + return 0; + +fail: + release_prealloc_pages(pages); + return -ENOMEM; +} + +/* + * We must hold at least mmap_read_lock + */ +unsigned long phys_duplicate_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end) +{ + struct page *prealloc_pages[MAX_NUMNODES] = {}; + + flush_tlb_batched_pending(vma->vm_mm); + do { + int ret = 0; + + if (prealloc_pages_for_replicas(vma->vm_mm, prealloc_pages, 0)) + break; + ret = phys_duplicate_pte_entry(vma, pmd, addr, prealloc_pages); + if (ret) + release_prealloc_pages(prealloc_pages); + if (ret == -ENOMEM) + break; + + } while (addr += PAGE_SIZE, addr != end); + + + return addr; +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +static void release_deposit_pgtables(struct mm_struct *mm, pgtable_t *pgtables) +{ + int nid; + + for_each_memory_node(nid) { + if (pgtables[nid] != NULL) { + pte_free(mm, pgtables[nid]); + pgtables[nid] = NULL; + } + } +} + +static int prealloc_deposit_pgtables(struct mm_struct *mm, pgtable_t *pgtables) +{ + int nid; + + for_each_memory_node(nid) { + pgtables[nid] = pte_alloc_one_node(nid, mm); + if (!pgtables[nid]) + goto fail; + } + + return 0; +fail: + release_deposit_pgtables(mm, pgtables); + return -ENOMEM; +} + +int phys_duplicate_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end) +{ + pgtable_t deposit_ptes[MAX_NUMNODES] = {}; + struct page *prealloc_pages[MAX_NUMNODES] = {}; + spinlock_t *ptl; + int nid; + struct page *curr; + pmd_t *curr_pmd; + unsigned long offset; + bool start; + unsigned long ret = 0; + pmd_t orig_pmd; + pmd_t entry; + struct page *orig_page; + + if (prealloc_deposit_pgtables(vma->vm_mm, deposit_ptes)) { + ret = -ENOMEM; + goto out; + } + + if (prealloc_pages_for_replicas(vma->vm_mm, prealloc_pages, HPAGE_PMD_ORDER)) { + ret = -ENOMEM; + goto out; + } + + ptl = __pmd_trans_huge_lock(pmd, vma); + + if (!ptl) { + ret = -EAGAIN; + goto out; + } + + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + /* + * For some unknown reasons, there are cases, when + * pte_level populated only on single node. 
This is not good, + * to avoid this check all ptes now, but this should not happening at all + */ + if (!pmd_present(*curr_pmd)) { + spin_unlock(ptl); + + ret = 0; + goto out; + } + } + + orig_pmd = *pmd; + orig_page = pmd_page(orig_pmd); + + if (PageReplicated(orig_page)) { + spin_unlock(ptl); + ret = -EBUSY; + goto out; + } + + zap_deposited_table(vma->vm_mm, get_master_pmd(pmd)); + + for_each_pgtable(curr, curr_pmd, pmd, nid, offset, start) { + copy_huge_page(page_to_virt(prealloc_pages[nid]), page_to_virt(orig_page)); + prep_transhuge_page(prealloc_pages[nid]); + SetPageReplicated(prealloc_pages[nid]); + + entry = mk_huge_pmd(prealloc_pages[nid], vma->vm_page_prot); + + atomic_inc(compound_mapcount_ptr(prealloc_pages[nid])); + reliable_page_counter(prealloc_pages[nid], vma->vm_mm, HPAGE_PMD_NR); + + prealloc_pages[nid] = NULL; + + pgtable_trans_huge_deposit(vma->vm_mm, curr_pmd, deposit_ptes[nid]); + deposit_ptes[nid] = NULL; + + + pmd_clear(curr_pmd); + + set_pmd_at(vma->vm_mm, addr, curr_pmd, entry); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + + mm_inc_nr_ptes(vma->vm_mm); + + } + account_replicated_hugepage(vma->vm_mm); + if (!is_huge_zero_pmd(orig_pmd)) { + add_mm_counter(vma->vm_mm, mm_counter(orig_page), -HPAGE_PMD_NR); + reliable_page_counter(orig_page, vma->vm_mm, -HPAGE_PMD_NR); + page_remove_rmap(orig_page, true); + } + + spin_unlock(ptl); + + if (!is_huge_zero_pmd(orig_pmd)) { + free_pages_and_swap_cache(&orig_page, 1); + } + +out: + release_deposit_pgtables(vma->vm_mm, deposit_ptes); + release_prealloc_pages(prealloc_pages); + return ret; +} + +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + +int phys_duplicate_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end) +{ + return 0; +} + +#endif + +static unsigned long phys_duplicate_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next, last = addr; +retry: + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + + if (pmd_none(*pmd) || !numa_pgtable_replicated(pmd) || is_swap_pmd(*pmd)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + + if (pmd_devmap(*pmd)) { + BUG(); // not supported right now, probably only trans_huge will be + } + + if (pmd_trans_huge(*pmd)) { + int ret; + /* + * Leave this bug for now, + * need to carefully think how to handle this situation + */ + BUG_ON(next - addr != HPAGE_PMD_SIZE); + ret = phys_duplicate_huge_pmd(vma, pmd, addr, next); + if (ret == -EAGAIN) + goto retry; + if (!ret || ret == -EBUSY) + last = next; + } else { + last = phys_duplicate_pte_range(vma, pmd, addr, next); + } + + if (last != next) + break; + + cond_resched(); + } while (pmd++, addr = next, addr != end); + + return last; +} + +static unsigned long phys_duplicate_pud_range(struct vm_area_struct *vma, p4d_t *p4d, + unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next, last = addr; +retry: + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (pud_none_or_clear_bad(pud) || !numa_pgtable_replicated(pud)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + + last = phys_duplicate_pmd_range(vma, pud, addr, next); + + if (last != next) + break; + + } while (pud++, addr = next, addr != end); + + return last; +} + +static unsigned long phys_duplicate_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next, last = addr; +retry: + p4d = p4d_offset(pgd, addr); 
+ do { + next = p4d_addr_end(addr, end); + + if (p4d_none_or_clear_bad(p4d) || !numa_pgtable_replicated(p4d)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + + last = phys_duplicate_pud_range(vma, p4d, addr, next); + + if (last != next) + break; + + } while (p4d++, addr = next, addr != end); + + return last; +} + +static unsigned long phys_duplicate_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long start = addr; + unsigned long last = addr; + + BUG_ON(addr >= end); + + addr = start; + + flush_cache_range(vma, addr, end); + inc_tlb_flush_pending(mm); + + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); +retry: + + do { + next = pgd_addr_end(addr, end); + + if (pgd_none_or_clear_bad(pgd)) { + if (fixup_fault(vma, addr)) + break; + goto retry; + } + + last = phys_duplicate_p4d_range(vma, pgd, addr, next); + + if (last != next) + break; + + } while (pgd++, addr = next, addr != end); + + if (last == end) + flush_tlb_range(vma, start, end); + dec_tlb_flush_pending(mm); + + return last; + +} + +/* + * We must hold at least mmap_read_lock + */ +static unsigned long phys_duplicate_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) +{ + unsigned long last; + + if (unlikely(anon_vma_prepare(vma))) + return addr; + + last = phys_duplicate_pgd_range(vma, addr, end); + + return last; +} + +int numa_clone_pte(struct vm_area_struct *vma, unsigned long start, unsigned long end) +{ + unsigned long last = 0; + + start = start & PAGE_MASK; + end = end & PAGE_MASK; + + last = phys_duplicate_range(vma, start, end); + + if (last != end) { + phys_deduplicate_range(vma, start, last, false); + return -ENOMEM; + } + + return 0; +} + +int phys_duplicate(struct vm_area_struct *vma, unsigned long start, size_t len) +{ + int error = 0; + + if (!vma) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -ENOMEM; + } + + if (vma->vm_flags & VM_WRITE) + return -EINVAL; + + + if ((start < vma->vm_start) || (start + len > vma->vm_end)) { + pr_warn("Replication is possible only inside vma\n"); + pr_warn("vma->vm_start %zx; len %zx\n", vma->vm_start, + vma->vm_end - vma->vm_start); + return -ENOMEM; + } + + if (!numa_is_vma_replicant(vma)) { + pr_warn("%s -- %s:%d\n", __func__, __FILE__, __LINE__); + return -EINVAL; + } + + error = numa_clone_pte(vma, start, start + len); + if (!error) { + // pr_info("Successfully replicated memory -- start:%zx; len:%zx PID: %d NAME: %s\n", + // start, len, vma->vm_mm->owner->pid, vma->vm_mm->owner->comm); + vma->vm_flags |= VM_REPLICA_COMMIT; + } + + flush_tlb_range(vma, start, start + len); + + return error; +} + +void numa_replication_remove_from_candidate_list(struct mm_struct *mm) +{ + + + if (!mm->replication_ctl->in_candidate_list) + return; + + spin_lock(&mm->replication_ctl->lock); + + /* We are already not in this list */ + if (!mm->replication_ctl->in_candidate_list) + goto out; + + spin_lock_nested(&replication_candidates_lock, SINGLE_DEPTH_NESTING); + + list_del(&mm->replication_ctl->replication_candidates); + + spin_unlock(&replication_candidates_lock); + + mm->replication_ctl->in_candidate_list = false; +out: + spin_unlock(&mm->replication_ctl->lock); +} + +void numa_mm_apply_replication(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + if (get_user_replication_policy(mm)) + return; + + mmap_write_lock(mm); + + if (get_user_replication_policy(mm)) + goto out; + + for (vma = mm->mmap; vma; vma = 
vma->vm_next) { + vma->vm_flags |= VM_REPLICA_INIT; + + numa_replicate_pgtables_vma(vma); + + if (vma_might_be_replicated(vma)) { + phys_duplicate(vma, vma->vm_start, vma->vm_end - vma->vm_start); + /* + * Set this flag anyway, even if we failed. + * Our hopes are on numa balancer + */ + vma->vm_flags |= VM_REPLICA_COMMIT; + } + } + + set_user_replication_policy(mm, true); + numa_replication_remove_from_candidate_list(mm); +out: + mmap_write_unlock(mm); +} + + + +static inline bool replicated_p4d_level(struct vm_fault *vmf) +{ + //TODO Do something better + return mm_p4d_folded(vmf->vma->vm_mm) || vmf->p4d_replicated || pgd_none(*(vmf->pgd)); +} + +static inline bool replicated_pud_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return mm_pud_folded(vmf->vma->vm_mm) || vmf->pud_replicated || p4d_none(*(vmf->p4d)); +} + +static inline bool replicated_pmd_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return mm_pmd_folded(vmf->vma->vm_mm) || vmf->pmd_replicated || pud_none(*(vmf->pud)); +} + +static inline bool replicated_pte_level(struct vm_fault *vmf) +{ + //TODO Do something better + /* We don't have entries on this level, or they are not the same*/ + return vmf->pte_replicated || pmd_none(*(vmf->pmd)); +} + +static inline bool overlap_pmd_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & PMD_MASK) == (left & PMD_MASK)) || + ((address & PMD_MASK) == (right & PMD_MASK)); +} + +static inline bool overlap_pud_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & PUD_MASK) == (left & PUD_MASK)) || + ((address & PUD_MASK) == (right & PUD_MASK)); +} + +static inline bool overlap_p4d_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & P4D_MASK) == (left & P4D_MASK)) || + ((address & P4D_MASK) == (right & P4D_MASK)); +} + +static inline bool overlap_pgd_entry(unsigned long address, unsigned long left, unsigned long right) +{ + return ((address & PGDIR_MASK) == (left & PGDIR_MASK)) || + ((address & PGDIR_MASK) == (right & PGDIR_MASK)); +} + +static inline void get_replicant_neighbours(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, unsigned long *left, unsigned long *right) +{ + *left = ULONG_MAX; + *right = ULONG_MAX; + + if (numa_is_vma_replicant(vma)) + *left = *right = address; + +} + +static inline void __replication_path_action(struct vm_fault *vmf, bool replicated) +{ + if (vmf->replica_action != REPLICA_NONE) { + /* + * If we meet propagation action again, that means upper + * level has already been propagated and we don't have + * replicas anylower -- we need to completely switch + * to default handling. + */ + if (vmf->replica_action == REPLICA_PROPAGATE) + vmf->replica_action = REPLICA_NONE; + else + vmf->replica_action = replicated ? REPLICA_KEEP : REPLICA_PROPAGATE; + } +} + +static bool replication_path_pgd(struct vm_fault *vmf) +{ + bool p4d_folded = mm_p4d_folded(vmf->vma->vm_mm), replicated; + struct mm_struct *mm = vmf->vma->vm_mm; + unsigned long address = vmf->real_address; + /* There are replicated tables in our pgd entry or there is vma requiring it. Need to replicate next level. + * 5-level paging and folded p4d give us a lot of grief. 
+	 * If 5-level paging is disabled, the handle_mm_fault_pgd function does nothing except fill vmf->p4d_numa
+	 * with the same values as in vmf->pgd_numa, and propagation will not work correctly.
+	 * So we need to go into __handle_mm_fault_p4d_replicant, because we might still want to propagate it.
+	 */
+	get_replicant_neighbours(mm, vmf->vma, address, &(vmf->left_replicant), &(vmf->right_replicant));
+	if (!p4d_folded)
+		vmf->p4d_replicated = !pgd_none(*(vmf->pgd)) &&
+				PageReplicated(virt_to_page(pgd_page_vaddr(*vmf->pgd)));
+	replicated = p4d_folded || overlap_pgd_entry(address, vmf->left_replicant, vmf->right_replicant)
+			|| vmf->p4d_replicated;
+	/*
+	 * Here replica_action may be REPLICA_NONE, so we ignore that,
+	 * because we always replicate the top-level table.
+	 */
+	vmf->replica_action = replicated ? REPLICA_KEEP : REPLICA_PROPAGATE;
+	return replicated;
+}
+
+static bool replication_path_p4d(struct vm_fault *vmf)
+{
+	bool pud_folded = mm_pud_folded(vmf->vma->vm_mm), replicated;
+	unsigned long address = vmf->real_address;
+
+	if (vmf->replica_action == REPLICA_PROPAGATE) {
+		/*
+		 * We have already propagated the upper level,
+		 * so we'll never use XXX_replicated values again
+		 * during this fault.
+		 */
+		vmf->replica_action = REPLICA_NONE;
+		return false;
+	}
+
+	if (!pud_folded)
+		vmf->pud_replicated = !p4d_none(*(vmf->p4d)) &&
+				PageReplicated(virt_to_page(p4d_pgtable(*vmf->p4d)));
+	replicated = pud_folded || overlap_p4d_entry(address, vmf->left_replicant, vmf->right_replicant)
+			|| vmf->pud_replicated;
+	__replication_path_action(vmf, replicated);
+	return replicated;
+}
+
+static bool replication_path_pud(struct vm_fault *vmf)
+{
+	bool pmd_folded = mm_pmd_folded(vmf->vma->vm_mm), replicated;
+	unsigned long address = vmf->real_address;
+
+	if (vmf->replica_action == REPLICA_PROPAGATE) {
+		/*
+		 * We have already propagated the upper level,
+		 * so we'll never use XXX_replicated values again
+		 * during this fault.
+		 */
+		vmf->replica_action = REPLICA_NONE;
+		return false;
+	}
+
+	if (!pmd_folded)
+		vmf->pmd_replicated = !pud_none(*(vmf->pud)) &&
+				PageReplicated(virt_to_page(pud_pgtable(*vmf->pud)));
+	replicated = pmd_folded || overlap_pud_entry(address, vmf->left_replicant, vmf->right_replicant)
+			|| vmf->pmd_replicated;
+	__replication_path_action(vmf, replicated);
+	return replicated;
+}
+
+static bool replication_path_pmd(struct vm_fault *vmf)
+{
+	bool replicated;
+	unsigned long address = vmf->real_address;
+
+	if (vmf->replica_action == REPLICA_PROPAGATE) {
+		/*
+		 * We have already propagated the upper level,
+		 * so we'll never use XXX_replicated values again
+		 * during this fault.
+ */ + vmf->replica_action = REPLICA_NONE; + return false; + } + vmf->pte_replicated = !pmd_none(*(vmf->pmd)) && !pmd_devmap_trans_unstable(vmf->pmd) && + PageReplicated(pmd_pgtable(*vmf->pmd)); + replicated = overlap_pmd_entry(address, vmf->left_replicant, vmf->right_replicant) + || vmf->pte_replicated; + __replication_path_action(vmf, replicated); + return replicated; +} + +static void +release_replicated_p4d_tables(int allocated_node, p4d_t **new, struct mm_struct *mm) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + p4d_free(mm, new[nid]); + } + +} + +static void +release_replicated_pud_tables(int allocated_node, pud_t **new, struct mm_struct *mm) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + pud_free(mm, new[nid]); + } + +} + +static void +release_replicated_pmd_tables(int allocated_node, pmd_t **new, struct mm_struct *mm) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + pmd_free(mm, new[nid]); + } + +} + +static void +release_replicated_pte_tables(int allocated_node, struct page **new, struct mm_struct *mm) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == allocated_node || new[nid] == NULL) + continue; + if (allocated_node == NUMA_NO_NODE) { + ClearPageReplicated(new[nid]); + new[nid]->replica_list_node.next = NULL; + } + pte_free(mm, new[nid]); + } +} + +static void +sync_replicated_p4d_tables(int allocated_node, p4d_t **new, pgd_t *start_pgd, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct page *curr; + pgd_t *curr_pgd; + bool start; + + for_each_pgtable(curr, curr_pgd, start_pgd, nid, offset, start) { + SetPageReplicated(virt_to_page(new[allocated_node])); + + memcg_account_replicated_p4d_page(mm, new[nid]); + if (nid == allocated_node) + continue; + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + SetPageReplicated(virt_to_page(new[nid])); + + smp_wmb(); + pgd_populate(mm, curr_pgd, new[nid]); + } +#ifndef __PAGETABLE_P4D_FOLDED + account_replicated_table(mm); +#endif +} + +static void +sync_replicated_pud_tables(int allocated_node, pud_t **new, p4d_t *start_p4d, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct page *curr; + p4d_t *curr_p4d; + bool start; + /* + * Do not need locking from sync_replicated_pte_tables, + * because pud_lockptr == page_table_lock + */ + build_pud_chain(new); + set_master_page_for_puds(allocated_node, new); + for_each_pgtable(curr, curr_p4d, start_p4d, nid, offset, start) { + SetPageReplicated(virt_to_page(new[nid])); + memcg_account_replicated_pud_page(mm, new[nid]); + if (nid == allocated_node) + continue; + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + mm_inc_nr_puds(mm); + smp_wmb(); + p4d_populate(mm, curr_p4d, new[nid]); + } + account_replicated_table(mm); +} + +static void +sync_replicated_pmd_tables(int allocated_node, pmd_t **new, pud_t *start_pud, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct page *curr; + pud_t *curr_pud; + bool start; + /* + * Locking here the same as in the sync_replicated_pte_tables + */ + spinlock_t *ptl = NULL; + + if (allocated_node != NUMA_NO_NODE) { + ptl = pmd_lockptr(mm, new[allocated_node]); + spin_lock_nested(ptl, 1); + } + + BUILD_BUG_ON(!USE_SPLIT_PMD_PTLOCKS); + + build_pmd_chain(new); + set_master_page_for_pmds(allocated_node, new); + for_each_pgtable(curr, curr_pud, start_pud, nid, 
offset, start) { + SetPageReplicated(virt_to_page(new[nid])); + memcg_account_replicated_pmd_page(mm, new[nid]); + if (nid == allocated_node) + continue; + if (allocated_node != NUMA_NO_NODE) + copy_page(new[nid], new[allocated_node]); + + mm_inc_nr_pmds(mm); + smp_wmb(); + pud_populate(mm, curr_pud, new[nid]); + + } + account_replicated_table(mm); + + if (ptl) + spin_unlock(ptl); +} + +#ifdef CONFIG_ARM64 +static void +sync_replicated_pte_tables(int allocated_node, struct page **new, pmd_t *start_pmd, struct mm_struct *mm) +{ + int nid; + unsigned long offset; + struct page *curr; + pmd_t *curr_pmd; + bool start; + spinlock_t *ptl = NULL; + + /* Why we need (sometimes) ptl from allocated_node here? + * If replicate existed table, concurrent page fault might + * observe replicated table which content was not copied + * from original table yet. At this point master_locks are + * already set (which is lock from original table), so we + * need to hold it here. + * + * Obviously, if there was no any table before, + * we do not need to hold any pte lock at all, everything will be propagated + * correctly via replica_list + */ + BUILD_BUG_ON(!USE_SPLIT_PTE_PTLOCKS); + + if (allocated_node != NUMA_NO_NODE) { + ptl = ptlock_ptr(new[allocated_node]); + spin_lock_nested(ptl, 1); + + build_pte_chain(new); + set_master_page_for_ptes(allocated_node, new); + + for_each_memory_node(nid) { + SetPageReplicated(new[nid]); + if (nid == allocated_node) + continue; + copy_page(page_to_virt(new[nid]), page_to_virt(new[allocated_node])); + } + + } + + smp_wmb(); + + for_each_pgtable(curr, curr_pmd, start_pmd, nid, offset, start) { + /* + * We are safe to set this flag here even for original table, + * because replica list have already been created. + * So, in the case if some propagation will be required, + * we are able to do it, even if not all upper tables are populated yet + */ + memcg_account_replicated_pte_page(mm, page_to_virt(new[nid])); + if (nid == allocated_node) + continue; + + mm_inc_nr_ptes(mm); + + WRITE_ONCE(*curr_pmd, __pmd(__phys_to_pmd_val(page_to_phys(new[nid])) | PMD_TYPE_TABLE)); + } + + dsb(ishst); + isb(); + account_replicated_table(mm); + + if (ptl) + spin_unlock(ptl); +} +#endif + +static int +prepare_replicated_p4d_tables(int allocated_node, p4d_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + p4d_t *new_p4d; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_p4d = p4d_alloc_one_node(nid, mm, address); + + if (unlikely(!new_p4d)) + fail = true; + + new[nid] = new_p4d; + } + + if (unlikely(fail)) { + release_replicated_p4d_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pud_tables(int allocated_node, pud_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + pud_t *new_pud; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pud = pud_alloc_one_node(nid, mm, address); + + if (unlikely(!new_pud)) + fail = true; + + new[nid] = new_pud; + } + + if (unlikely(fail)) { + release_replicated_pud_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pmd_tables(int allocated_node, pmd_t **new, struct mm_struct *mm, unsigned long address) +{ + int nid; + pmd_t *new_pmd; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pmd = pmd_alloc_one_node(nid, mm, address); + + if (unlikely(!new_pmd)) + fail = true; + + 
new[nid] = new_pmd; + } + + if (unlikely(fail)) { + release_replicated_pmd_tables(allocated_node, new, mm); + return -ENOMEM; + } + + return 0; +} + +static int +prepare_replicated_pte_tables(int allocated_node, struct page **new, struct mm_struct *mm) +{ + int nid; + struct page *new_pte; + bool fail = false; + + for_each_memory_node(nid) { + if (nid == allocated_node) + continue; + new_pte = pte_alloc_one_node(nid, mm); + + if (unlikely(!new_pte)) + fail = true; + + new[nid] = new_pte; + } + + if (unlikely(fail)) { + release_replicated_pte_tables(allocated_node, new, mm); + return -ENOMEM; + } + + if (allocated_node == NUMA_NO_NODE) { + build_pte_chain(new); + set_master_page_for_ptes(allocated_node, new); + + for_each_memory_node(nid) { + SetPageReplicated(new[nid]); + } + } + + return 0; +} + +vm_fault_t replication_handle_pgd_fault(struct vm_fault *vmf) +{ + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + + vmf->pgd = pgd_offset_pgd(mm->pgd, address); + + return 0; +} + +/* TODO Need to clarify, how this going to work with and without 5-level paging*/ +static vm_fault_t replication_handle_p4d_fault(struct vm_fault *vmf) +{ + int ret; + p4d_t *p4d_tables[MAX_NUMNODES]; + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + +retry: + + /* See replication_handle_pgd_fault in mm/numa_replication.c */ + if (replicated_p4d_level(vmf)) { + if (!pgd_none(*vmf->pgd)) { + vmf->p4d = p4d_offset(vmf->pgd, address); + return 0; + } + ret = prepare_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*vmf->pgd)) { + /* Someone else has replicated this level */ + release_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, mm); + if (!PageReplicated(virt_to_page(pgd_page_vaddr(*(vmf->pgd))))) { + spin_unlock(&mm->page_table_lock); + goto retry; + } + } else + sync_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, vmf->pgd, mm); + spin_unlock(&mm->page_table_lock); + + } else { + p4d_t *table_page = (p4d_t *)pgd_page_vaddr(*(vmf->pgd)); + int p4d_node = page_to_nid(virt_to_page(table_page)); + + p4d_tables[p4d_node] = table_page; + ret = prepare_replicated_p4d_tables(p4d_node, p4d_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (PageReplicated(virt_to_page(table_page))) + /* Someone else has replicated this level */ + release_replicated_p4d_tables(p4d_node, p4d_tables, mm); + else + sync_replicated_p4d_tables(p4d_node, p4d_tables, vmf->pgd, mm); + spin_unlock(&mm->page_table_lock); + } + + vmf->p4d = p4d_offset(vmf->pgd, address); + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +static vm_fault_t replication_handle_pud_fault(struct vm_fault *vmf) +{ + int ret; + pud_t *pud_tables[MAX_NUMNODES]; + unsigned long address = vmf->real_address; + struct mm_struct *mm = vmf->vma->vm_mm; + +retry: + + /* See replication_handle_pgd_fault in mm/numa_replication.c */ + if (replicated_pud_level(vmf)) { + if (!p4d_none(*vmf->p4d)) { + vmf->pud = pud_offset(vmf->p4d, address); + return 0; + } + ret = prepare_replicated_pud_tables(NUMA_NO_NODE, pud_tables, mm, address); + if (ret) + goto fault_oom; + + spin_lock(&mm->page_table_lock); + if (p4d_present(*vmf->p4d)) { + /* Someone else has replicated this level */ + release_replicated_pud_tables(NUMA_NO_NODE, pud_tables, mm); + /* Concurrent normal fault and replicated (for example hugetlbfs fault for now or spurious on 
6.6)*/
+			if (!PageReplicated(virt_to_page(p4d_pgtable(*(vmf->p4d))))) {
+				spin_unlock(&mm->page_table_lock);
+				goto retry;
+			}
+		} else
+			sync_replicated_pud_tables(NUMA_NO_NODE, pud_tables, vmf->p4d, mm);
+		spin_unlock(&mm->page_table_lock);
+	} else {
+		pud_t *table_page = p4d_pgtable(*(vmf->p4d));
+		int pud_node = page_to_nid(virt_to_page(table_page));
+
+		pud_tables[pud_node] = table_page;
+		ret = prepare_replicated_pud_tables(pud_node, pud_tables, mm, address);
+		if (ret)
+			goto fault_oom;
+
+		spin_lock(&mm->page_table_lock);
+		if (PageReplicated(virt_to_page(table_page)))
+			/* Someone else has replicated this level */
+			release_replicated_pud_tables(pud_node, pud_tables, mm);
+		else
+			sync_replicated_pud_tables(pud_node, pud_tables, vmf->p4d, mm);
+		spin_unlock(&mm->page_table_lock);
+	}
+
+	vmf->pud = pud_offset(vmf->p4d, address);
+
+	return 0;
+
+fault_oom:
+	vmf->replica_action = REPLICA_FAIL;
+	return VM_FAULT_OOM;
+}
+
+static vm_fault_t replication_handle_pmd_fault(struct vm_fault *vmf)
+{
+	int ret;
+	pmd_t *pmd_tables[MAX_NUMNODES];
+	unsigned long address = vmf->real_address;
+	struct mm_struct *mm = vmf->vma->vm_mm;
+	spinlock_t *ptl;
+
+retry:
+	/* See replication_handle_pgd_fault in mm/numa_replication.c */
+	if (replicated_pmd_level(vmf)) {
+		if (!pud_none(*vmf->pud)) {
+			vmf->pmd = pmd_offset(vmf->pud, address);
+			return 0;
+		}
+		ret = prepare_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, mm, address);
+		if (ret)
+			goto fault_oom;
+
+		ptl = pud_lock(mm, vmf->pud);
+		if (pud_present(*vmf->pud)) {
+			/* Someone else has replicated this level */
+			release_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, mm);
+			if (!PageReplicated(virt_to_page(pud_pgtable(*(vmf->pud))))) {
+				spin_unlock(ptl);
+				goto retry;
+			}
+
+		} else
+			sync_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, vmf->pud, mm);
+		spin_unlock(ptl);
+	} else {
+		pmd_t *table_page = pud_pgtable(*(vmf->pud));
+		int pmd_node = page_to_nid(virt_to_page(table_page));
+
+		pmd_tables[pmd_node] = table_page;
+		ret = prepare_replicated_pmd_tables(pmd_node, pmd_tables, mm, address);
+		if (ret)
+			goto fault_oom;
+
+		ptl = pud_lock(mm, vmf->pud);
+		if (PageReplicated(virt_to_page(table_page)))
+			/* Someone else has replicated this level */
+			release_replicated_pmd_tables(pmd_node, pmd_tables, mm);
+		else
+			sync_replicated_pmd_tables(pmd_node, pmd_tables, vmf->pud, mm);
+		spin_unlock(ptl);
+	}
+
+	vmf->pmd = pmd_offset(vmf->pud, address);
+
+	return 0;
+
+fault_oom:
+	vmf->replica_action = REPLICA_FAIL;
+	return VM_FAULT_OOM;
+}
+
+static vm_fault_t replication_handle_pte_fault(struct vm_fault *vmf)
+{
+	int ret;
+	struct mm_struct *mm = vmf->vma->vm_mm;
+	struct page *pte_tables[MAX_NUMNODES];
+	spinlock_t *ptl;
+
+retry:
+
+	if (!pmd_none(*vmf->pmd) && pmd_devmap_trans_unstable(vmf->pmd))
+		return 0;
+
+	if (replicated_pte_level(vmf)) {
+		/*
+		 * If the pmd from the 0th node is populated and the PageReplicated flag is set,
+		 * we don't care whether the other nodes are populated or not,
+		 * because the pgtable lists are already built and we can use them.
+		 */
+		if (!pmd_none(*vmf->pmd))
+			return 0;
+		ret = prepare_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm);
+		if (ret)
+			goto fault_oom;
+		ptl = pmd_lock(mm, vmf->pmd);
+		if (unlikely(pmd_present(*vmf->pmd))) {
+			spin_unlock(ptl);
+			/* Someone else has replicated this level */
+			release_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm);
+			if (!PageReplicated(pmd_pgtable(*(vmf->pmd)))) {
+				goto retry;
+			}
+		} else {
+			sync_replicated_pte_tables(NUMA_NO_NODE, pte_tables, vmf->pmd, mm);
+
spin_unlock(ptl); + } + } else { + struct page *table_page = pmd_pgtable(*(vmf->pmd)); + int pte_node = page_to_nid(table_page); + + pte_tables[pte_node] = table_page; + ret = prepare_replicated_pte_tables(pte_node, pte_tables, mm); + if (ret) + goto fault_oom; + + ptl = pmd_lock(mm, vmf->pmd); + if (unlikely(pmd_devmap_trans_unstable(vmf->pmd) || PageReplicated(table_page))) { + spin_unlock(ptl); + /* Someone else has replicated this level */ + release_replicated_pte_tables(pte_node, pte_tables, mm); + } else { + sync_replicated_pte_tables(pte_node, pte_tables, vmf->pmd, mm); + spin_unlock(ptl); + } + } + + return 0; + +fault_oom: + vmf->replica_action = REPLICA_FAIL; + return VM_FAULT_OOM; +} + +pgd_t *fault_pgd_offset(struct vm_fault *vmf, unsigned long address) +{ + vmf->pgd = pgd_offset_pgd(this_node_pgd(vmf->vma->vm_mm), address); + return vmf->pgd; +} + +p4d_t *fault_p4d_alloc(struct vm_fault *vmf, struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + if (replication_path_pgd(vmf)) { + if (replication_handle_p4d_fault(vmf)) + return NULL; + } else { + vmf->p4d = p4d_alloc(mm, pgd, address); + } + return vmf->p4d; +} + +pud_t *fault_pud_alloc(struct vm_fault *vmf, struct mm_struct *mm, p4d_t *p4d, unsigned long address) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_p4d(vmf)) { + if (replication_handle_pud_fault(vmf)) + return NULL; + } else { + vmf->pud = pud_alloc(mm, p4d, address); + } + return vmf->pud; +} + +pmd_t *fault_pmd_alloc(struct vm_fault *vmf, struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_pud(vmf)) { + if (replication_handle_pmd_fault(vmf)) + return NULL; + } else { + vmf->pmd = pmd_alloc(mm, pud, address); + } + return vmf->pmd; +} + +int fault_pte_alloc(struct vm_fault *vmf) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_pmd(vmf)) + return replication_handle_pte_fault(vmf); + return 0; +} + +pte_t *cpr_alloc_pte_map(struct mm_struct *mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd) +{ + struct page *pte_tables[MAX_NUMNODES]; + struct page *src_pte = pmd_pgtable(*src_pmd); + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + bool pte_replicated = numa_pgtable_replicated(page_to_virt(src_pte)); + bool pmd_replicated_dst = numa_pgtable_replicated(dst_pmd); + + if (pte_replicated && pmd_replicated_dst) { + if (!pmd_none(*dst_pmd)) { + return pte_offset_map(dst_pmd, addr); + } + if (prepare_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm)) + return NULL; + + spin_lock(&mm->page_table_lock); + sync_replicated_pte_tables(NUMA_NO_NODE, pte_tables, dst_pmd, mm); + spin_unlock(&mm->page_table_lock); + + return pte_offset_map(dst_pmd, addr); + + } else { + return pte_alloc_map(mm, dst_pmd, addr); + } +} + +pte_t *cpr_alloc_pte_map_lock(struct mm_struct *mm, unsigned long addr, + pmd_t *src_pmd, pmd_t *dst_pmd, spinlock_t **ptl) +{ + struct page *pte_tables[MAX_NUMNODES]; + struct page *src_pte = pmd_pgtable(*src_pmd); + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + bool pte_replicated_src = numa_pgtable_replicated(page_to_virt(src_pte)); + bool pmd_replicated_dst = numa_pgtable_replicated(dst_pmd); + + if (pte_replicated_src && pmd_replicated_dst) { + if (!pmd_none(*dst_pmd)) { + return pte_offset_map_lock(mm, dst_pmd, addr, ptl); + } + if 
(prepare_replicated_pte_tables(NUMA_NO_NODE, pte_tables, mm)) + return NULL; + + spin_lock(&mm->page_table_lock); + sync_replicated_pte_tables(NUMA_NO_NODE, pte_tables, dst_pmd, mm); + spin_unlock(&mm->page_table_lock); + + return pte_offset_map_lock(mm, dst_pmd, addr, ptl); + + } else { + return pte_alloc_map_lock(mm, dst_pmd, addr, ptl); + } +} + +pmd_t *cpr_alloc_pmd(struct mm_struct *mm, unsigned long addr, + pud_t *src_pud, pud_t *dst_pud) +{ + pmd_t *pmd_tables[MAX_NUMNODES]; + pmd_t *src_pmd = pud_pgtable(*src_pud); + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + bool pmd_replicated_src = numa_pgtable_replicated(src_pmd); + bool pud_replicated_dst = numa_pgtable_replicated(dst_pud); + + if (pmd_replicated_src && pud_replicated_dst) { + if (!pud_none(*dst_pud)) { + return pmd_offset(dst_pud, addr); + } + if (prepare_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, mm, addr)) + return NULL; + + spin_lock(&mm->page_table_lock); + sync_replicated_pmd_tables(NUMA_NO_NODE, pmd_tables, dst_pud, mm); + spin_unlock(&mm->page_table_lock); + + return pmd_offset(dst_pud, addr); + + } else { + return pmd_alloc(mm, dst_pud, addr); + } +} + +pud_t *cpr_alloc_pud(struct mm_struct *mm, unsigned long addr, + p4d_t *src_p4d, p4d_t *dst_p4d) +{ + pud_t *pud_tables[MAX_NUMNODES]; + pud_t *src_pud = p4d_pgtable(*src_p4d); + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + bool pud_replicated_src = numa_pgtable_replicated(src_pud); + bool p4d_replicated_dst = numa_pgtable_replicated(dst_p4d); + + if (pud_replicated_src && p4d_replicated_dst) { + if (!p4d_none(*dst_p4d)) { + return pud_offset(dst_p4d, addr); + } + if (prepare_replicated_pud_tables(NUMA_NO_NODE, pud_tables, mm, addr)) + return NULL; + + spin_lock(&mm->page_table_lock); + sync_replicated_pud_tables(NUMA_NO_NODE, pud_tables, dst_p4d, mm); + spin_unlock(&mm->page_table_lock); + + return pud_offset(dst_p4d, addr); + + } else { + return pud_alloc(mm, dst_p4d, addr); + } +} + +p4d_t *cpr_alloc_p4d(struct mm_struct *mm, unsigned long addr, + pgd_t *src_pgd, pgd_t *dst_pgd) +{ +#if CONFIG_PGTABLE_LEVELS == 5 + p4d_t *p4d_tables[MAX_NUMNODES]; + p4d_t *src_p4d = pgd_pgtable(*src_pgd); + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + bool p4d_replicated_src = numa_pgtable_replicated(src_p4d); + + if (p4d_replicated_src) { + if (!pgd_none(*dst_pgd)) { + return p4d_offset(dst_pgd, addr); + } + if (prepare_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, mm, addr)) + return NULL; + + spin_lock(&mm->page_table_lock); + sync_replicated_p4d_tables(NUMA_NO_NODE, p4d_tables, dst_pgd, mm); + spin_unlock(&mm->page_table_lock); + + return p4d_offset(dst_pgd, addr); + + } else { + return p4d_alloc(mm, dst_pgd, addr); + } +#else + return p4d_offset(dst_pgd, addr); +#endif +} + +static pte_t *fault_pte_alloc_map(struct vm_fault *vmf, struct mm_struct *mm, pmd_t *pmd, unsigned long address) +{ + if (vmf->replica_action != REPLICA_NONE && replication_path_pmd(vmf)) { + if (!replication_handle_pte_fault(vmf)) + return pte_offset_map(pmd, address); + else + return NULL; + } else { + return pte_alloc_map(mm, pmd, address); + } +} + +pte_t *huge_pte_alloc_replica(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, unsigned long sz) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr & 
PAGE_MASK, + .real_address = addr, + .pte = NULL + }; + + vmf.pgd = fault_pgd_offset(&vmf, addr); + vmf.p4d = fault_p4d_alloc(&vmf, mm, vmf.pgd, addr); + vmf.pud = fault_pud_alloc(&vmf, mm, vmf.p4d, addr); + if (!vmf.pud) + return NULL; + + if (sz == PUD_SIZE) { + vmf.pte = (pte_t *)vmf.pud; + } else if (sz == (CONT_PTE_SIZE)) { + vmf.pmd = fault_pmd_alloc(&vmf, mm, vmf.pud, addr); + if (!vmf.pmd) + return NULL; + + WARN_ON(addr & (sz - 1)); + /* + * Note that if this code were ever ported to the + * 32-bit arm platform then it will cause trouble in + * the case where CONFIG_HIGHPTE is set, since there + * will be no pte_unmap() to correspond with this + * pte_alloc_map(). + */ + vmf.pte = fault_pte_alloc_map(&vmf, mm, vmf.pmd, addr); + } else if (sz == PMD_SIZE) { + if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && + pud_none(READ_ONCE(*vmf.pud)) && !numa_is_vma_replicant(vma)) + vmf.pte = huge_pmd_share(mm, addr, vmf.pud); + else + vmf.pte = (pte_t *)fault_pmd_alloc(&vmf, mm, vmf.pud, addr); + } else if (sz == (CONT_PMD_SIZE)) { + vmf.pmd = fault_pmd_alloc(&vmf, mm, vmf.pud, addr); + WARN_ON(addr & (sz - 1)); + return (pte_t *)vmf.pmd; + } + + return vmf.pte; +} + +static unsigned long squared_norm(unsigned long *numa_vec) +{ + int nid; + unsigned long result = 0; + + for_each_memory_node(nid) + result += (numa_vec[nid] * numa_vec[nid]); + return result; +} + +static unsigned long numa_calculate_uniformity_value(struct numa_context_switch_stat *stats) +{ + unsigned long dot = 0; + unsigned long squared_norm_val = 0; + unsigned long sqrt = 0; + int nid; + + for_each_memory_node(nid) { + dot += stats->total_stats[nid]; + } + + squared_norm_val = squared_norm(stats->total_stats); + + if (!squared_norm_val) + return 0; + + sqrt = int_sqrt(squared_norm_val * replica_count); + + return (dot * 1000UL) / sqrt; +} + +void free_numa_replication_ctl(struct mm_struct *mm) +{ + if (!mm->replication_ctl) + return; + + if (mm->replication_ctl->in_candidate_list) { + spin_lock(&replication_candidates_lock); + + list_del(&mm->replication_ctl->replication_candidates); + + spin_unlock(&replication_candidates_lock); + } + + free_percpu(mm->replication_ctl->pcp_dereplicated_tables); + free_percpu(mm->replication_ctl->pcp_replicated_tables); + free_percpu(mm->replication_ctl->pcp_dereplicated_pages); + free_percpu(mm->replication_ctl->pcp_replicated_pages); + + kfree(mm->replication_ctl); + mm->replication_ctl = NULL; +} + +static ssize_t show_candidates(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + ssize_t total = 0; + struct numa_replication_control *ctl; + + spin_lock(&replication_candidates_lock); + list_for_each_entry(ctl, &replication_candidates, replication_candidates) { + struct task_struct *tsk; + + rcu_read_lock(); + tsk = rcu_dereference(ctl->owner->owner); + if (!tsk) { + rcu_read_unlock(); + continue; + } + + total += sprintf(buf + total, "%d\n", tsk->pid); + + rcu_read_unlock(); + } + spin_unlock(&replication_candidates_lock); + + return total; +} + +static struct kobj_attribute candidates_attr = + __ATTR(candidates, 0444, show_candidates, NULL); + +static struct attribute *numa_replication_attr[] = { + &candidates_attr.attr, + NULL, +}; + +static const struct attribute_group numa_replication_attr_group = { + .attrs = numa_replication_attr, +}; + +int numa_replication_init_sysfs(void) +{ + int err; + struct kobject *kobj = kobject_create_and_add("numa_replication", mm_kobj); + + if (unlikely(!kobj)) { + pr_err("failed to create numa_replication kobject\n"); + return 
-ENOMEM; + } + + err = sysfs_create_group(kobj, &numa_replication_attr_group); + if (err) { + pr_err("failed to register numa_replication group\n"); + goto delete_obj; + } + + return 0; + +delete_obj: + kobject_put(kobj); + return err; +} + + + +void numa_replication_add_candidate(struct mm_struct *mm) +{ + if (mm->replication_ctl->in_candidate_list) + return; + + spin_lock(&mm->replication_ctl->lock); + + /* We are already in this list */ + if (mm->replication_ctl->in_candidate_list) + goto out; + + spin_lock_nested(&replication_candidates_lock, SINGLE_DEPTH_NESTING); + + list_add(&mm->replication_ctl->replication_candidates, &replication_candidates); + + spin_unlock(&replication_candidates_lock); + + mm->replication_ctl->in_candidate_list = true; +out: + spin_unlock(&mm->replication_ctl->lock); +} + + +void numa_account_switch(struct mm_struct *mm) +{ + if (!mm->context_switch_stats) + return; + this_cpu_inc(*(mm->context_switch_stats->pcp_stats)); +} + + +void numa_accumulate_switches(struct mm_struct *mm) +{ + int cpu; + int nid; + unsigned long numa_val; + + if (!mm->context_switch_stats) + return; + + /* + * Replication is enabled, we do not need to do anything (due to this piece of work or cgroup) + * In case of races we only perform redundant calculations of replication score - not a big deal + */ + if (get_user_replication_policy(mm)) + return; + if (mm->replication_ctl->in_candidate_list) + return; + + spin_lock(&(mm->context_switch_stats->lock)); + + for_each_possible_cpu(cpu) { + unsigned long *ptr = per_cpu_ptr(mm->context_switch_stats->pcp_stats, cpu); + + mm->context_switch_stats->last_stats[cpu_to_node(cpu)] += *ptr; + *ptr = 0; + } + + for_each_memory_node(nid) { + mm->context_switch_stats->total_stats[nid] = (mm->context_switch_stats->total_stats[nid] * 7) / 10 + + mm->context_switch_stats->last_stats[nid]; + mm->context_switch_stats->last_stats[nid] = 0; + } + + numa_val = numa_calculate_uniformity_value(mm->context_switch_stats); + + spin_unlock(&(mm->context_switch_stats->lock)); + + /* + * 960 is a magic number. 
+ * Tunable and we need to revaluate it more carefully + */ + if (numa_val > 960) { + pr_info("%d New candidate for pid: %d, comm: %s, replication score: %lu\n", current->pid, mm->owner->pid, mm->owner->comm, numa_val); + + numa_replication_add_candidate(mm); + } +} + +static int replicate_alloc_pte(struct mm_struct *mm, unsigned long addr, pmd_t *pmd) +{ + struct page *pte_tables[MAX_NUMNODES]; + struct page *pte = pmd_pgtable(*pmd); + spinlock_t *ptl; + int nid = page_to_nid(pte); + + pte_tables[nid] = pte; + /* + * Because tt structure of src and dst mm must be the same, + * it doesn't matter which pgtable check for being replicated + */ + if (numa_pgtable_replicated(page_to_virt(pte))) { + return 0; + } + + if (prepare_replicated_pte_tables(nid, pte_tables, mm)) + return -ENOMEM; + + ptl = pmd_lock(mm, pmd); + sync_replicated_pte_tables(nid, pte_tables, pmd, mm); + spin_unlock(ptl); + + return 0; +} + +static unsigned long numa_replicate_pgtables_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end) +{ + pmd_t *pmd; + unsigned long next, last = addr; + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + + if (pmd_none(*pmd) || is_swap_pmd(*pmd) || pmd_devmap(*pmd) || pmd_trans_huge(*pmd)) + continue; + + if (replicate_alloc_pte(vma->vm_mm, addr, pmd)) + break; + + cond_resched(); + } while (pmd++, last = next, addr = next, addr != end); + + return last; +} + +static int replicate_alloc_pmd(struct mm_struct *mm, unsigned long addr, pud_t *pud) +{ + pmd_t *pmd_tables[MAX_NUMNODES]; + pmd_t *pmd = pud_pgtable(*pud); + spinlock_t *ptl; + int nid = page_to_nid(virt_to_page(pmd)); + + pmd_tables[nid] = pmd; + + if (numa_pgtable_replicated(pmd)) + return 0; + + if (prepare_replicated_pmd_tables(nid, pmd_tables, mm, addr)) + return -ENOMEM; + + ptl = pud_lock(mm, pud); + sync_replicated_pmd_tables(nid, pmd_tables, pud, mm); + spin_unlock(ptl); + + return 0; + +} + +static unsigned long numa_replicate_pgtables_pud_range(struct vm_area_struct *vma, p4d_t *p4d, + unsigned long addr, unsigned long end) +{ + pud_t *pud; + unsigned long next, last = addr; + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + + if (pud_none_or_clear_bad(pud)) + continue; + + if (replicate_alloc_pmd(vma->vm_mm, addr, pud)) + break; + + last = numa_replicate_pgtables_pmd_range(vma, pud, addr, next); + + if (last != next) + break; + + } while (pud++, last = next, addr = next, addr != end); + + return last; +} + +static int replicate_alloc_pud(struct mm_struct *mm, unsigned long addr, p4d_t *p4d) +{ + pud_t *pud_tables[MAX_NUMNODES]; + pud_t *pud = p4d_pgtable(*p4d); + int nid = page_to_nid(virt_to_page(pud)); + + pud_tables[nid] = pud; + + if (numa_pgtable_replicated(pud)) + return 0; + + if (prepare_replicated_pud_tables(nid, pud_tables, mm, addr)) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + sync_replicated_pud_tables(nid, pud_tables, p4d, mm); + spin_unlock(&mm->page_table_lock); + + return 0; + +} + +static unsigned long numa_replicate_pgtables_p4d_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end) +{ + p4d_t *p4d; + unsigned long next, last = addr; + + p4d = p4d_offset(pgd, addr); + + + do { + next = p4d_addr_end(addr, end); + + if (p4d_none_or_clear_bad(p4d)) + continue; + + if (replicate_alloc_pud(vma->vm_mm, addr, p4d)) + break; + + last = numa_replicate_pgtables_pud_range(vma, p4d, addr, next); + + if (last != next) + break; + + } while (p4d++, last = next, addr = next, addr != end); 
+ + return last; +} + +static int replicate_alloc_p4d(struct mm_struct *mm, unsigned long addr, pgd_t *pgd) +{ +#if CONFIG_PGTABLE_LEVELS == 5 + p4d_t *p4d_tables[MAX_NUMNODES]; + p4d_t *p4d = pgd_pgtable(*pgd); + int nid = page_to_nid(virt_to_page(p4d)); + + p4d_tables[nid] = p4d; + + if (numa_pgtable_replicated(p4d)) + return 0; + + if (prepare_replicated_p4d_tables(nid, p4d_tables, mm, addr)) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + sync_replicated_p4d_tables(nid, p4d_tables, pgd, mm); + spin_unlock(&mm->page_table_lock); + +#endif + return 0; + +} + +static unsigned long numa_replicate_pgtables_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + unsigned long next; + unsigned long last = addr; + + BUG_ON(addr >= end); + + pgd = pgd_offset_pgd(this_node_pgd(mm), addr); + + do { + next = pgd_addr_end(addr, end); + + if (pgd_none_or_clear_bad(pgd)) + continue; + + if (replicate_alloc_p4d(vma->vm_mm, addr, pgd)) + break; + + last = numa_replicate_pgtables_p4d_range(vma, pgd, addr, next); + + if (last != next) + break; + + } while (pgd++, last = next, addr = next, addr != end); + + return last; + +} + +int numa_replicate_pgtables_vma(struct vm_area_struct *vma) +{ + unsigned long last = numa_replicate_pgtables_pgd_range(vma, vma->vm_start, vma->vm_end); + + if (last != vma->vm_end) + return -ENOMEM; + + flush_tlb_range(vma, vma->vm_start, vma->vm_end); + + return 0; + +} + +static void dereplicate_pgtables_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + pmd_t *curr_pmd; + pte_t *curr_pte; + spinlock_t *lock; + pgtable_t token = pmd_pgtable(*pmd)->master_table; + bool pte_replicated = numa_pgtable_replicated(page_to_virt(token)); + + if (!pte_replicated) + return; + + lock = pmd_lock(tlb->mm, pmd); + + pmd_populate(tlb->mm, pmd, token); + + for_each_pgtable_replica(curr, curr_pmd, pmd, offset) { + pmd_populate(tlb->mm, curr_pmd, token); + } + + spin_unlock(lock); + + for_each_pgtable_replica_safe(curr, tmp, curr_pte, page_to_virt(token), offset) { + memcg_account_dereplicated_pgtable_page(curr_pte); + cleanup_pte_list(curr); + pte_free_tlb(tlb, curr, addr); + mm_dec_nr_ptes(tlb->mm); + } + memcg_account_dereplicated_pgtable_page(page_to_virt(token)); + account_dereplicated_table(tlb->mm); + cleanup_pte_list(token); + ClearPageReplicated(token); +} + +static inline void __free_pgtables_replica_pmd(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + pud_t *curr_pud; + pmd_t *curr_pmd; + spinlock_t *lock; + pmd_t *pmd = get_master_pmd(pmd_offset(pud, addr)); + + lock = pud_lock(tlb->mm, pud); + + pud_populate(tlb->mm, pud, pmd); + + + for_each_pgtable_replica(curr, curr_pud, pud, offset) { + pud_populate(tlb->mm, curr_pud, pmd); + } + + spin_unlock(lock); + + for_each_pgtable_replica_safe(curr, tmp, curr_pmd, pmd, offset) { + memcg_account_dereplicated_pgtable_page(curr_pmd); + cleanup_pmd_list(curr); + pmd_free_tlb(tlb, curr_pmd, addr); + mm_dec_nr_pmds(tlb->mm); + } + memcg_account_dereplicated_pgtable_page(pmd); + account_dereplicated_table(tlb->mm); + cleanup_pmd_list(virt_to_page(pmd)); + ClearPageReplicated(virt_to_page(pmd)); + +} + +static inline void dereplicate_pgtables_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pmd_t *pmd; + unsigned long next; + unsigned long 
start; + + start = addr; + pmd = pmd_offset(pud, addr); + + if (!numa_pgtable_replicated(pmd)) + return; + + do { + next = pmd_addr_end(addr, end); + if (pmd_none(*pmd) || is_swap_pmd(*pmd) || pmd_devmap(*pmd) || pmd_trans_huge(*pmd)) + continue; + dereplicate_pgtables_pte_range(tlb, pmd, addr); + } while (pmd++, addr = next, addr != end); + + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + __free_pgtables_replica_pmd(tlb, pud, start); +} + +static inline void __free_pgtables_replica_pud(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + unsigned long offset; + struct page *curr, *tmp; + p4d_t *curr_p4d; + pud_t *curr_pud; + pud_t *pud = get_master_pud(pud_offset(p4d, addr)); + + spin_lock(&tlb->mm->page_table_lock); + + p4d_populate(tlb->mm, p4d, pud); + + for_each_pgtable_replica(curr, curr_p4d, p4d, offset) { + p4d_populate(tlb->mm, curr_p4d, pud); + } + + spin_unlock(&tlb->mm->page_table_lock); + + for_each_pgtable_replica_safe(curr, tmp, curr_pud, pud, offset) { + memcg_account_dereplicated_pgtable_page(curr_pud); + cleanup_pud_list(curr); + pud_free_tlb(tlb, curr_pud, addr); + mm_dec_nr_puds(tlb->mm); + } + memcg_account_dereplicated_pgtable_page(pud); + account_dereplicated_table(tlb->mm); + cleanup_pud_list(virt_to_page(pud)); + ClearPageReplicated(virt_to_page(pud)); + +} + +static inline void dereplicate_pgtables_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pud_t *pud; + unsigned long next; + unsigned long start; + + start = addr; + pud = pud_offset(p4d, addr); + + if (!numa_pgtable_replicated(pud)) + return; + + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + continue; + dereplicate_pgtables_pmd_range(tlb, pud, addr, next, floor, ceiling); + } while (pud++, addr = next, addr != end); + + start &= P4D_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= P4D_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + __free_pgtables_replica_pud(tlb, p4d, start); +} + +static inline void dereplicate_pgtables_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + p4d_t *p4d; + unsigned long next; + unsigned long start; + + start = addr; + p4d = p4d_offset(pgd, addr); + + if (!numa_pgtable_replicated(p4d)) + return; + + do { + next = p4d_addr_end(addr, end); + if (p4d_none_or_clear_bad(p4d)) + continue; + dereplicate_pgtables_pud_range(tlb, p4d, addr, next, floor, ceiling); + } while (p4d++, addr = next, addr != end); + + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + return; + + /* TODO + __free_pgtables_replica_p4d(tlb, pgd, start); + */ +} + +/* + * This function frees user-level page tables of a process. 
+ */ +static inline void dereplicate_pgtables_pgd_range(struct mmu_gather *tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) +{ + pgd_t *pgd; + unsigned long next; + + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + tlb_change_page_size(tlb, PAGE_SIZE); + pgd = pgd_offset(tlb->mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + continue; + dereplicate_pgtables_p4d_range(tlb, pgd, addr, next, floor, ceiling); + } while (pgd++, addr = next, addr != end); +} + +void dereplicate_pgtables(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + unsigned long start = 0; + unsigned long end = mm->mmap_base; + + tlb_gather_mmu(&tlb, mm, start, end); + + down_write(&mm->replication_ctl->rmap_lock); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + BUG_ON(vma_has_replicas(vma)); + vma->vm_flags &= ~(VM_REPLICA_INIT); + dereplicate_pgtables_pgd_range(&tlb, vma->vm_start, vma->vm_end, FIRST_USER_ADDRESS, + vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + } + + tlb_finish_mmu(&tlb, start, end); + + up_write(&mm->replication_ctl->rmap_lock); + +} + +static inline struct vm_area_struct *find_next_replicant_vma(struct vm_area_struct *vma) +{ + if (!vma) + return NULL; + vma = vma->vm_next; + while (vma && !numa_is_vma_replicant(vma)) { + vma = vma->vm_next; + } + return vma; +} + +static inline struct vm_area_struct *find_first_replicant_vma(struct mm_struct *mm) +{ + struct vm_area_struct *vma = mm->mmap; + + while (vma && !numa_is_vma_replicant(vma)) { + vma = vma->vm_next; + } + return vma; +} + +void dereplicate_rw_pgtables(struct mm_struct *mm) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma, *prev, *next; + unsigned long start = 0; + unsigned long end = mm->mmap_base; + + tlb_gather_mmu(&tlb, mm, start, end); + + down_write(&mm->replication_ctl->rmap_lock); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!vma_replica_candidate(vma)) { + BUG_ON(vma_has_replicas(vma)); + vma->vm_flags &= ~(VM_REPLICA_INIT); + } + } + + next = find_first_replicant_vma(mm); + prev = NULL; + + do { + dereplicate_pgtables_pgd_range(&tlb, + prev ? prev->vm_end : FIRST_USER_ADDRESS, + next ? next->vm_start : mm->mmap_base, + prev ? prev->vm_end : FIRST_USER_ADDRESS, + next ? 
next->vm_start : USER_PGTABLES_CEILING); + + } while (prev = next, next = find_next_replicant_vma(next), prev != NULL); + + + tlb_finish_mmu(&tlb, start, end); + + up_write(&mm->replication_ctl->rmap_lock); +} + +static int numa_mm_table_replication_none(struct mm_struct *mm) +{ + int ret = 0; + + mmap_write_lock(mm); + + switch (get_table_replication_policy(mm)) { + case TABLE_REPLICATION_NONE: { + goto out; + } + case TABLE_REPLICATION_MINIMAL: + case TABLE_REPLICATION_ALL: { + dereplicate_pgtables(mm); + break; + } + default: { + BUG(); + } + } + set_table_replication_policy(mm, TABLE_REPLICATION_NONE); +out: + mmap_write_unlock(mm); + return ret; +} + +static int numa_mm_table_replication_minimal_from_all(struct mm_struct *mm) +{ + dereplicate_rw_pgtables(mm); + return 0; +} + +static int numa_mm_table_replication_minimal_from_none(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = 0; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_replica_candidate(vma)) { + vma->vm_flags |= VM_REPLICA_INIT; + + ret = numa_replicate_pgtables_vma(vma); + if (ret) + return ret; + } + } + + numa_replication_remove_from_candidate_list(mm); + return ret; +} + +static int numa_mm_table_replication_minimal(struct mm_struct *mm) +{ + int ret = 0; + + mmap_write_lock(mm); + + switch (get_table_replication_policy(mm)) { + case TABLE_REPLICATION_NONE: { + ret = numa_mm_table_replication_minimal_from_none(mm); + break; + } + case TABLE_REPLICATION_MINIMAL: { + goto out; + } + case TABLE_REPLICATION_ALL: { + ret = numa_mm_table_replication_minimal_from_all(mm); + break; + } + default: { + BUG(); + } + } + set_table_replication_policy(mm, TABLE_REPLICATION_MINIMAL); +out: + mmap_write_unlock(mm); + + return ret; +} + +static int numa_mm_table_replication_all(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = 0; + + mmap_write_lock(mm); + + if (get_table_replication_policy(mm) == TABLE_REPLICATION_ALL) { + goto out; + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!numa_is_vma_replicant(vma)) { + vma->vm_flags |= VM_REPLICA_INIT; + + ret = numa_replicate_pgtables_vma(vma); + if (ret) + goto out; + } + } + + set_table_replication_policy(mm, TABLE_REPLICATION_ALL); + numa_replication_remove_from_candidate_list(mm); +out: + mmap_write_unlock(mm); + + return ret; +} + +/* CMD: + * 0 - no table replication + * 1 - replicate minimal amount to support RO-data replication + * 2 - replicate all tables + * + * Transitions between all states are supported, + */ +int numa_dispatch_table_replication_request(struct mm_struct *mm, int cmd) +{ + switch (cmd) { + case TABLE_REPLICATION_NONE: + return numa_mm_table_replication_none(mm); + case TABLE_REPLICATION_MINIMAL: + return numa_mm_table_replication_minimal(mm); + case TABLE_REPLICATION_ALL: + return numa_mm_table_replication_all(mm); + default: + return -EINVAL; + } +} + +static int numa_mm_disable_replication(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + int ret = 0; + + mmap_write_lock(mm); + + if (get_data_replication_policy(mm) == DATA_REPLICATION_NONE) { + goto out; + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_has_replicas(vma)) { + ret = phys_deduplicate(vma, vma->vm_start, vma->vm_end - vma->vm_start, true); + if (ret) + goto out; + + vma->vm_flags &= ~VM_REPLICA_COMMIT; + } + } + + set_data_replication_policy(mm, DATA_REPLICATION_NONE); +out: + mmap_write_unlock(mm); + + return ret; + +} + +static int numa_mm_on_demand_data_replication(struct mm_struct *mm) +{ + struct 
vm_area_struct *vma; + int ret = 0; + + mmap_write_lock(mm); + + if (get_table_replication_policy(mm) == TABLE_REPLICATION_NONE) { + ret = -EINVAL; + goto out; + } + + if (get_data_replication_policy(mm) != DATA_REPLICATION_NONE) { + goto out; + } + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_might_be_replicated(vma)) { + vma->vm_flags |= VM_REPLICA_COMMIT; + } + } + + set_data_replication_policy(mm, DATA_REPLICATION_ON_DEMAND); +out: + mmap_write_unlock(mm); + + return ret; +} + +static int numa_mm_all_data_replication(struct mm_struct *mm, data_replication_policy_t policy) +{ + struct vm_area_struct *vma; + int ret = 0; + + mmap_write_lock(mm); + + if (get_table_replication_policy(mm) == TABLE_REPLICATION_NONE) { + ret = -EINVAL; + goto out; + } + + if (get_data_replication_policy(mm) == policy) + goto out; + + if (get_data_replication_policy(mm) == DATA_REPLICATION_ALL + && policy == DATA_REPLICATION_ALL_MAPPED_ON_DEMAND) + goto out_set; + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma_might_be_replicated(vma)) { + vma->vm_flags |= VM_REPLICA_COMMIT; + ret = phys_duplicate(vma, vma->vm_start, vma->vm_end - vma->vm_start); + if (ret) + break; + } + } + +out_set: + set_data_replication_policy(mm, policy); +out: + mmap_write_unlock(mm); + + return ret; +} + + +/* CMD: + * 0 - no data replicas at all + * 1 - replicas are created on demand via numa balancer + * 2 - all mapped ro-data will be replicated, new ro-data will be replicated on demand + * 3 - all ro-data always replicated. + * + * All transitions are supported, however without rollback + * (because it doesn't make any sense) and sometimes they won't do anything + * meaningful (for example, 2 -> 1 doesn't make sense, but it will work anyway) + * If we are starting application from cgroup, some options are the same + * (for new process states 2 and 1 are identical) + */ +int numa_dispatch_data_replication_request(struct mm_struct *mm, int cmd) +{ + switch (cmd) { + case DATA_REPLICATION_NONE: + return numa_mm_disable_replication(mm); + case DATA_REPLICATION_ON_DEMAND: + return numa_mm_on_demand_data_replication(mm); + case DATA_REPLICATION_ALL_MAPPED_ON_DEMAND: + return numa_mm_all_data_replication(mm, DATA_REPLICATION_ALL_MAPPED_ON_DEMAND); + case DATA_REPLICATION_ALL: + return numa_mm_all_data_replication(mm, DATA_REPLICATION_ALL); + default: + return -EINVAL; + } +} + +void numa_replication_post_mprotect(struct vm_area_struct *vma) +{ + if (numa_is_vma_replicant(vma)) { + numa_replicate_pgtables_vma(vma); + } + if (vma_might_be_replicated(vma) && get_data_replication_policy(vma->vm_mm) == DATA_REPLICATION_ALL) { + phys_duplicate(vma, vma->vm_start, vma->vm_end - vma->vm_start); + } + +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3bdc6aa73c7c2..09ea3a992336d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -161,6 +161,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { [N_CPU] = { { [0] = 1UL } }, #endif /* NUMA */ }; + +nodemask_t __read_mostly replica_nodes = { { [0] = 1UL } }; + EXPORT_SYMBOL(node_states); atomic_long_t _totalram_pages __read_mostly; @@ -5482,13 +5485,11 @@ unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, return 0; return (unsigned long) page_address(page); } -EXPORT_SYMBOL(__get_free_pages_node); unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask) { return __get_free_pages_node(nid, gfp_mask | __GFP_ZERO, 0); } -EXPORT_SYMBOL(get_zeroed_page_node); #endif /* CONFIG_KERNEL_REPLICATION */ void __free_pages(struct page 
*page, unsigned int order) diff --git a/mm/page_idle.c b/mm/page_idle.c index edead6a8a5f91..dd7b9cc36b4f5 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -12,6 +12,7 @@ #include <linux/mmu_notifier.h> #include <linux/page_ext.h> #include <linux/page_idle.h> +#include <linux/numa_user_replication.h> #define BITMAP_CHUNK_SIZE sizeof(u64) #define BITMAP_CHUNK_BITS (BITMAP_CHUNK_SIZE * BITS_PER_BYTE) @@ -62,7 +63,7 @@ static bool page_idle_clear_pte_refs_one(struct page *page, * For PTE-mapped THP, one sub page is referenced, * the whole THP is referenced. */ - if (ptep_clear_young_notify(vma, addr, pvmw.pte)) + if (ptep_clear_young_notify_replicated(vma, addr, pvmw.pte)) referenced = true; } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { if (pmdp_clear_young_notify(vma, addr, pvmw.pmd)) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 610ebbee787cc..2bcb9ea753b9e 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -4,6 +4,7 @@ #include <linux/hugetlb.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/numa_user_replication.h> #include "internal.h" @@ -190,7 +191,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) goto next_pte; restart: do { - pgd = pgd_offset(mm, pvmw->address); + pgd = pgd_offset_pgd(this_node_pgd(mm), pvmw->address); if (!pgd_present(*pgd)) { step_forward(pvmw, PGDIR_SIZE); continue; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf97948..a773b7c57693a 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/hugetlb.h> #include <linux/pgtable.h> +#include <linux/numa_user_replication.h> #include <asm/tlb.h> /* @@ -161,7 +162,11 @@ pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(pmd_lockptr(mm, pmdp)); + /* + * It will be just fine. + * Trust me. 
+ */ + // assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ if (!pmd_huge_pte(mm, pmdp)) @@ -177,8 +182,8 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - - assert_spin_locked(pmd_lockptr(mm, pmdp)); + /* Same as in the previous one */ + // assert_spin_locked(pmd_lockptr(mm, pmdp)); /* FIFO */ pgtable = pmd_huge_pte(mm, pmdp); diff --git a/mm/rmap.c b/mm/rmap.c index 150803a7ffb5b..1f7bd740d8ba7 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -72,6 +72,7 @@ #include <linux/page_idle.h> #include <linux/memremap.h> #include <linux/userfaultfd_k.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> @@ -795,7 +796,7 @@ static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (pvmw.pte) { - if (ptep_clear_flush_young_notify(vma, address, + if (ptep_clear_flush_young_notify_replicated(vma, address, pvmw.pte)) { /* * Don't treat a reference through @@ -937,10 +938,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue; flush_cache_page(vma, address, pte_pfn(*pte)); - entry = ptep_clear_flush(vma, address, pte); + entry = ptep_clear_flush_replicated(vma, address, pte); entry = pte_wrprotect(entry); entry = pte_mkclean(entry); - set_pte_at(vma->vm_mm, address, pte, entry); + set_pte_at_replicated(vma->vm_mm, address, pte, entry); ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -951,10 +952,10 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, continue; flush_cache_page(vma, address, page_to_pfn(page)); - entry = pmdp_invalidate(vma, address, pmd); + entry = pmdp_invalidate_replicated(vma, address, pmd); entry = pmd_wrprotect(entry); entry = pmd_mkclean(entry); - set_pmd_at(vma->vm_mm, address, pmd, entry); + set_pmd_at_replicated(vma->vm_mm, address, pmd, entry); ret = 1; #else /* unexpected pmd-mapped page? */ @@ -1345,6 +1346,8 @@ void page_remove_rmap(struct page *page, bool compound) { lock_page_memcg(page); + BUG_ON(PageReplicated(compound_head(page))); + if (!PageAnon(page)) { page_remove_file_rmap(page, compound); goto out; @@ -1529,7 +1532,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_t entry; pte_t swp_pte; - pteval = ptep_get_and_clear(mm, pvmw.address, pvmw.pte); + pteval = ptep_get_and_clear_replicated(mm, pvmw.address, pvmw.pte); /* * Store the pfn of the page in a special migration @@ -1547,7 +1550,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_swp_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, pvmw.address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1574,11 +1577,11 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * transition on a cached TLB entry is written through * and traps if the PTE is unmapped. */ - pteval = ptep_get_and_clear(mm, address, pvmw.pte); + pteval = ptep_get_and_clear_replicated(mm, address, pvmw.pte); set_tlb_ubc_flush_pending(mm, pte_dirty(pteval), address); } else { - pteval = ptep_clear_flush(vma, address, pvmw.pte); + pteval = ptep_clear_flush_replicated(vma, address, pvmw.pte); } /* Move the dirty bit to the page. Now the pte is gone. 
*/ @@ -1597,8 +1600,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, vma_mmu_pagesize(vma)); } else { dec_mm_counter(mm, mm_counter(page)); + reliable_page_counter(page, mm, -1); - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); } } else if (pte_unused(pteval) && !userfaultfd_armed(vma)) { @@ -1623,7 +1627,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, pte_t swp_pte; if (arch_unmap_one(mm, vma, address, pteval) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -1641,7 +1645,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, address, pvmw.pte, swp_pte); /* * No need to invalidate here it will synchronize on * against the special swap migration pte. @@ -1701,7 +1705,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If the page was redirtied, it cannot be * discarded. Remap the page to page table. */ - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); SetPageSwapBacked(page); ret = false; page_vma_mapped_walk_done(&pvmw); @@ -1709,13 +1713,13 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, } if (swap_duplicate(entry) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; } if (arch_unmap_one(mm, vma, address, pteval) < 0) { - set_pte_at(mm, address, pvmw.pte, pteval); + set_pte_at_replicated(mm, address, pvmw.pte, pteval); ret = false; page_vma_mapped_walk_done(&pvmw); break; @@ -1734,7 +1738,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_pte = pte_swp_mksoft_dirty(swp_pte); if (pte_uffd_wp(pteval)) swp_pte = pte_swp_mkuffd_wp(swp_pte); - set_pte_at(mm, address, pvmw.pte, swp_pte); + set_pte_at_replicated(mm, address, pvmw.pte, swp_pte); /* Invalidate as we cleared the pte */ mmu_notifier_invalidate_range(mm, address, address + PAGE_SIZE); @@ -1921,13 +1925,33 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); +#ifdef CONFIG_USER_REPLICATION + down_read(&vma->vm_mm->replication_ctl->rmap_lock); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + continue; + } + + if (!rwc->rmap_one(page, vma, address, rwc->arg)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + break; + } + if (rwc->done && rwc->done(page)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + break; + } + up_read(&vma->vm_mm->replication_ctl->rmap_lock); +#else if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; + if (rwc->done && rwc->done(page)) break; +#endif } if (!locked) @@ -1975,14 +1999,34 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); +#ifdef CONFIG_USER_REPLICATION + down_read(&vma->vm_mm->replication_ctl->rmap_lock); + + if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) { + 
up_read(&vma->vm_mm->replication_ctl->rmap_lock); + continue; + } + if (!rwc->rmap_one(page, vma, address, rwc->arg)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + goto done; + } + if (rwc->done && rwc->done(page)) { + up_read(&vma->vm_mm->replication_ctl->rmap_lock); + goto done; + } + up_read(&vma->vm_mm->replication_ctl->rmap_lock); +#else if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; + if (rwc->done && rwc->done(page)) goto done; +#endif + } done: diff --git a/mm/share_pool.c b/mm/share_pool.c index 80a36e792965e..93e7946b9c980 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -50,6 +50,7 @@ #include <linux/timekeeping.h> #include <linux/time64.h> #include <linux/pagewalk.h> +#include <linux/numa_user_replication.h> #define spg_valid(spg) ((spg)->is_alive == true) @@ -4104,7 +4105,7 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, haddr, ptep, new_pte); + set_huge_pte_at_replicated(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); diff --git a/mm/shmem.c b/mm/shmem.c index 9cb612d1153bf..875051faa9778 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include <linux/hugetlb.h> #include <linux/frontswap.h> #include <linux/fs_parser.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -2481,7 +2482,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, inc_mm_counter(dst_mm, mm_counter_file(page)); reliable_page_counter(page, dst_mm, 1); page_add_file_rmap(page, false); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); diff --git a/mm/swap.c b/mm/swap.c index c37fac5a73e8d..5b6759914efd3 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -921,7 +921,6 @@ void release_pages(struct page **pages, int nr) } __ClearPageWaiters(page); - list_add(&page->lru, &pages_to_free); } if (lruvec) diff --git a/mm/swap_state.c b/mm/swap_state.c index 69d71c4be7b88..2d8adb4e13ede 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -343,8 +343,9 @@ void free_pages_and_swap_cache(struct page **pages, int nr) int i; lru_add_drain(); - for (i = 0; i < nr; i++) + for (i = 0; i < nr; i++) { free_swap_cache(pagep[i]); + } release_pages(pagep, nr); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 07f50d5f5bb77..fc6fe524a2cbb 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -40,6 +40,7 @@ #include <linux/swap_slots.h> #include <linux/sort.h> #include <linux/completion.h> +#include <linux/numa_user_replication.h> #include <asm/tlbflush.h> #include <linux/swapops.h> @@ -2048,7 +2049,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, + set_pte_at_replicated(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); reliable_page_counter(page, vma->vm_mm, 1); diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9d462ffa0157b..1dc00abb91bf4 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -16,6 +16,8 @@ #include <linux/hugetlb.h> #include <linux/shmem_fs.h> #include <linux/userswap.h> +#include <linux/numa_user_replication.h> + #include 
<asm/tlbflush.h> #include "internal.h" @@ -130,7 +132,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, page_add_new_anon_rmap(page, dst_vma, dst_addr, false); lru_cache_add_inactive_or_unevictable(page, dst_vma); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); @@ -172,7 +174,7 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm, ret = -EEXIST; if (!pte_none(*dst_pte)) goto out_unlock; - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + set_pte_at_replicated(dst_mm, dst_addr, dst_pte, _dst_pte); /* No need to invalidate - it was non-present before */ update_mmu_cache(dst_vma, dst_addr, dst_pte); ret = 0; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 01bfe4131a53a..2a49f3fe0e356 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -10,7 +10,6 @@ #include <linux/vmalloc.h> #include <linux/mm.h> -#include <linux/numa_replication.h> #include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> @@ -40,6 +39,8 @@ #include <linux/hugetlb.h> #include <linux/share_pool.h> #include <linux/pbha.h> +#include <linux/numa_kernel_replication.h> + #include <asm/io.h> #include <asm/tlbflush.h> #include <asm/shmparam.h> @@ -2655,7 +2656,6 @@ static void vm_account_replicated_range(struct vm_struct *area, list_for_each_entry(cursor, &page->lru, lru) { unsigned long addr = (unsigned long)page_address(cursor); - if (addr) { unsigned long page_size; @@ -3106,7 +3106,6 @@ static int vmalloc_map_area_pages(unsigned long addr, unsigned long size, if (area->flags & VM_NUMA_SHARED) { for_each_memory_node(nid) { pgd_t *pgd = per_node_pgd(&init_mm, nid); - ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, gfp_mask, prot, page_shift, pgd); if (ret) -- 2.34.1
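
Note for reviewers: the hunks above consistently replace the bare page-table primitives (set_pte_at(), ptep_clear_flush(), pmdp_invalidate(), ...) with *_replicated wrappers declared in include/linux/numa_user_replication.h, which is not shown in this excerpt. The fragment below is only a minimal sketch of the intended semantics, under the assumption that each memory node can hold its own replica of the process page tables; replication_enabled_for() and replica_pte_offset() are hypothetical names used purely for illustration and do not appear in the patch.

/*
 * Illustrative sketch only -- not the implementation from this patch.
 * The real helpers live in include/linux/numa_user_replication.h.
 * replication_enabled_for() and replica_pte_offset() are hypothetical.
 */
static inline void set_pte_at_replicated_sketch(struct mm_struct *mm,
						unsigned long addr,
						pte_t *ptep, pte_t pte)
{
	int nid;

	/* Always update the page-table entry the caller handed us. */
	set_pte_at(mm, addr, ptep, pte);

	if (!replication_enabled_for(mm))	/* hypothetical predicate */
		return;

	/* Mirror the update into every per-node replica of the tables. */
	for_each_memory_node(nid) {
		/* hypothetical lookup of this address in node nid's replica */
		pte_t *rep_ptep = replica_pte_offset(mm, nid, addr);

		if (rep_ptep && rep_ptep != ptep)
			set_pte_at(mm, addr, rep_ptep, pte);
	}
}

The same idea is presumably why rmap_walk_anon() and rmap_walk_file() now take mm->replication_ctl->rmap_lock for read around each rmap_one() callback: the callbacks update PTEs through these wrappers, so the walk must not race with a replica table being created or torn down.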