
kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC2BJZ

-------------------------------------------------

In the current design, modifications of mutable kernel data do not
require synchronization between the translation tables, because on
64-bit platforms all physical memory is already mapped in kernel space
and this mapping is persistent. In user space, translation table
synchronization is rare: the only case is a new PUD/P4D allocation.
At the current stage only the PGD layer is replicated for user space.
Please refer to the pictures below.

TT overview:

             NODE 0                       NODE 1
        USER     KERNEL              USER     KERNEL
     ---------------------        ---------------------
 PGD | | | | | | | | | |*|        | | | | | | | | | |*|
     ---------------------        ---------------------
                        |                            |
          --------------            ----------------
         |                         |
     ---------------------        ---------------------
 PUD | | | | | | | |*|*| |        | | | | | | | |*|*| |
     ---------------------        ---------------------
                      |                          |
          ------------              ------------
         |                         |
     ---------------------        ---------------------
 PMD |READ-ONLY|MUTABLE  |        |READ-ONLY|MUTABLE  |
     ---------------------        ---------------------
         |        |                    |        |
         |         --------------------------   |
         |                    |                  |
      --------             -------           --------
 PHYS |      |             |     |           |      |
 MEM  --------             -------           --------
      <------>                               <------>
       NODE 0              Shared             NODE 1
                       between nodes

 * - entries unique in each table

TT synchronization:

             NODE 0                       NODE 1
        USER     KERNEL              USER     KERNEL
     ---------------------        ---------------------
 PGD | | |0| | | | | | | |        | | |0| | | | | | | |
     ---------------------        ---------------------
          |                            |
          |    PUD_ALLOC / P4D_ALLOC   |
          |       IN USERSPACE         |
                      \/
     ---------------------        ---------------------
 PGD | | |p| | | | | | | |        | | |p| | | | | | | |
     ---------------------        ---------------------
          |                            |
          |     -----------------------
          |    |
        ---------------------
 PUD/P4D| | | | | | | | | | |
        ---------------------

Acked-by: Alexander Grubnikov <alexander.grubnikov@huawei.com>
Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
Acked-by: Denis Darvish <darvish.denis@huawei.com>
Co-developed-by: Artem Kuzin <artem.kuzin@huawei.com>
Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com>
Co-developed-by: Nikita Panov <panov.nikita@huawei.com>
Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig    |   1 +
 arch/arm64/include/asm/extable.h          |   2 +
 arch/arm64/include/asm/memory.h           |  15 +
 arch/arm64/include/asm/mmu_context.h      |  28 +-
 arch/arm64/include/asm/numa_replication.h |  56 ++
 arch/arm64/include/asm/pgtable.h          |  15 +
 arch/arm64/kernel/alternative.c           |  46 +-
 arch/arm64/kernel/cpufeature.c            |   5 +
 arch/arm64/kernel/hibernate.c             |   5 +
 arch/arm64/kernel/insn.c                  |  63 +-
 arch/arm64/kernel/module.c                |  38 +-
 arch/arm64/kernel/sleep.S                 |   6 +
 arch/arm64/kernel/smp.c                   |   7 +
 arch/arm64/kernel/suspend.c               |   6 +
 arch/arm64/kernel/vmlinux.lds.S           |  10 +
 arch/arm64/mm/context.c                   |   3 +-
 arch/arm64/mm/init.c                      |  40 +-
 arch/arm64/mm/kasan_init.c                |  21 +-
 arch/arm64/mm/mmu.c                       |  74 ++-
 arch/arm64/mm/pageattr.c                  |  66 +++
 arch/arm64/mm/pgd.c                       |  68 +++
 arch/arm64/mm/ptdump.c                    |  17 +-
 drivers/firmware/efi/arm-runtime.c        |  15 +-
 drivers/firmware/efi/libstub/arm64-stub.c |  37 +-
 include/asm-generic/pgalloc.h             |  47 ++
 include/asm-generic/pgtable-nop4d.h       |   5 +
 include/asm-generic/set_memory.h          |  12 +
 include/linux/gfp.h                       |   5 +
 include/linux/mm.h                        |  81 ++-
 include/linux/mm_types.h                  |   9 +-
 include/linux/module.h                    |  15 +-
 include/linux/moduleloader.h              |  11 +
 include/linux/numa_replication.h          | 104 ++++
 include/linux/vmalloc.h                   |  21 +
 init/main.c                               |  11 +
 kernel/bpf/bpf_struct_ops.c               |   9 +-
 kernel/bpf/trampoline.c                   |   2 +-
 kernel/module.c                           | 123 ++--
mm/Kconfig | 10 + mm/Makefile | 2 + mm/memory.c | 254 +++++++- mm/numa_replication.c | 681 ++++++++++++++++++++++ mm/page_alloc.c | 20 + mm/vmalloc.c | 588 ++++++++++++++----- 44 files changed, 2396 insertions(+), 258 deletions(-) create mode 100644 arch/arm64/include/asm/numa_replication.h create mode 100644 include/linux/numa_replication.h create mode 100644 mm/numa_replication.c diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 5449de73fbbc7..fbff0cd74bd3a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1186,6 +1186,7 @@ CONFIG_EXTEND_HUGEPAGE_MAPPING=y CONFIG_MEM_SAMPLING=y CONFIG_NUMABALANCING_MEM_SAMPLING=y # CONFIG_THP_NUMA_CONTROL is not set +# CONFIG_KERNEL_REPLICATION is not set # # Data Access Monitoring diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 35c70cc4e9c54..0a48478d51863 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -2,6 +2,8 @@ #ifndef __ASM_EXTABLE_H #define __ASM_EXTABLE_H +#include <asm/pgtable.h> + /* * The exception table consists of pairs of relative offsets: the first * is the relative offset to an instruction that is allowed to fault, diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 644cfa3284a7b..cb7c21d7647ab 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -44,6 +44,8 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) + +#ifndef CONFIG_KERNEL_REPLICATION #define BPF_JIT_REGION_START (KASAN_SHADOW_END) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) @@ -55,6 +57,19 @@ #define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) +#else +#define MODULES_END (MODULES_VADDR + MODULES_VSIZE) +#define MODULES_VADDR (KASAN_SHADOW_END) +#define MODULES_VSIZE (SZ_128M) +#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) +#define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) +#define PCI_IO_END (VMEMMAP_START - SZ_2M) +#define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) +#define FIXADDR_TOP (PCI_IO_START - SZ_2M) +#define BPF_JIT_REGION_SIZE (SZ_128M) +#define BPF_JIT_REGION_START (FIXADDR_TOP - BPF_JIT_REGION_SIZE - SZ_2M) +#define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) +#endif #if VA_BITS > 48 #define VA_BITS_MIN (48) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 5a54a5ab5f928..97c3ba775ac0e 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -134,33 +134,7 @@ static inline void cpu_install_idmap(void) * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, * avoiding the possibility of conflicting TLB entries being allocated. */ -static inline void cpu_replace_ttbr1(pgd_t *pgdp) -{ - typedef void (ttbr_replace_func)(phys_addr_t); - extern ttbr_replace_func idmap_cpu_replace_ttbr1; - ttbr_replace_func *replace_phys; - - /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ - phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); - - if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) { - /* - * cpu_replace_ttbr1() is used when there's a boot CPU - * up (i.e. 
cpufeature framework is not up yet) and - * latter only when we enable CNP via cpufeature's - * enable() callback. - * Also we rely on the cpu_hwcap bit being set before - * calling the enable() function. - */ - ttbr1 |= TTBR_CNP_BIT; - } - - replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); - - cpu_install_idmap(); - replace_phys(ttbr1); - cpu_uninstall_idmap(); -} +void cpu_replace_ttbr1(pgd_t *pgdp); /* * It would be nice to return ASIDs back to the allocator, but unfortunately diff --git a/arch/arm64/include/asm/numa_replication.h b/arch/arm64/include/asm/numa_replication.h new file mode 100644 index 0000000000000..43068b5ce2e61 --- /dev/null +++ b/arch/arm64/include/asm/numa_replication.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_NUMA_REPLICATION_H +#define __ASM_NUMA_REPLICATION_H + +#ifdef CONFIG_KERNEL_REPLICATION +#include <asm/pgtable.h> +#include <asm/tlbflush.h> +#include <asm/pgalloc.h> +#include <asm/memory.h> +#include <asm/mmu_context.h> +#include <linux/mm.h> +#include <linux/seq_file.h> + +#define PAGE_TABLE_REPLICATION_LEFT ((max((u64)_end - SZ_2G, (u64)MODULES_VADDR)) & PGDIR_MASK) +#define PAGE_TABLE_REPLICATION_RIGHT ((((u64)_end + SZ_2G) & PGDIR_MASK) + PGDIR_SIZE - 1) + +static inline pgd_t *numa_replicate_pgt_pgd(int nid) +{ + pgd_t *new_pgd; + struct page *pgd_page; + + pgd_page = alloc_pages_node(nid, GFP_PGTABLE_KERNEL, 2); + BUG_ON(pgd_page == NULL); + + new_pgd = (pgd_t *)page_address(pgd_page); + new_pgd += (PTRS_PER_PGD * 2); //Extra pages for KPTI + + copy_page((void *)new_pgd, (void *)swapper_pg_dir); + + return new_pgd; +} + + +void cpu_replace_ttbr1(pgd_t *pgdp); +static inline void numa_load_replicated_pgd(pgd_t *pgd) +{ + cpu_replace_ttbr1(pgd); + local_flush_tlb_all(); +} + +static inline ssize_t numa_cpu_dump(struct seq_file *m) +{ + seq_printf(m, "NODE: #%02d, CPU: #%04d, ttbr1_el1: 0x%p, COMM: %s\n", + numa_node_id(), + smp_processor_id(), + (void *)read_sysreg(ttbr1_el1), + current->group_leader->comm); + return 0; +} + +static inline void numa_sync_text_replicas(unsigned long start, unsigned long end) +{ + __flush_icache_range(start, end); +} +#endif /* CONFIG_KERNEL_REPLICATION */ +#endif /* __ASM_NUMA_REPLICATION_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index f914c30b74871..30769c82bab7a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -21,8 +21,14 @@ * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space * and fixed mappings */ + +#ifndef CONFIG_KERNEL_REPLICATION #define VMALLOC_START (MODULES_END) #define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) +#else +#define VMALLOC_START ((MODULES_END & PGDIR_MASK) + PGDIR_SIZE) +#define VMALLOC_END (-PUD_SIZE - VMEMMAP_SIZE - SZ_64K - BPF_JIT_REGION_SIZE) +#endif #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) @@ -465,6 +471,15 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd) #define pfn_pmd(pfn,prot) __pmd(__phys_to_pmd_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +#ifdef CONFIG_KERNEL_REPLICATION +static inline pgprot_t pmd_pgprot(pmd_t pmd) +{ + unsigned long pfn = pmd_pfn(pmd); + + return __pgprot(pmd_val(pfn_pmd(pfn, __pgprot(0))) ^ pmd_val(pmd)); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #define pud_young(pud) pte_young(pud_pte(pud)) #define pud_mkyoung(pud) pte_pud(pte_mkyoung(pud_pte(pud))) #define pud_write(pud) pte_write(pud_pte(pud)) 
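A minimal sketch of how the helpers in the new asm/numa_replication.h above are meant to compose: each memory node gets its own copy of swapper_pg_dir, and a CPU is then switched onto the copy local to its node. It relies only on interfaces introduced by this series (numa_replicate_pgt_pgd(), numa_load_replicated_pgd(), per_node_pgd(), this_node_pgd(), for_each_memory_node()); setup_kernel_pgd_replicas() itself is a hypothetical caller for illustration, the real setup lives in mm/numa_replication.c, and init_mm.pgd_numa[] is assumed to be allocated already.

#include <linux/mm.h>
#include <linux/numa_replication.h>

/* Hypothetical caller, illustration only; assumes init_mm.pgd_numa[] exists. */
static void __init setup_kernel_pgd_replicas(void)
{
	int nid;

	/* One order-2 PGD copy per memory node (extra pages kept for KPTI). */
	for_each_memory_node(nid)
		per_node_pgd(&init_mm, nid) = numa_replicate_pgt_pgd(nid);

	/* Point this CPU's TTBR1_EL1 at its node-local table and flush TLBs. */
	numa_load_replicated_pgd(this_node_pgd(&init_mm));
}
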
diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 5f8e4c2df53cc..52444aab41afc 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -10,6 +10,7 @@ #include <linux/init.h> #include <linux/cpu.h> +#include <linux/numa_replication.h> #include <asm/cacheflush.h> #include <asm/alternative.h> #include <asm/cpufeature.h> @@ -132,18 +133,42 @@ static void clean_dcache_range_nopatch(u64 start, u64 end) } while (cur += d_size, cur < end); } -static void __apply_alternatives(void *alt_region, bool is_module, - unsigned long *feature_mask) +static void __write_alternatives(struct alt_instr *alt, + alternative_cb_t alt_cb, + __le32 *origptr, __le32 *updptr, + int nr_inst) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (is_text_replicated() && is_kernel_text((unsigned long)origptr)) { + int nid; + + for_each_memory_node(nid) { + __le32 *ptr = numa_get_replica(origptr, nid); + + alt_cb(alt, origptr, ptr, nr_inst); + clean_dcache_range_nopatch((u64)ptr, + (u64)(ptr + nr_inst)); + } + + return; + } +#endif /* CONFIG_KERNEL_REPLICATION */ + alt_cb(alt, origptr, updptr, nr_inst); +} + + +static void __apply_alternatives(const struct alt_region *region, + bool is_module, + unsigned long *cpucap_mask) { struct alt_instr *alt; - struct alt_region *region = alt_region; __le32 *origptr, *updptr; alternative_cb_t alt_cb; for (alt = region->begin; alt < region->end; alt++) { int nr_inst; - if (!test_bit(alt->cpufeature, feature_mask)) + if (!test_bit(alt->cpufeature, cpucap_mask)) continue; /* Use ARM64_CB_PATCH as an unconditional patch */ @@ -158,16 +183,17 @@ static void __apply_alternatives(void *alt_region, bool is_module, pr_info_once("patching kernel code\n"); - origptr = ALT_ORIG_PTR(alt); - updptr = is_module ? origptr : lm_alias(origptr); - nr_inst = alt->orig_len / AARCH64_INSN_SIZE; - if (alt->cpufeature < ARM64_CB_PATCH) alt_cb = patch_alternative; else alt_cb = ALT_REPL_PTR(alt); - alt_cb(alt, origptr, updptr, nr_inst); + + origptr = ALT_ORIG_PTR(alt); + updptr = is_module ? 
origptr : lm_alias(origptr); + nr_inst = alt->orig_len / AARCH64_INSN_SIZE; + + __write_alternatives(alt, alt_cb, origptr, updptr, nr_inst); if (!is_module) { clean_dcache_range_nopatch((u64)origptr, @@ -186,7 +212,7 @@ static void __apply_alternatives(void *alt_region, bool is_module, /* Ignore ARM64_CB bit from feature mask */ bitmap_or(applied_alternatives, applied_alternatives, - feature_mask, ARM64_NCAPS); + cpucap_mask, ARM64_NCAPS); bitmap_and(applied_alternatives, applied_alternatives, cpu_hwcaps, ARM64_NCAPS); } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index dee049d27c745..575ae1d565c59 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -75,6 +75,7 @@ #include <linux/init.h> #include <linux/libfdt.h> #include <linux/pbha.h> +#include <linux/numa_replication.h> #include <asm/cpu.h> #include <asm/cpufeature.h> @@ -3347,7 +3348,11 @@ void __init setup_cpu_features(void) static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { +#ifdef CONFIG_KERNEL_REPLICATION + cpu_replace_ttbr1(this_node_pgd(&init_mm)); +#else cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); +#endif /* CONFIG_KERNEL_REPLICATION */ } /* diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index 42003774d261d..b2eec98d80fa4 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/suspend.h> #include <linux/utsname.h> +#include <linux/numa_replication.h> #include <asm/barrier.h> #include <asm/cacheflush.h> @@ -123,7 +124,11 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) return -EOVERFLOW; arch_hdr_invariants(&hdr->invariants); +#ifdef CONFIG_KERNEL_REPLICATION + hdr->ttbr1_el1 = virt_to_phys(this_node_pgd(&init_mm)); +#else hdr->ttbr1_el1 = __pa_symbol(swapper_pg_dir); +#endif /* CONFIG_KERNEL_REPLICATION */ hdr->reenter_kernel = _cpu_resume; /* We can't use __hyp_get_vectors() because kvm may still be loaded */ diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index fbd2b7eec1dc5..4c484545dc592 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -15,6 +15,7 @@ #include <linux/stop_machine.h> #include <linux/types.h> #include <linux/uaccess.h> +#include <linux/numa_replication.h> #include <asm/cacheflush.h> #include <asm/debug-monitors.h> @@ -85,6 +86,7 @@ bool aarch64_insn_is_branch_imm(u32 insn) static DEFINE_RAW_SPINLOCK(patch_lock); +#ifndef CONFIG_KERNEL_REPLICATION static bool is_exit_text(unsigned long addr) { /* discarded with init text/data */ @@ -111,10 +113,22 @@ static void __kprobes *patch_map(void *addr, int fixmap) else return addr; + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + + (uintaddr & ~PAGE_MASK)); +} +#else +static void __kprobes *patch_map(void *addr, int fixmap, int nid) +{ + unsigned long uintaddr = (uintptr_t) addr; + struct page *page; + + page = walk_to_page_node(nid, addr); BUG_ON(!page); + return (void *)set_fixmap_offset(fixmap, page_to_phys(page) + (uintaddr & ~PAGE_MASK)); } +#endif /* CONFIG_KERNEL_REPLICATION */ static void __kprobes patch_unmap(int fixmap) { @@ -136,6 +150,45 @@ int __kprobes aarch64_insn_read(void *addr, u32 *insnp) return ret; } +#ifdef CONFIG_KERNEL_REPLICATION +static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) +{ + int nid; + int ret = 0; + void *waddr = addr; + unsigned long flags = 0; + + raw_spin_lock_irqsave(&patch_lock, flags); + for_each_memory_node(nid) { + waddr = 
patch_map(addr, FIX_TEXT_POKE0, nid); + ret = copy_to_kernel_nofault(waddr, &insn, AARCH64_INSN_SIZE); + patch_unmap(FIX_TEXT_POKE0); + if (ret || !is_text_replicated()) + break; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); + + return ret; +} +void __kprobes aarch64_literal64_write(void *addr, u64 data) +{ + u64 *waddr; + unsigned long flags = 0; + int nid; + + raw_spin_lock_irqsave(&patch_lock, flags); + for_each_memory_node(nid) { + waddr = patch_map(addr, FIX_TEXT_POKE0, nid); + + WRITE_ONCE(*waddr, data); + + patch_unmap(FIX_TEXT_POKE0); + if (!is_text_replicated()) + break; + } + raw_spin_unlock_irqrestore(&patch_lock, flags); +} +#else static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) { void *waddr = addr; @@ -152,7 +205,6 @@ static int __kprobes __aarch64_insn_write(void *addr, __le32 insn) return ret; } - void __kprobes aarch64_literal64_write(void *addr, u64 data) { u64 *waddr; @@ -166,6 +218,7 @@ void __kprobes aarch64_literal64_write(void *addr, u64 data) patch_unmap(FIX_TEXT_POKE0); raw_spin_unlock_irqrestore(&patch_lock, flags); } +#endif /* CONFIG_KERNEL_REPLICATION */ int __kprobes aarch64_insn_write(void *addr, u32 insn) { @@ -211,9 +264,11 @@ int __kprobes aarch64_insn_patch_text_nosync(void *addr, u32 insn) return -EINVAL; ret = aarch64_insn_write(tp, insn); - if (ret == 0) - __flush_icache_range((uintptr_t)tp, - (uintptr_t)tp + AARCH64_INSN_SIZE); + if (ret == 0) { + dsb(ish); + __flush_icache_all(); + isb(); + } return ret; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index acd557c83b6f3..73cd40c5387cc 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -20,7 +20,7 @@ #include <asm/insn.h> #include <asm/sections.h> -void *module_alloc(unsigned long size) +static void *__module_alloc(unsigned long size, unsigned long vm_flags, int nid) { u64 module_alloc_end = module_alloc_base + MODULES_VSIZE; gfp_t gfp_mask = GFP_KERNEL; @@ -35,8 +35,8 @@ void *module_alloc(unsigned long size) module_alloc_end = MODULES_END; p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, - module_alloc_end, gfp_mask, PAGE_KERNEL, VM_NO_HUGE_VMAP, - NUMA_NO_NODE, __builtin_return_address(0)); + module_alloc_end, gfp_mask, PAGE_KERNEL, + vm_flags | VM_NO_HUGE_VMAP, nid, __builtin_return_address(0)); if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && (IS_ENABLED(CONFIG_KASAN_VMALLOC) || !IS_ENABLED(CONFIG_KASAN))) @@ -52,7 +52,7 @@ void *module_alloc(unsigned long size) */ p = __vmalloc_node_range(size, MODULE_ALIGN, module_alloc_base, module_alloc_base + SZ_2G, GFP_KERNEL, - PAGE_KERNEL, VM_NO_HUGE_VMAP, NUMA_NO_NODE, + PAGE_KERNEL, vm_flags | VM_NO_HUGE_VMAP, nid, __builtin_return_address(0)); if (p && (kasan_module_alloc(p, size) < 0)) { @@ -63,6 +63,36 @@ void *module_alloc(unsigned long size) return p; } +#ifdef CONFIG_KERNEL_REPLICATION +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, VM_NUMA_SHARED, NUMA_NO_NODE); +} + +void *module_alloc_replica(unsigned long size) +{ + return __module_alloc(size, VM_NUMA_SHARED, first_memory_node); +} + +void module_replicate_numa(void *ptr) +{ + gfp_t gfp_mask = GFP_KERNEL; + + __vmalloc_node_replicate_range(ptr, gfp_mask, + PAGE_KERNEL, 0); +} +#else +void *module_alloc(unsigned long size) +{ + return __module_alloc(size, 0, NUMA_NO_NODE); +} + +void *module_alloc_replica(unsigned long size) +{ + return module_alloc(size); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + enum aarch64_reloc_op { RELOC_OP_NONE, RELOC_OP_ABS, diff --git 
a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index ba40d57757d63..301961933eb9c 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -5,6 +5,8 @@ #include <asm/assembler.h> #include <asm/smp.h> +.extern numa_setup_pgd + .text /* * Implementation of MPIDR_EL1 hash algorithm through shifting @@ -138,6 +140,10 @@ SYM_FUNC_START(_cpu_resume) bl kasan_unpoison_task_stack_below #endif +#ifdef CONFIG_KERNEL_REPLICATION + bl numa_setup_pgd +#endif + ldp x19, x20, [x29, #16] ldp x21, x22, [x29, #32] ldp x23, x24, [x29, #48] diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 23707812f87a6..b2ba8f4ff3056 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -37,6 +37,7 @@ #include <linux/kvm_host.h> #include <linux/perf/arm_pmu.h> #include <linux/crash_dump.h> +#include <linux/numa_replication.h> #include <asm/alternative.h> #include <asm/atomic.h> @@ -223,6 +224,12 @@ asmlinkage notrace void secondary_start_kernel(void) mmgrab(mm); current->active_mm = mm; + /* + * Setup per-NUMA node page table if kernel + * replication is enabled. Option supported + * only for 64-bit mode. + */ + numa_setup_pgd(); /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 5258dd6bcaef5..6cafcd5e65bf1 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -4,6 +4,8 @@ #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/pgtable.h> +#include <linux/numa_replication.h> + #include <asm/alternative.h> #include <asm/cacheflush.h> #include <asm/cpufeature.h> @@ -51,7 +53,11 @@ void notrace __cpu_suspend_exit(void) /* Restore CnP bit in TTBR1_EL1 */ if (system_supports_cnp()) +#ifdef CONFIG_KERNEL_REPLICATION + cpu_replace_ttbr1(this_node_pgd(&init_mm)); +#else cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); +#endif /* CONFIG_KERNEL_REPLICATION */ /* * PSTATE was not saved over suspend/resume, re-enable any detected diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 71f4b5f24d15f..193f98e7da748 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,6 +121,9 @@ SECTIONS _text = .; HEAD_TEXT } +#ifdef CONFIG_KERNEL_REPLICATION + . = ALIGN(PMD_SIZE); +#endif .text : { /* Real text segment */ _stext = .; /* Text and read-only data */ IRQENTRY_TEXT @@ -150,10 +153,17 @@ SECTIONS "Unexpected GOT/PLT entries detected!") . = ALIGN(SEGMENT_ALIGN); +#ifdef CONFIG_KERNEL_REPLICATION + . = ALIGN(PMD_SIZE); +#endif _etext = .; /* End of text section */ /* everything from this point to __init_begin will be marked RO NX */ +#ifdef CONFIG_KERNEL_REPLICATION + RO_DATA(PMD_SIZE) +#else RO_DATA(PAGE_SIZE) +#endif idmap_pg_dir = .; . += IDMAP_DIR_SIZE; diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index 001737a8f309b..81162bd0182ee 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -11,6 +11,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> +#include <linux/numa_replication.h> #include <asm/cpufeature.h> #include <asm/mmu_context.h> @@ -267,7 +268,7 @@ void check_and_switch_context(struct mm_struct *mm) * emulating PAN. 
*/ if (!system_uses_ttbr0_pan()) - cpu_switch_mm(mm->pgd, mm); + cpu_switch_mm(this_node_pgd(mm), mm); } unsigned long arm64_mm_context_get(struct mm_struct *mm) diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index e82bb65b48dc7..4c80ddba5e20a 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -678,6 +678,33 @@ static void __init free_unused_memmap(void) } #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_KERNEL_REPLICATION +/* + * It is necessary to preallocate vmalloc pages in advance, + * otherwise the replicated page-tables can be incomplete. + */ +static void __init preallocate_vmalloc_pages(void) +{ + unsigned long addr; + + for (addr = MODULES_VADDR; addr <= VMALLOC_END; + addr = ALIGN(addr + 1, PGDIR_SIZE)) { + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + + p4d = p4d_alloc(&init_mm, pgd, addr); + /* + * No need to check p4d here due to + * only 4-stage page table is possible + */ + pud = pud_alloc(&init_mm, p4d, addr); + if (!pud) + panic("Failed to pre-allocate pud pages for vmalloc area\n"); + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * mem_init() marks the free areas in the mem_map and tells us how much memory * is free. This is done after various parts of the system have claimed their @@ -722,6 +749,9 @@ void __init mem_init(void) */ sysctl_overcommit_memory = OVERCOMMIT_ALWAYS; } +#ifdef CONFIG_KERNEL_REPLICATION + preallocate_vmalloc_pages(); +#endif /* CONFIG_KERNEL_REPLICATION */ } void free_initmem(void) @@ -734,7 +764,15 @@ void free_initmem(void) * prevents the region from being reused for kernel modules, which * is not supported by kallsyms. */ - unmap_kernel_range((u64)__init_begin, (u64)(__init_end - __init_begin)); +#ifdef CONFIG_KERNEL_REPLICATION + /* + * In case of replicated kernel the per-NUMA node vmalloc + * memory should be released. + */ + vunmap_range_replicas((u64)__init_begin, (u64)__init_end); +#else + unmap_kernel_range((u64)__init_begin, (u64)__init_end - (u64)__init_begin); +#endif /* CONFIG_KERNEL_REPLICATION */ } void dump_mem_limit(void) diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 9528072916155..14bdd9738ec3f 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -13,6 +13,7 @@ #include <linux/memblock.h> #include <linux/start_kernel.h> #include <linux/mm.h> +#include <linux/numa_replication.h> #include <asm/mmu_context.h> #include <asm/kernel-pgtable.h> @@ -83,6 +84,24 @@ static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } +static void __init __kasan_p4d_populate(p4d_t *p4dp, phys_addr_t pud_phys, unsigned long addr) +{ +#ifdef CONFIG_KERNEL_REPLICATION + if (is_text_replicated()) { + int nid; + p4d_t *target; + + for_each_memory_node(nid) { + target = (p4d_t *)pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + __p4d_populate(target, pud_phys, PMD_TYPE_TABLE); + } + } else + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); +#else + __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, bool early) { @@ -90,7 +109,7 @@ static pud_t *__init kasan_pud_offset(p4d_t *p4dp, unsigned long addr, int node, phys_addr_t pud_phys = early ? __pa_symbol(kasan_early_shadow_pud) : kasan_alloc_zeroed_page(node); - __p4d_populate(p4dp, pud_phys, PMD_TYPE_TABLE); + __kasan_p4d_populate(p4dp, pud_phys, addr); } return early ? 
pud_offset_kimg(p4dp, addr) : pud_offset(p4dp, addr); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index adaca1fd5a277..01761486fd6e7 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -24,6 +24,7 @@ #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/pbha.h> +#include <linux/numa_replication.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -66,6 +67,38 @@ static pud_t bm_pud[PTRS_PER_PUD] __page_aligned_bss __maybe_unused; static DEFINE_SPINLOCK(swapper_pgdir_lock); static DEFINE_MUTEX(fixmap_lock); +void cpu_replace_ttbr1(pgd_t *pgdp) +{ + typedef void (ttbr_replace_func)(phys_addr_t); + extern ttbr_replace_func idmap_cpu_replace_ttbr1; + ttbr_replace_func *replace_phys; + + /* phys_to_ttbr() zeros lower 2 bits of ttbr with 52-bit PA */ + phys_addr_t ttbr1 = phys_to_ttbr(virt_to_phys(pgdp)); + +#ifdef CONFIG_KERNEL_REPLICATION + if (system_supports_cnp() && !WARN_ON(pgdp != this_node_pgd(&init_mm))) { +#else + if (system_supports_cnp() && !WARN_ON(pgdp != lm_alias(swapper_pg_dir))) { +#endif /* CONFIG_KERNEL_REPLICATION */ + /* + * cpu_replace_ttbr1() is used when there's a boot CPU + * up (i.e. cpufeature framework is not up yet) and + * latter only when we enable CNP via cpufeature's + * enable() callback. + * Also we rely on the cpu_hwcap bit being set before + * calling the enable() function. + */ + ttbr1 |= TTBR_CNP_BIT; + } + + replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); + + cpu_install_idmap(); + replace_phys(ttbr1); + cpu_uninstall_idmap(); +} + void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd) { pgd_t *fixmap_pgdp; @@ -454,6 +487,23 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, pgd_pgtable_alloc, flags); } +static void populate_mappings_prot(phys_addr_t phys, unsigned long virt, + phys_addr_t size, pgprot_t prot) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + for_each_memory_node(nid) { + __create_pgd_mapping(per_node_pgd(&init_mm, nid), + page_to_phys(walk_to_page_node(nid, (void *)virt)), + virt, size, prot, NULL, NO_CONT_MAPPINGS); + } +#else + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, + NO_CONT_MAPPINGS); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static void update_mapping_prot(phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot) { @@ -462,9 +512,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, - NO_CONT_MAPPINGS); + populate_mappings_prot(phys, virt, size, prot); /* flush the TLBs after updating live kernel mappings */ flush_tlb_kernel_range(virt, virt + size); @@ -641,6 +689,22 @@ static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, } #ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + +#ifdef CONFIG_KERNEL_REPLICATION +static void __init populate_trampoline_mappings(void) +{ + int nid; + + /* Copy trampoline mappings in replicated tables */ + for_each_memory_node(nid) { + memcpy(per_node_pgd(&init_mm, nid) - (PTRS_PER_PGD * 2), + tramp_pg_dir, PGD_SIZE); + } + /* Be sure that replicated page table can be observed properly */ + dsb(ishst); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static int __init map_entry_trampoline(void) { int i; @@ -670,6 +734,10 @@ static int __init map_entry_trampoline(void) PAGE_KERNEL_RO); } +#ifdef CONFIG_KERNEL_REPLICATION + populate_trampoline_mappings(); +#endif /* CONFIG_KERNEL_REPLICATION */ + return 0; } core_initcall(map_entry_trampoline); diff --git a/arch/arm64/mm/pageattr.c 
b/arch/arm64/mm/pageattr.c index e84a57c4db959..55f65e5bbe7f0 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -50,6 +50,24 @@ static int __change_memory_common(unsigned long start, unsigned long size, return ret; } +#ifdef CONFIG_KERNEL_REPLICATION +static int __change_memory_common_replicas(unsigned long start, unsigned long size, + pgprot_t set_mask, pgprot_t clear_mask) +{ + struct page_change_data data; + int ret; + + data.set_mask = set_mask; + data.clear_mask = clear_mask; + + ret = apply_to_page_range_replicas(&init_mm, start, size, + change_page_range, &data); + + flush_tlb_kernel_range(start, start + size); + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static int change_memory_common(unsigned long addr, int numpages, pgprot_t set_mask, pgprot_t clear_mask) { @@ -116,6 +134,24 @@ static int change_memory_common(unsigned long addr, int numpages, return __change_memory_common(start, size, set_mask, clear_mask); } +#ifdef CONFIG_KERNEL_REPLICATION + +static int numa_change_memory_common(unsigned long addr, int numpages, + pgprot_t set_mask, pgprot_t clear_mask) +{ + int ret; + + if (numpages == 0) + return 0; + + ret = change_memory_common(addr, numpages, set_mask, clear_mask); + if (ret) + return ret; + + return __change_memory_common_replicas(addr, numpages * PAGE_SIZE, set_mask, clear_mask); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + int set_memory_ro(unsigned long addr, int numpages) { return change_memory_common(addr, numpages, @@ -144,6 +180,36 @@ int set_memory_x(unsigned long addr, int numpages) __pgprot(PTE_PXN)); } +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_x(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_MAYBE_GP), + __pgprot(PTE_PXN)); +} + +int numa_set_memory_nx(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_PXN), + __pgprot(PTE_MAYBE_GP)); +} + +int numa_set_memory_ro(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_RDONLY), + __pgprot(PTE_WRITE)); +} + +int numa_set_memory_rw(unsigned long addr, int numpages) +{ + return numa_change_memory_common(addr, numpages, + __pgprot(PTE_WRITE), + __pgprot(PTE_RDONLY)); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + int set_memory_valid(unsigned long addr, int numpages, int enable) { if (enable) diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c index 4a64089e5771c..56e8047485a5e 100644 --- a/arch/arm64/mm/pgd.c +++ b/arch/arm64/mm/pgd.c @@ -10,6 +10,7 @@ #include <linux/gfp.h> #include <linux/highmem.h> #include <linux/slab.h> +#include <linux/numa_replication.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -17,6 +18,50 @@ static struct kmem_cache *pgd_cache __ro_after_init; +#ifdef CONFIG_KERNEL_REPLICATION +pgd_t *pgd_alloc(struct mm_struct *mm) +{ + int nid; + gfp_t gfp = GFP_PGTABLE_USER | __GFP_THISNODE; + pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + + if (!pgd_numa) + goto pgd_numa_fail; + + mm->pgd_numa = pgd_numa; + + /* + * Kernel replication is not supproted in case of non-page size pgd, + * in general we can support it, but maybe later, due to we need to + * update page tables allocation significantly, so, let's panic here. 
+ */ + BUG_ON(PGD_SIZE != PAGE_SIZE); + for_each_memory_node(nid) { + struct page *page; + + page = alloc_pages_node(nid, gfp, 0); + if (!page) + goto fail; + + per_node_pgd(mm, nid) = (pgd_t *)page_address(page); + } + + for_each_online_node(nid) + per_node_pgd(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid)); + + mm->pgd = per_node_pgd(mm, numa_get_memory_node(0)); + + return mm->pgd; + +fail: + pgd_free(mm, mm->pgd); + +pgd_numa_fail: + kfree(pgd_numa); + + return NULL; +} +#else pgd_t *pgd_alloc(struct mm_struct *mm) { gfp_t gfp = GFP_PGTABLE_USER; @@ -26,7 +71,29 @@ pgd_t *pgd_alloc(struct mm_struct *mm) else return kmem_cache_alloc(pgd_cache, gfp); } +#endif /* CONFIG_KERNEL_REPLICATION */ +#ifdef CONFIG_KERNEL_REPLICATION +void pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ + int nid; + /* + * Kernel replication is not supproted in case of non-page size pgd, + * in general we can support it, but maybe later, due to we need to + * update page tables allocation significantly, so, let's panic here. + */ + BUG_ON(PGD_SIZE != PAGE_SIZE); + for_each_memory_node(nid) { + if (per_node_pgd(mm, nid) == NULL) + break; + free_page((unsigned long)per_node_pgd(mm, nid)); + } + + for_each_online_node(nid) + per_node_pgd(mm, nid) = NULL; + kfree(mm->pgd_numa); +} +#else void pgd_free(struct mm_struct *mm, pgd_t *pgd) { if (PGD_SIZE == PAGE_SIZE) @@ -34,6 +101,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) else kmem_cache_free(pgd_cache, pgd); } +#endif /* CONFIG_KERNEL_REPLICATION */ void __init pgtable_cache_init(void) { diff --git a/arch/arm64/mm/ptdump.c b/arch/arm64/mm/ptdump.c index 000819979b96f..a1e6a4ecde4eb 100644 --- a/arch/arm64/mm/ptdump.c +++ b/arch/arm64/mm/ptdump.c @@ -18,6 +18,7 @@ #include <linux/ptdump.h> #include <linux/sched.h> #include <linux/seq_file.h> +#include <linux/numa_replication.h> #include <asm/fixmap.h> #include <asm/kasan.h> @@ -357,7 +358,7 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = PAGE_OFFSET, }; -void ptdump_check_wx(void) +static void ptdump_check_wx_pgd(struct mm_struct *mm, pgd_t *pgd) { struct pg_state st = { .seq = NULL, @@ -376,7 +377,7 @@ void ptdump_check_wx(void) } }; - ptdump_walk_pgd(&st.ptdump, &init_mm, NULL); + ptdump_walk_pgd(&st.ptdump, mm, pgd); if (st.wx_pages || st.uxn_pages) pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", @@ -385,6 +386,18 @@ void ptdump_check_wx(void) pr_info("Checked W+X mappings: passed, no W+X pages found\n"); } +void ptdump_check_wx(void) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + for_each_memory_node(nid) + ptdump_check_wx_pgd(&init_mm, per_node_pgd(&init_mm, nid)); +#else + ptdump_check_wx_pgd(&init_mm, init_mm->pgd); +#endif +} + static int ptdump_init(void) { address_markers[PAGE_END_NR].start_address = PAGE_END; diff --git a/drivers/firmware/efi/arm-runtime.c b/drivers/firmware/efi/arm-runtime.c index 9054c2852580d..563a82c941092 100644 --- a/drivers/firmware/efi/arm-runtime.c +++ b/drivers/firmware/efi/arm-runtime.c @@ -19,6 +19,7 @@ #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/pgtable.h> +#include <linux/numa_replication.h> #include <asm/cacheflush.h> #include <asm/efi.h> @@ -49,6 +50,16 @@ device_initcall(ptdump_init); #endif +#ifdef CONFIG_KERNEL_REPLICATION +static void populate_efi_pgd(struct mm_struct *efi_mm) +{ + int nid; + + for_each_memory_node(nid) + memcpy(per_node_pgd(efi_mm, nid), efi_mm->pgd, PGD_SIZE); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static bool __init efi_virtmap_init(void) { 
efi_memory_desc_t *md; @@ -73,7 +84,9 @@ static bool __init efi_virtmap_init(void) return false; } } - +#ifdef CONFIG_KERNEL_REPLICATION + populate_efi_pgd(&efi_mm); +#endif if (efi_memattr_apply_permissions(&efi_mm, efi_set_mapping_permissions)) return false; diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 5a8704176c4c5..0ad824b36f2e2 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -288,18 +288,6 @@ static bool check_image_region(u64 base, u64 size) return ret; } -/* - * Although relocatable kernels can fix up the misalignment with respect to - * MIN_KIMG_ALIGN, the resulting virtual text addresses are subtly out of - * sync with those recorded in the vmlinux when kaslr is disabled but the - * image required relocation anyway. Therefore retain 2M alignment unless - * KASLR is in use. - */ -static u64 min_kimg_align(void) -{ - return efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; -} - efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long *image_size, unsigned long *reserve_addr, @@ -310,6 +298,25 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, unsigned long kernel_size, kernel_memsize = 0; u32 phys_seed = 0; + /* + * Although relocatable kernels can fix up the misalignment with + * respect to MIN_KIMG_ALIGN, the resulting virtual text addresses are + * subtly out of sync with those recorded in the vmlinux when kaslr is + * disabled but the image required relocation anyway. Therefore retain + * 2M alignment if KASLR was explicitly disabled, even if it was not + * going to be activated to begin with. + */ +#ifdef CONFIG_KERNEL_REPLICATION + /* If kernel replication is enabled, the special alignment is necessary + * to avoid extra memory consumption during TT allocations and minimize + * TLB usage. Due to this fact for now we map kernel by huge pages even + * in case of KASLR enabled. Ugly but works. + */ + u64 min_kimg_align = HPAGE_SIZE; +#else + u64 min_kimg_align = efi_nokaslr ? MIN_KIMG_ALIGN : EFI_KIMG_ALIGN; +#endif + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { if (!efi_nokaslr) { status = efi_get_random_bytes(sizeof(phys_seed), @@ -343,7 +350,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, * If KASLR is enabled, and we have some randomness available, * locate the kernel at a randomized offset in physical memory. */ - status = efi_random_alloc(*reserve_size, min_kimg_align(), + status = efi_random_alloc(*reserve_size, min_kimg_align, reserve_addr, phys_seed); } else { status = EFI_OUT_OF_RESOURCES; @@ -352,7 +359,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, if (status != EFI_SUCCESS) { if (!check_image_region((u64)_text, kernel_memsize)) { efi_err("FIRMWARE BUG: Image BSS overlaps adjacent EFI memory region\n"); - } else if (IS_ALIGNED((u64)_text, min_kimg_align())) { + } else if (IS_ALIGNED((u64)_text, min_kimg_align)) { /* * Just execute from wherever we were loaded by the * UEFI PE/COFF loader if the alignment is suitable. 
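populate_efi_pgd() above copies the EFI mm's top level into every per-node PGD, which is the same "TT synchronization" pattern described in the commit message: once a top-level (PUD/P4D) entry appears in one table, the corresponding PGD slot has to be mirrored into all replicas. A simplified sketch of that idea, not the series' actual implementation: sync_pgd_slot_to_replicas() is a hypothetical name, while pgd_offset(), pgd_offset_pgd(), set_pgd(), per_node_pgd() and for_each_memory_node() are existing kernel or in-series interfaces.

#include <linux/mm.h>
#include <linux/numa_replication.h>

/* Hypothetical helper, illustration only. */
static void sync_pgd_slot_to_replicas(struct mm_struct *mm, unsigned long addr)
{
	/* Entry already written into the table the allocation ran on. */
	pgd_t entry = *pgd_offset(mm, addr);
	int nid;

	/* Mirror the same top-level entry into every per-node PGD. */
	for_each_memory_node(nid)
		set_pgd(pgd_offset_pgd(per_node_pgd(mm, nid), addr), entry);
}
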
@@ -363,7 +370,7 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, } status = efi_allocate_pages_aligned(*reserve_size, reserve_addr, - ULONG_MAX, min_kimg_align()); + ULONG_MAX, min_kimg_align); if (status != EFI_SUCCESS) { efi_err("Failed to relocate kernel\n"); diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h index 02932efad3ab4..ec28f86ea36dd 100644 --- a/include/asm-generic/pgalloc.h +++ b/include/asm-generic/pgalloc.h @@ -132,6 +132,29 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) } return (pmd_t *)page_address(page); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pmd_t *pmd_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + struct page *page; + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + gfp |= __GFP_THISNODE; + + page = alloc_pages_node(nid, gfp, 0); + if (!page) + return NULL; + if (!pgtable_pmd_page_ctor(page)) { + __free_pages(page, 0); + return NULL; + } + return (pmd_t *)page_address(page); +} +#endif /* CONFIG_KERNEL_REPLICATION */ #endif #ifndef __HAVE_ARCH_PMD_FREE @@ -147,6 +170,21 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) #if CONFIG_PGTABLE_LEVELS > 3 +#ifdef CONFIG_KERNEL_REPLICATION +static inline pud_t *__pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + gfp_t gfp = GFP_PGTABLE_USER; + + if (mm == &init_mm) + gfp = GFP_PGTABLE_KERNEL; + + gfp |= __GFP_THISNODE; + + return (pud_t *)get_zeroed_page_node(nid, gfp); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #ifndef __HAVE_ARCH_PUD_ALLOC_ONE /** * pud_alloc_one - allocate a page for PUD-level page table @@ -165,6 +203,15 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) gfp = GFP_PGTABLE_KERNEL; return (pud_t *)get_zeroed_page(gfp); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline pud_t *pud_alloc_one_node(unsigned int nid, + struct mm_struct *mm, unsigned long addr) +{ + return __pud_alloc_one_node(nid, mm, addr); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #endif static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/include/asm-generic/pgtable-nop4d.h b/include/asm-generic/pgtable-nop4d.h index 2f1d0aad645cf..e482970238026 100644 --- a/include/asm-generic/pgtable-nop4d.h +++ b/include/asm-generic/pgtable-nop4d.h @@ -49,6 +49,11 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address) * inside the pgd, so has no extra memory associated with it. 
*/ #define p4d_alloc_one(mm, address) NULL + +#ifdef CONFIG_USER_REPLICATION +#define p4d_alloc_one_node(nid, mm, address) NULL +#endif + #define p4d_free(mm, x) do { } while (0) #define p4d_free_tlb(tlb, x, a) do { } while (0) diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h index c86abf6bc7ba2..886639600e649 100644 --- a/include/asm-generic/set_memory.h +++ b/include/asm-generic/set_memory.h @@ -10,4 +10,16 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +#ifdef CONFIG_KERNEL_REPLICATION +int numa_set_memory_ro(unsigned long addr, int numpages); +int numa_set_memory_rw(unsigned long addr, int numpages); +int numa_set_memory_x(unsigned long addr, int numpages); +int numa_set_memory_nx(unsigned long addr, int numpages); +#else +#define numa_set_memory_ro set_memory_ro +#define numa_set_memory_rw set_memory_rw +#define numa_set_memory_x set_memory_x +#define numa_set_memory_nx set_memory_nx +#endif /* CONFIG_KERNEL_REPLICATION */ + #endif diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3f2ac540451c4..79d31e9e2edd1 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -597,6 +597,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); +#ifdef CONFIG_KERNEL_REPLICATION +extern unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, unsigned int order); +extern unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask); +#endif /* CONFIG_KERNEL_REPLICATION */ + void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); diff --git a/include/linux/mm.h b/include/linux/mm.h index d9d8e68c21096..6eb790b220e5f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -814,6 +814,8 @@ int region_intersects(resource_size_t offset, size_t size, unsigned long flags, struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); +struct page *walk_to_page_node(int nid, const void *addr); + /* * Determine if an address is within the vmalloc range * @@ -2098,9 +2100,25 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, { return 0; } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #else int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); -#endif + +#ifdef CONFIG_KERNEL_REPLICATION +int __p4d_alloc_node(unsigned int nid, struct mm_struct *mm, + pgd_t *pgd, unsigned long address); +#endif /* CONFIG_KERNEL_REPLICATION */ + +#endif /* __PAGETABLE_P4D_FOLDED */ #if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, @@ -2108,12 +2126,28 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int 
__pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); +#ifdef CONFIG_KERNEL_REPLICATION +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address); +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_puds(struct mm_struct *mm) { if (mm_pud_folded(mm)) @@ -2136,11 +2170,25 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } +#ifdef CONFIG_KERNEL_REPLICATION +static inline int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); +#ifdef CONFIG_KERNEL_REPLICATION +int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address); +#endif /* CONFIG_KERNEL_REPLICATION */ static inline void mm_inc_nr_pmds(struct mm_struct *mm) { @@ -2213,6 +2261,33 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))? NULL: pmd_offset(pud, address); } + +#ifdef CONFIG_KERNEL_REPLICATION +static inline p4d_t *p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + return (unlikely(pgd_none(*pgd)) && __p4d_alloc_node(nid, mm, pgd, address)) ? + NULL : p4d_offset(pgd, address); +} + +static inline pud_t *pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + return (unlikely(p4d_none(*p4d)) && __pud_alloc_node(nid, mm, p4d, address)) ? + NULL : pud_offset(p4d, address); +} + +static inline pmd_t *pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + return (unlikely(pud_none(*pud)) && __pmd_alloc_node(nid, mm, pud, address)) ? + NULL : pmd_offset(pud, address); +} +#endif /* CONFIG_KERNEL_REPLICATION */ + #endif /* CONFIG_MMU */ #if USE_SPLIT_PTE_PTLOCKS @@ -3004,6 +3079,10 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); +#if defined(CONFIG_KERNEL_REPLICATION) && defined(CONFIG_ARM64) +int apply_to_page_range_replicas(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data); +#endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1c5946ad402d..382d018bbc157 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -19,6 +19,8 @@ #include <asm/mmu.h> +#include <linux/numa.h> + #ifndef AT_VECTOR_SIZE_ARCH #define AT_VECTOR_SIZE_ARCH 0 #endif @@ -435,8 +437,7 @@ struct mm_struct { #endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address */ - pgd_t * pgd; - + pgd_t *pgd; #ifdef CONFIG_MEMBARRIER /** * @membarrier_state: Flags controlling membarrier behavior. 
@@ -645,7 +646,11 @@ struct mm_struct { #else KABI_RESERVE(4) #endif +#ifdef CONFIG_KERNEL_REPLICATION + KABI_USE(5, pgd_t **pgd_numa) +#else KABI_RESERVE(5) +#endif KABI_RESERVE(6) KABI_RESERVE(7) KABI_RESERVE(8) diff --git a/include/linux/module.h b/include/linux/module.h index b2b2c742a3971..b58fb669a00c4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -564,8 +564,7 @@ struct module { #else KABI_RESERVE(1) #endif - - KABI_RESERVE(2) + KABI_USE(2, struct module_layout *mutable_data_layout) KABI_RESERVE(3) KABI_RESERVE(4) } ____cacheline_aligned __randomize_layout; @@ -611,9 +610,19 @@ static inline bool within_module_init(unsigned long addr, addr < (unsigned long)mod->init_layout.base + mod->init_layout.size; } +static inline bool within_module_mutable(unsigned long addr, + const struct module *mod) +{ + return (unsigned long)mod->mutable_data_layout->base <= addr && + addr < (unsigned long)mod->mutable_data_layout->base + + mod->mutable_data_layout->size; +} + + static inline bool within_module(unsigned long addr, const struct module *mod) { - return within_module_init(addr, mod) || within_module_core(addr, mod); + return within_module_init(addr, mod) || within_module_core(addr, mod) + || within_module_mutable(addr, mod); } /* Search for module by name: must hold module_mutex. */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 0b4541cb4c740..c466e83a43319 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -25,6 +25,17 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); /* Allocator used for allocating struct module, core sections and init sections. Returns NULL on failure. */ void *module_alloc(unsigned long size); +void *module_alloc_replica(unsigned long size); + +#ifndef CONFIG_KERNEL_REPLICATION +static inline void module_replicate_numa(void *ptr) +{ + (void) ptr; +} +#else +/* Replicate memory allocated in previous function*/ +void module_replicate_numa(void *ptr); +#endif /* CONFIG_KERNEL_REPLICATION */ /* Free memory returned from module_alloc. */ void module_memfree(void *module_region); diff --git a/include/linux/numa_replication.h b/include/linux/numa_replication.h new file mode 100644 index 0000000000000..1a22b56d9312b --- /dev/null +++ b/include/linux/numa_replication.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_REPLICATION_H +#define _LINUX_NUMA_REPLICATION_H + +#include <linux/kabi.h> + +/* + * Why? Because linux is defined to 1 for some reason, + * and linux/mm.h converted to 1/mm.h. Perhaps compiler? + * Do not ask me, I have no idea. 
+ */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mm_types.h>) +#include KABI_HIDE_INCLUDE(<linux/nodemask.h>) +#include KABI_HIDE_INCLUDE(<linux/module.h>) +#include KABI_HIDE_INCLUDE(<linux/mm.h>) + +#ifdef CONFIG_KERNEL_REPLICATION +#include KABI_HIDE_INCLUDE(<asm/numa_replication.h>) +#endif + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + + +extern nodemask_t replica_nodes; + +#define for_each_memory_node(nid) \ + for (nid = first_node(replica_nodes); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes)) + +#ifdef CONFIG_KERNEL_REPLICATION +#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()]) +#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid]) + +static inline bool numa_addr_has_replica(const void *addr) +{ + return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && + ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); +} + +void __init numa_replication_init(void); +void __init numa_replicate_kernel_text(void); +void numa_replicate_kernel_rodata(void); +void numa_replication_fini(void); + +bool is_text_replicated(void); +void numa_setup_pgd(void); +void __init_or_module *numa_get_replica(void *vaddr, int nid); +int numa_get_memory_node(int nid); +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end); +#else +#define this_node_pgd(mm) ((mm)->pgd) +#define per_node_pgd(mm, nid) ((mm)->pgd) + +static inline void numa_setup_pgd(void) +{ +} + +static inline void __init numa_replication_init(void) +{ +} + +static inline void __init numa_replicate_kernel_text(void) +{ +} + +static inline void numa_replicate_kernel_rodata(void) +{ +} + +static inline void numa_replication_fini(void) +{ +} + +static inline bool numa_addr_has_replica(const void *addr) +{ + return false; +} + +static inline bool is_text_replicated(void) +{ + return false; +} + +static inline void __init_or_module *numa_get_replica(void *vaddr, int nid) +{ + return lm_alias(vaddr); +} + +static inline void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} +#endif /*CONFIG_KERNEL_REPLICATION*/ +#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6077e479b6835..9a3e5baaa47b9 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -36,6 +36,10 @@ struct notifier_block; /* in notifier.h */ #define VM_SHAREPOOL 0 #endif +#ifdef CONFIG_KERNEL_REPLICATION +#define VM_NUMA_SHARED 0x00000800 /* Pages shared between per-NUMA node TT*/ +#endif + /* * VM_KASAN is used slighly differently depending on CONFIG_KASAN_VMALLOC. 
* @@ -69,6 +73,11 @@ struct vm_struct { unsigned int nr_pages; phys_addr_t phys_addr; const void *caller; +#ifdef CONFIG_KERNEL_REPLICATION + KABI_EXTEND(int node) + KABI_EXTEND(bool replicated) +#endif + }; struct vmap_area { @@ -141,6 +150,18 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +#ifdef CONFIG_KERNEL_REPLICATION + /* + * DO NOT USE this function if you don't understand what it is doing + */ +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags); +#ifdef CONFIG_ARM64 +void vunmap_range_replicas(unsigned long addr, unsigned long end); +#endif + +#endif + void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller); void *vmalloc_no_huge(unsigned long size); diff --git a/init/main.c b/init/main.c index 2b466bd041104..a51e0c1dc54f7 100644 --- a/init/main.c +++ b/init/main.c @@ -99,6 +99,7 @@ #include <linux/kcsan.h> #include <linux/init_syscalls.h> #include <linux/randomize_kstack.h> +#include <linux/numa_replication.h> #include <asm/io.h> #include <asm/setup.h> @@ -926,12 +927,15 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) * These use large bootmem allocations and must precede * kmem_cache_init() */ + numa_replication_init(); setup_log_buf(0); vfs_caches_init_early(); sort_main_extable(); trap_init(); mm_init(); poking_init(); + numa_replicate_kernel_text(); + ftrace_init(); /* trace_printk can be enabled here */ @@ -1450,6 +1454,13 @@ static int __ref kernel_init(void *unused) free_initmem(); mark_readonly(); + /* + * RODATA replication is done here due to + * it is necessary to finalize the kernel + * and modules initialization before + */ + numa_replicate_kernel_rodata(); + numa_replication_fini(); /* * Kernel mappings are now finalized - update the userspace page-table * to finalize PTI. diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index ac283f9b2332e..8077577a8e647 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -450,8 +450,8 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, refcount_set(&kvalue->refcnt, 1); bpf_map_inc(map); - set_memory_ro((long)st_map->image, 1); - set_memory_x((long)st_map->image, 1); + numa_set_memory_ro((long)st_map->image, 1); + numa_set_memory_x((long)st_map->image, 1); err = st_ops->reg(kdata); if (likely(!err)) { /* Pair with smp_load_acquire() during lookup_elem(). @@ -468,8 +468,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, * in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ - set_memory_nx((long)st_map->image, 1); - set_memory_rw((long)st_map->image, 1); + + numa_set_memory_nx((long)st_map->image, 1); + numa_set_memory_rw((long)st_map->image, 1); bpf_map_put(map); reset_unlock: diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index af9fea93b4883..2f9b974113557 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -37,7 +37,7 @@ void *bpf_jit_alloc_exec_page(void) /* Keep image as writeable. The alternative is to keep flipping ro/rw * everytime new program is attached or detached. 
*/ - set_memory_x((long)image, 1); + numa_set_memory_x((long)image, 1); return image; } diff --git a/kernel/module.c b/kernel/module.c index 57b34c32a9168..0e1b8e91a45ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2038,8 +2038,8 @@ static void frob_text(const struct module_layout *layout, static void module_enable_x(const struct module *mod) { - frob_text(&mod->core_layout, set_memory_x); - frob_text(&mod->init_layout, set_memory_x); + frob_text(&mod->core_layout, numa_set_memory_x); + frob_text(&mod->init_layout, numa_set_memory_x); } #else /* !CONFIG_ARCH_HAS_STRICT_MODULE_RWX */ static void module_enable_x(const struct module *mod) { } @@ -2082,11 +2082,11 @@ void module_disable_ro(const struct module *mod) if (!rodata_enabled) return; - frob_text(&mod->core_layout, set_memory_rw); - frob_rodata(&mod->core_layout, set_memory_rw); - frob_ro_after_init(&mod->core_layout, set_memory_rw); - frob_text(&mod->init_layout, set_memory_rw); - frob_rodata(&mod->init_layout, set_memory_rw); + frob_text(&mod->core_layout, numa_set_memory_rw); + frob_rodata(&mod->core_layout, numa_set_memory_rw); + frob_ro_after_init(mod->mutable_data_layout, numa_set_memory_rw); + frob_text(&mod->init_layout, numa_set_memory_rw); + frob_rodata(&mod->init_layout, numa_set_memory_rw); } void module_enable_ro(const struct module *mod, bool after_init) @@ -2096,23 +2096,24 @@ void module_enable_ro(const struct module *mod, bool after_init) set_vm_flush_reset_perms(mod->core_layout.base); set_vm_flush_reset_perms(mod->init_layout.base); - frob_text(&mod->core_layout, set_memory_ro); + set_vm_flush_reset_perms(mod->mutable_data_layout->base); + frob_text(&mod->core_layout, numa_set_memory_ro); - frob_rodata(&mod->core_layout, set_memory_ro); - frob_text(&mod->init_layout, set_memory_ro); - frob_rodata(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->core_layout, numa_set_memory_ro); + frob_text(&mod->init_layout, numa_set_memory_ro); + frob_rodata(&mod->init_layout, numa_set_memory_ro); if (after_init) - frob_ro_after_init(&mod->core_layout, set_memory_ro); + frob_ro_after_init(mod->mutable_data_layout, numa_set_memory_ro); } static void module_enable_nx(const struct module *mod) { - frob_rodata(&mod->core_layout, set_memory_nx); - frob_ro_after_init(&mod->core_layout, set_memory_nx); - frob_writable_data(&mod->core_layout, set_memory_nx); - frob_rodata(&mod->init_layout, set_memory_nx); - frob_writable_data(&mod->init_layout, set_memory_nx); + frob_rodata(&mod->core_layout, numa_set_memory_nx); + frob_ro_after_init(mod->mutable_data_layout, numa_set_memory_nx); + frob_writable_data(mod->mutable_data_layout, numa_set_memory_nx); + frob_rodata(&mod->init_layout, numa_set_memory_nx); + frob_writable_data(mod->mutable_data_layout, numa_set_memory_nx); } static int module_enforce_rwx_sections(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, @@ -2237,6 +2238,7 @@ void __weak module_arch_freeing_init(struct module *mod) /* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { + struct module_layout *mut_layout = mod->mutable_data_layout; trace_module_free(mod); mod_sysfs_teardown(mod); @@ -2284,6 +2286,8 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ module_memfree(mod->core_layout.base); + module_memfree(mod->mutable_data_layout->base); + kfree(mut_layout); } void *__symbol_get(const char *symbol) @@ -2512,6 +2516,21 @@ static void layout_sections(struct module *mod, struct load_info *info) pr_debug("Core section allocation order:\n"); for (m = 0; m < ARRAY_SIZE(masks); ++m) { + unsigned int *sizep; + + switch (m) { + case 0: + sizep = &mod->core_layout.size; + break; + case 1: + sizep = &mod->core_layout.size; + break; + case 2: + case 3: + case 4: + sizep = &mod->mutable_data_layout->size; + } + for (i = 0; i < info->hdr->e_shnum; ++i) { Elf_Shdr *s = &info->sechdrs[i]; const char *sname = info->secstrings + s->sh_name; @@ -2521,7 +2540,7 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || module_init_layout_section(sname)) continue; - s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); + s->sh_entsize = get_offset(mod, sizep, s, i); pr_debug("\t%s\n", sname); } switch (m) { @@ -2534,11 +2553,11 @@ static void layout_sections(struct module *mod, struct load_info *info) mod->core_layout.ro_size = mod->core_layout.size; break; case 2: /* RO after init */ - mod->core_layout.size = debug_align(mod->core_layout.size); - mod->core_layout.ro_after_init_size = mod->core_layout.size; + mod->mutable_data_layout->size = debug_align(mod->mutable_data_layout->size); + mod->mutable_data_layout->ro_after_init_size = mod->mutable_data_layout->size; break; case 4: /* whole core */ - mod->core_layout.size = debug_align(mod->core_layout.size); + mod->mutable_data_layout->size = debug_align(mod->mutable_data_layout->size); break; } } @@ -2791,12 +2810,12 @@ static void layout_symtab(struct module *mod, struct load_info *info) } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_layout.size += strtab_size; - info->core_typeoffs = mod->core_layout.size; - mod->core_layout.size += ndst * sizeof(char); - mod->core_layout.size = debug_align(mod->core_layout.size); + info->symoffs = ALIGN(mod->mutable_data_layout->size, symsect->sh_addralign ?: 1); + info->stroffs = mod->mutable_data_layout->size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->mutable_data_layout->size += strtab_size; + info->core_typeoffs = mod->mutable_data_layout->size; + mod->mutable_data_layout->size += ndst * sizeof(char); + mod->mutable_data_layout->size = debug_align(mod->mutable_data_layout->size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; @@ -2840,9 +2859,9 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) * Now populate the cut down core kallsyms for after init * and set types up while we still have access to sections. 
*/ - mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; - mod->core_kallsyms.typetab = mod->core_layout.base + info->core_typeoffs; + mod->core_kallsyms.symtab = dst = mod->mutable_data_layout->base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->mutable_data_layout->base + info->stroffs; + mod->core_kallsyms.typetab = mod->mutable_data_layout->base + info->core_typeoffs; src = mod->kallsyms->symtab; for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { mod->kallsyms->typetab[i] = elf_type(src + i, info); @@ -2889,6 +2908,11 @@ void * __weak module_alloc(unsigned long size) NUMA_NO_NODE, __builtin_return_address(0)); } +void * __weak module_alloc_replica(unsigned long size) +{ + return module_alloc(size); +} + bool __weak module_init_section(const char *name) { return strstarts(name, ".init"); @@ -3444,37 +3468,48 @@ static int find_module_sections(struct module *mod, struct load_info *info) static int move_module(struct module *mod, struct load_info *info) { int i; - void *ptr; + void *core_mem; + void *init_mem; + void *mutable_data_mem; /* Do the allocs. */ - ptr = module_alloc(mod->core_layout.size); + core_mem = module_alloc_replica(mod->core_layout.size); + mutable_data_mem = module_alloc(mod->mutable_data_layout->size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a * leak. */ - kmemleak_not_leak(ptr); - if (!ptr) + kmemleak_not_leak(core_mem); + kmemleak_not_leak(mutable_data_mem); + if (!core_mem) return -ENOMEM; + if (!mutable_data_mem) { + module_memfree(core_mem); + return -ENOMEM; + } + memset(core_mem, 0, mod->core_layout.size); + memset(mutable_data_mem, 0, mod->mutable_data_layout->size); - memset(ptr, 0, mod->core_layout.size); - mod->core_layout.base = ptr; + mod->core_layout.base = core_mem; + mod->mutable_data_layout->base = mutable_data_mem; if (mod->init_layout.size) { - ptr = module_alloc(mod->init_layout.size); + init_mem = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be * scanned as it contains data and code that will be freed * after the module is initialized. */ - kmemleak_ignore(ptr); - if (!ptr) { + kmemleak_ignore(init_mem); + if (!init_mem) { module_memfree(mod->core_layout.base); + module_memfree(mod->mutable_data_layout->base); return -ENOMEM; } - memset(ptr, 0, mod->init_layout.size); - mod->init_layout.base = ptr; + memset(init_mem, 0, mod->init_layout.size); + mod->init_layout.base = init_mem; } else mod->init_layout.base = NULL; @@ -3490,6 +3525,8 @@ static int move_module(struct module *mod, struct load_info *info) if (shdr->sh_entsize & INIT_OFFSET_MASK) dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else if (shdr->sh_flags & SHF_WRITE || shdr->sh_flags & SHF_RO_AFTER_INIT) + dest = mod->mutable_data_layout->base + shdr->sh_entsize; else dest = mod->core_layout.base + shdr->sh_entsize; @@ -3634,6 +3671,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any special cases for the architectures. 
*/ + info->mod->mutable_data_layout = (struct module_layout *)kmalloc(sizeof(struct module_layout), GFP_KERNEL | __GFP_ZERO); layout_sections(info->mod, info); layout_symtab(info->mod, info); @@ -3921,6 +3959,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); + /* Replicate read-only memory between numa nodes*/ + module_replicate_numa(mod->core_layout.base); + module_enable_ro(mod, false); module_enable_nx(mod); module_enable_x(mod); diff --git a/mm/Kconfig b/mm/Kconfig index cc43f5124cb38..25bd538856a5e 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1048,6 +1048,16 @@ config THP_NUMA_CONTROL This featrue add a switch to control the behavior of THP migration when do numa balancing. +config KERNEL_REPLICATION + bool "Enable kernel text and ro-data replication between NUMA nodes" + default n + depends on ARM64 && MMU && NUMA && !MAXSMP + + help + Creates per NUMA node pagetable which allows to replicate text and ro-data. + + If unsure, say "n". + source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 112966190c1db..259b312f717a4 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -134,3 +134,5 @@ obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o +obj-$(CONFIG_KERNEL_REPLICATION) += numa_replication.o + diff --git a/mm/memory.c b/mm/memory.c index f580b9c542471..42a50af36a4ca 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -75,6 +75,7 @@ #include <linux/vmalloc.h> #include <linux/userswap.h> #include <linux/pbha.h> +#include <linux/numa_replication.h> #include <trace/events/kmem.h> @@ -257,6 +258,24 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, mm_dec_nr_pmds(tlb->mm); } +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + int offset; + + if (mm_p4d_folded(tlb->mm)) { + offset = p4d - (p4d_t *)tlb->mm->pgd; + for_each_memory_node(nid) + p4d_clear((p4d_t *)tlb->mm->pgd_numa[nid] + offset); + } else { + p4d_clear(p4d); + } +#else + p4d_clear(p4d); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) @@ -286,11 +305,29 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, return; pud = pud_offset(p4d, start); - p4d_clear(p4d); + + __free_pud_range(tlb, p4d); + pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd) +{ +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + int offset; + + if (!mm_p4d_folded(tlb->mm)) { + offset = pgd - (pgd_t *)tlb->mm->pgd; + for_each_memory_node(nid) + pgd_clear(tlb->mm->pgd_numa[nid] + offset); + } +#else + pgd_clear(pgd); +#endif /* CONFIG_KERNEL_REPLICATION */ +} + static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) @@ -320,7 +357,9 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, return; p4d = p4d_offset(pgd, start); - pgd_clear(pgd); + + __free_p4d_range(tlb, pgd); + p4d_free_tlb(tlb, p4d, start); } @@ -2592,7 +2631,9 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, return err; } -static int 
__apply_to_page_range(struct mm_struct *mm, unsigned long addr, +static int __apply_to_page_range(struct mm_struct *mm, + pgd_t *pgtable, + unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { @@ -2605,7 +2646,7 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_none(*pgd) && !create) @@ -2636,10 +2677,32 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { - return __apply_to_page_range(mm, addr, size, fn, data, true); + return __apply_to_page_range(mm, mm->pgd, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); +#if defined(CONFIG_KERNEL_REPLICATION) && defined(CONFIG_ARM64) +/* + * Same as apply_to_page_range(), but taking into account per-NUMA node + * replicas. + */ +int apply_to_page_range_replicas(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + int nid; + int ret = 0; + + for_each_memory_node(nid) { + ret = __apply_to_page_range(mm, per_node_pgd(mm, nid), + addr, size, fn, data, true); + if (ret) + break; + } + + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ + /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. @@ -2650,7 +2713,7 @@ EXPORT_SYMBOL_GPL(apply_to_page_range); int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { - return __apply_to_page_range(mm, addr, size, fn, data, false); + return __apply_to_page_range(mm, mm->pgd, addr, size, fn, data, false); } EXPORT_SYMBOL_GPL(apply_to_existing_page_range); @@ -4883,6 +4946,51 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, EXPORT_SYMBOL_GPL(handle_mm_fault); #ifndef __PAGETABLE_P4D_FOLDED + +#ifdef CONFIG_KERNEL_REPLICATION +static void __p4d_populate_to_replicas(struct mm_struct *mm, + p4d_t *p4d, + unsigned long address) +{ + int nid; + pgd_t *pgd; + + if (mm_p4d_folded(mm) || !is_text_replicated()) + return; + + for_each_memory_node(nid) { + pgd = pgd_offset_pgd(mm->pgd_numa[nid], address); + if (pgd_present(*pgd)) + continue; + pgd_populate(mm, pgd, p4d); + } +} + +int __p4d_alloc_node(unsigned int nid, + struct mm_struct *mm, + pgd_t *pgd, unsigned long address) +{ + p4d_t *new = p4d_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (pgd_present(*pgd)) { /* Another has populated it */ + p4d_free(mm, new); + } else { + smp_wmb(); /* See comment in pmd_install() */ + pgd_populate(mm, pgd, new); + } + spin_unlock(&mm->page_table_lock); + return 0; +} +#else +static void __p4d_populate_to_replicas(struct mm_struct *mm, + p4d_t *p4d, + unsigned long address) +{ } +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * Allocate p4d page table. * We've already handled the fast-path in-line. 
@@ -4898,14 +5006,63 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_lock(&mm->page_table_lock); if (pgd_present(*pgd)) /* Another has populated it */ p4d_free(mm, new); - else + else { pgd_populate(mm, pgd, new); + __p4d_populate_to_replicas(mm, new, address); + } spin_unlock(&mm->page_table_lock); return 0; } #endif /* __PAGETABLE_P4D_FOLDED */ #ifndef __PAGETABLE_PUD_FOLDED + +#ifdef CONFIG_KERNEL_REPLICATION +static void __pud_populate_to_replicas(struct mm_struct *mm, + pud_t *pud, + unsigned long address) +{ + int nid; + p4d_t *p4d; + + if (!mm_p4d_folded(mm) || !is_text_replicated()) + return; + + for_each_online_node(nid) { + p4d = (p4d_t *)pgd_offset_pgd(mm->pgd_numa[nid], address); + if (p4d_present(*p4d)) + continue; + p4d_populate(mm, p4d, pud); + } +} + +int __pud_alloc_node(unsigned int nid, + struct mm_struct *mm, + p4d_t *p4d, unsigned long address) +{ + pud_t *new = pud_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + spin_lock(&mm->page_table_lock); + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); + smp_wmb(); /* See comment in pmd_install() */ + p4d_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); + spin_unlock(&mm->page_table_lock); + return 0; +} +#else +static void __pud_populate_to_replicas(struct mm_struct *mm, + pud_t *pud, + unsigned long address) +{ + return; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * Allocate page upper directory. * We've already handled the fast-path in-line. @@ -4922,6 +5079,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); + __pud_populate_to_replicas(mm, new, address); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); @@ -4952,6 +5110,29 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(ptl); return 0; } + +#ifdef CONFIG_KERNEL_REPLICATION +int __pmd_alloc_node(unsigned int nid, + struct mm_struct *mm, + pud_t *pud, unsigned long address) +{ + spinlock_t *ptl; + pmd_t *new = pmd_alloc_one_node(nid, mm, address); + if (!new) + return -ENOMEM; + + ptl = pud_lock(mm, pud); + if (!pud_present(*pud)) { + mm_inc_nr_pmds(mm); + smp_wmb(); /* See comment in pmd_install() */ + pud_populate(mm, pud, new); + } else { /* Another has populated it */ + pmd_free(mm, new); + } + spin_unlock(ptl); + return 0; +} +#endif /* CONFIG_KERNEL_REPLICATION */ #endif /* __PAGETABLE_PMD_FOLDED */ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, @@ -5587,3 +5768,62 @@ vm_fault_t do_anon_page_remap(struct vm_area_struct *vma, unsigned long address, return VM_FAULT_OOM; } #endif + +/** + * Walk in replicated tranlation table specified by nid. 
+ * If kernel replication is disabled or text is not replicated yet, + * value of nid is not used + */ +struct page *walk_to_page_node(int nid, const void *vmalloc_addr) +{ + unsigned long addr = (unsigned long)vmalloc_addr; + struct page *page = NULL; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + + if (!is_text_replicated()) + nid = 0; + + pgd = pgd_offset_pgd(per_node_pgd(&init_mm, nid), addr); + if (pgd_none(*pgd)) + return NULL; + if (WARN_ON_ONCE(pgd_leaf(*pgd))) + return NULL; /* XXX: no allowance for huge pgd */ + if (WARN_ON_ONCE(pgd_bad(*pgd))) + return NULL; + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) + return NULL; + if (p4d_leaf(*p4d)) + return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(p4d_bad(*p4d))) + return NULL; + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) + return NULL; + if (pud_leaf(*pud)) + return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pud_bad(*pud))) + return NULL; + + pmd = pmd_offset(pud, addr); + if (pmd_none(*pmd)) + return NULL; + if (pmd_leaf(*pmd)) + return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + if (WARN_ON_ONCE(pmd_bad(*pmd))) + return NULL; + + ptep = pte_offset_map(pmd, addr); + pte = *ptep; + if (pte_present(pte)) + page = pte_page(pte); + pte_unmap(ptep); + + return page; +} diff --git a/mm/numa_replication.c b/mm/numa_replication.c new file mode 100644 index 0000000000000..4bd5b75188bac --- /dev/null +++ b/mm/numa_replication.c @@ -0,0 +1,681 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/numa_replication.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> +#include <linux/hugetlb.h> +#include <linux/kobject.h> +#include <linux/debugfs.h> + +#include <asm/sections.h> +#include <asm/tlbflush.h> + +#define KERNEL_TEXT_START ((unsigned long)&_stext) +#define KERNEL_TEXT_END ((unsigned long)&_etext) + +#define KERNEL_RODATA_START ((unsigned long)&__start_rodata) +#define KERNEL_RODATA_END ((unsigned long)&__end_rodata) + +#define PMD_ALLOC_ORDER (PMD_SHIFT-PAGE_SHIFT) +#define PAGES_PER_PMD (1 << PMD_ALLOC_ORDER) + +#define replication_log(data, fmt, args...) \ +({ \ + if (data && data->m) \ + seq_printf(data->m, fmt, ##args); \ + else \ + pr_info(KERN_CONT fmt, ##args); \ +}) + +struct numa_node_desc { + pgd_t *pgd; + void *text_vaddr; + void *rodata_vaddr; +}; + +static struct numa_node_desc __initdata_or_module node_desc[MAX_NUMNODES]; + +struct dump_data { + struct seq_file *m; +}; + +struct dump_config { + int pgd_extra_info:1; + int p4d_extra_info:1; + int pud_extra_info:1; + int pmd_extra_info:1; + int pte_extra_info:1; + struct dump_data *data; +}; + +static bool text_replicated; +/* + * The first ready NUMA node, used as a source node + * for kernel text and rodata replication + */ +static unsigned int master_node = INT_MAX; +/* + * The case when machine has memoryless nodes is rare + * but possible. To handle memoryless nodes properly + * kernel replication maintains mapping node -> node with memory + * for all NUMA nodes. 
+ */ +static int node_to_memory_node[MAX_NUMNODES]; + +static bool pgtables_extra; +static DEFINE_SPINLOCK(debugfs_lock); + +bool is_text_replicated(void) +{ + return text_replicated; +} + +static void binary_dump(struct dump_data *data, unsigned long value) +{ + int i; + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "%-9d", i); + } + replication_log(data, "%d\n", 0); + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "|"); + + replication_log(data, "%d", (1UL << i) & value ? 1 : 0); + } + replication_log(data, "|\n"); +} + +static int pgd_callback(pgd_t *pgd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pgd_val(*pgd); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PGDIR_MASK; + next = (addr & PGDIR_MASK) - 1 + PGDIR_SIZE; + + replication_log(c->data, + "PGD ADDR: 0x%p PGD VAL: 0x%016lx [%p --- %p]\n", + pgd, val, (void *)addr, (void *)next); + + if (c->pgd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int p4d_callback(p4d_t *p4d, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = p4d_val(*p4d); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & P4D_MASK; + next = (addr & P4D_MASK) - 1 + P4D_SIZE; + + replication_log(c->data, + "P4D ADDR: 0x%p P4D VAL: 0x%016lx [%p --- %p]\n", + p4d, val, (void *)addr, (void *)next); + + if (c->p4d_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pud_callback(pud_t *pud, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pud_val(*pud); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PUD_MASK; + next = (addr & PUD_MASK) - 1 + PUD_SIZE; + + replication_log(c->data, + "PUD ADDR: 0x%p PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", + pud, val, pud_huge(*pud), (void *)addr, (void *)next); + + if (c->pud_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pmd_callback(pmd_t *pmd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pmd_val(*pmd); + unsigned long paddr = pmd_pfn(*pmd) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PMD_MASK; + next = (addr & PMD_MASK) - 1 + PMD_SIZE; + + replication_log(c->data, + "PMD ADDR: 0x%p PMD VAL: 0x%016lx huge(%d) [%p --- %p] to %p\n", + pmd, val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); + + if (c->pmd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_callback(pte_t *pte, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pte_val(*pte); + unsigned long paddr = pte_pfn(*pte) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PAGE_MASK; + next = (addr & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(c->data, + "PTE ADDR: 0x%p PTE VAL: 0x%016lx [%p --- %p] to %p\n", + pte, val, (void *)addr, (void *)next, (void *)paddr); + + if (c->pte_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_hole_callback(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + struct dump_config *c = (struct dump_config *)walk->private; 
+ + replication_log(c->data, "%*chole\n", depth * 2, ' '); + + return 0; +} + +static void dump_pgtables(struct mm_struct *mm, + struct dump_data *data, + unsigned long start, unsigned long end) +{ + int nid = 0; + int extra = pgtables_extra ? 1 : 0; + struct dump_config conf = { + .pgd_extra_info = extra, + .p4d_extra_info = extra, + .pud_extra_info = extra, + .pmd_extra_info = extra, + .pte_extra_info = extra, + .data = data, + }; + + const struct mm_walk_ops ops = { + .pgd_entry = pgd_callback, + .p4d_entry = p4d_callback, + .pud_entry = pud_callback, + .pmd_entry = pmd_callback, + .pte_entry = pte_callback, + .pte_hole = pte_hole_callback + }; + + BUG_ON(data && data->m == NULL); + + start = start & PAGE_MASK; + end = (end & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); + mmap_read_lock(mm); + for_each_memory_node(nid) { + replication_log(data, "NUMA node id #%d\n", nid); + replication_log(data, "PGD: %p PGD phys: %p\n", + mm->pgd_numa[nid], (void *)virt_to_phys(mm->pgd_numa[nid])); + walk_page_range_novma(mm, start, end, &ops, mm->pgd_numa[nid], &conf); + } + mmap_read_unlock(mm); + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); +} + +static void dump_kernel_pgtables(struct dump_data *data, + unsigned long start, unsigned long end) +{ + dump_pgtables(&init_mm, data, start, end); +} + +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + dump_pgtables(mm, NULL, start, end); +} + +static void cpu_dump(void *info) +{ + struct dump_data *data = (struct dump_data *)info; + + spin_lock(&debugfs_lock); + numa_cpu_dump(data->m); + spin_unlock(&debugfs_lock); +} + +static int stats_show(struct seq_file *m, void *v) +{ + int cpu; + struct dump_data data = { + .m = m, + }; + + for_each_online_cpu(cpu) + smp_call_function_single(cpu, cpu_dump, &data, 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(stats); + +static int pgtables_show(struct seq_file *m, void *v) +{ + struct dump_data data = { + .m = m, + }; + + dump_kernel_pgtables(&data, + KERNEL_TEXT_START, KERNEL_RODATA_END - 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(pgtables); + +void debugfs_init(void) +{ + struct dentry *dir; + static struct dentry *debugfs_dir; + + debugfs_dir = debugfs_create_dir("numa_replication", NULL); + if (IS_ERR(debugfs_dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication: %ld\n", + PTR_ERR(debugfs_dir)); + return; + } + dir = debugfs_create_file("stats", 0400, debugfs_dir, + NULL, &stats_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication stats: %ld\n", + PTR_ERR(dir)); + return; + } + + dir = debugfs_create_file("pgtables_kernel", 0400, debugfs_dir, + NULL, &pgtables_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication pgtables: %ld\n", + PTR_ERR(dir)); + return; + } + + debugfs_create_bool("pgtables_kernel_extra", 0600, debugfs_dir, + &pgtables_extra); +} + +/* + * The case, when machine has memoryless NUMA nodes + * should be handled in a special way. To do this we + * create node<->memory mapping to have an information + * about the node with memory that memoryless node can use. 
+ */ +static void init_node_to_memory_mapping(void) +{ + int nid; + + for_each_online_node(nid) { + int memory_nid; + int min_dist = INT_MAX; + + node_to_memory_node[nid] = nid; + for_each_memory_node(memory_nid) { + int dist = node_distance(nid, memory_nid); + + if (dist < min_dist) { + min_dist = dist; + node_to_memory_node[nid] = memory_nid; + } + } + pr_info("For node %d memory is on the node - %d\n", + nid, node_to_memory_node[nid]); + } +} + +int numa_get_memory_node(int nid) +{ + return node_to_memory_node[nid]; +} + +/* + * This function creates a replica of a particular memory + * area and installs the replicated memory into the + * translation table of the required NUMA node. + */ +static void replicate_memory(void *dst, unsigned long start, unsigned long end, int nid) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pgprot_t prot; + unsigned int nr_pmd = 0; + unsigned long vaddr = start; + struct page *pages = virt_to_page(dst); + + memcpy(dst, lm_alias(start), end - start); + for (; vaddr < end; vaddr += PMD_SIZE, nr_pmd++) { + pgd = pgd_offset_pgd(node_desc[nid].pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + + prot = pmd_pgprot(*pmd); + + set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + nr_pmd * PAGES_PER_PMD, prot)); + } +} + +static void __init replicate_kernel_text(int nid) +{ + replicate_memory(node_desc[nid].text_vaddr, + KERNEL_TEXT_START, KERNEL_TEXT_END, nid); + numa_sync_text_replicas((unsigned long)node_desc[nid].text_vaddr, + (unsigned long)node_desc[nid].text_vaddr + (KERNEL_TEXT_END - KERNEL_TEXT_START)); +} + +static void replicate_kernel_rodata(int nid) +{ + replicate_memory(node_desc[nid].rodata_vaddr, + KERNEL_RODATA_START, KERNEL_RODATA_END, nid); +} + +//'-1' in the next functions has only one purpose - to prevent unsigned long overflow +static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PUD_MASK; + unsigned long right = (end & PUD_MASK) - 1 + PUD_SIZE; + unsigned long addr; + + pud_t *clone_pud = pud_offset(dst, left); + pud_t *orig_pud = pud_offset(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += PUD_SIZE) { + pmd_t *new_pmd; + + if (pud_none(*orig_pud) || pud_huge(*orig_pud)) + goto skip; + + pud_clear(clone_pud); + new_pmd = pmd_alloc_node(nid, &init_mm, clone_pud, addr); + BUG_ON(new_pmd == NULL); + + copy_page(pud_pgtable(*clone_pud), pud_pgtable(*orig_pud)); +skip: + clone_pud++; + orig_pud++; + } +} + +static void replicate_pgt_pud(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & P4D_MASK; + unsigned long right = (end & P4D_MASK) - 1 + P4D_SIZE; + unsigned long addr; + + p4d_t *clone_p4d = p4d_offset(dst, left); + p4d_t *orig_p4d = p4d_offset(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += P4D_SIZE) { + pud_t *new_pud; + + if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d)) + goto skip; + + p4d_clear(clone_p4d); + new_pud = pud_alloc_node(nid, &init_mm, clone_p4d, addr); + BUG_ON(new_pud == NULL); + + copy_page(p4d_pgtable(*clone_p4d), p4d_pgtable(*orig_p4d)); + /* + * start and end passed to the next function must be in + * range of p4ds, so min and max are used here + */ + replicate_pgt_pmd(clone_p4d, orig_p4d, max(addr, start), + min(addr - 1 + P4D_SIZE, end), nid); +skip: + clone_p4d++; + orig_p4d++; + } +} + +static void replicate_pgt_p4d(pgd_t *dst, pgd_t *src, + unsigned long start,
unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PGDIR_MASK; + unsigned long right = (end & PGDIR_MASK) - 1 + PGDIR_SIZE; + unsigned long addr; + + pgd_t *clone_pgd = pgd_offset_pgd(dst, left); + pgd_t *orig_pgd = pgd_offset_pgd(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += PGDIR_SIZE) { + p4d_t *new_p4d; + + /* TODO: remove last condition and do something better + * In the case of a folded P4D level, pgd_none and pgd_huge + * always return 0, so we might start to replicate empty entries. + * We obviously want to avoid this, so the last check is performed here. + */ + if (pgd_none(*orig_pgd) || pgd_huge(*orig_pgd) || + (unsigned long)(orig_pgd->pgd) == 0) + goto skip; + + pgd_clear(clone_pgd); + new_p4d = p4d_alloc_node(nid, &init_mm, clone_pgd, addr); + BUG_ON(new_p4d == NULL); + + copy_page((void *)pgd_page_vaddr(*clone_pgd), + (void *)pgd_page_vaddr(*orig_pgd)); + replicate_pgt_pud(clone_pgd, orig_pgd, max(addr, start), + min(addr - 1 + PGDIR_SIZE, end), nid); +skip: + clone_pgd++; + orig_pgd++; + } +} + +static void replicate_pgt(int nid, unsigned long start, unsigned long end) +{ + replicate_pgt_p4d(node_desc[nid].pgd, init_mm.pgd, start, end, nid); +} + +/* + * Page table replication works as follows: first the + * pgd level is replicated, and then the lower levels + * are replicated on top of it. Only the part of the + * page table that covers kernel text and rodata is + * replicated, along with the upper-level entries that + * lead to it. As a result, the pgd, p4d, pud and pmd + * layers are touched by replication, i.e. the page + * table sub-tree that covers kernel text and rodata. + */ +static void replicate_pgtables(void) +{ + int nid; + + init_mm.pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + BUG_ON(!init_mm.pgd_numa); + + for_each_memory_node(nid) { + node_desc[nid].pgd = numa_replicate_pgt_pgd(nid); + replicate_pgt(nid, PAGE_TABLE_REPLICATION_LEFT, + PAGE_TABLE_REPLICATION_RIGHT); + } + + init_mm.pgd = node_desc[numa_get_memory_node(0)].pgd; + + for_each_online_node(nid) { + int memory_nid = numa_get_memory_node(nid); + + init_mm.pgd_numa[nid] = node_desc[memory_nid].pgd; + } + + init_mm.pgd = init_mm.pgd_numa[first_memory_node]; +} + +/* + * Kernel text replication includes three steps: + * 1. page table replication for init_mm; + * 2. replication of the kernel text pages and the + * corresponding page table updates; + * 3. setup of the page table related to the + * current NUMA node on the current cpu; the + * other cpus switch to their per-node page + * tables later, during cpu initialization. + * Master node - the first NUMA node, used as + * a source for replicas. Memory for the master + * node is expected to be already local.
+ */ +void __init numa_replicate_kernel_text(void) +{ + int nid; + + replicate_pgtables(); + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_text(nid); + } + + text_replicated = true; + numa_setup_pgd(); +} + +void numa_replicate_kernel_rodata(void) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_rodata(nid); + } + + flush_tlb_all(); +} + +void numa_setup_pgd(void) +{ + numa_load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]); +} + +void __init_or_module *numa_get_replica(void *vaddr, int nid) +{ + unsigned long addr = (unsigned long)vaddr; + unsigned long offset = addr - KERNEL_TEXT_START; + + BUG_ON(addr < KERNEL_TEXT_START || addr >= KERNEL_TEXT_END); + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(numa_get_memory_node(nid) != nid); + + return node_desc[nid].text_vaddr + offset; +} + +nodemask_t __read_mostly replica_nodes = { { [0] = 1UL } }; + +void __init numa_replication_init(void) +{ + int nid; + + nodes_clear(replica_nodes); + + for_each_node_state(nid, N_MEMORY) { + __node_set(nid, &replica_nodes); + } + + for_each_memory_node(nid) + pr_info("Memory node: %d\n", nid); + + init_node_to_memory_mapping(); + master_node = page_to_nid(virt_to_page(lm_alias((void *)KERNEL_TEXT_START))); + + pr_info("Master Node: #%d\n", master_node); + for_each_memory_node(nid) { + if (nid == master_node) { + node_desc[nid].text_vaddr = lm_alias((void *)KERNEL_TEXT_START); + node_desc[nid].rodata_vaddr = lm_alias((void *)KERNEL_RODATA_START); + } else { + node_desc[nid].text_vaddr = memblock_alloc_try_nid( + (KERNEL_TEXT_END - KERNEL_TEXT_START), + HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + + node_desc[nid].rodata_vaddr = memblock_alloc_try_nid( + (KERNEL_RODATA_END - KERNEL_RODATA_START), + HPAGE_SIZE, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + } + + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(node_desc[nid].rodata_vaddr == NULL); + } +} + +void numa_replication_fini(void) +{ + int nid; + + /* + * Clear linear address space info + */ + for_each_memory_node(nid) { + node_desc[nid].text_vaddr = NULL; + node_desc[nid].rodata_vaddr = NULL; + } + + debugfs_init(); + + pr_info("Replicated page table : [%p --- %p]\n", + (void *)PAGE_TABLE_REPLICATION_LEFT, + (void *)PAGE_TABLE_REPLICATION_RIGHT); + + dump_kernel_pgtables(NULL, KERNEL_TEXT_START, KERNEL_RODATA_END - 1); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cf3cd1d028b7..3bdc6aa73c7c2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5471,6 +5471,26 @@ static inline void free_the_page(struct page *page, unsigned int order) __free_pages_ok(page, order, FPI_NONE); } +#ifdef CONFIG_KERNEL_REPLICATION +unsigned long __get_free_pages_node(unsigned int nid, gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages_node(nid, gfp_mask & ~__GFP_HIGHMEM, order); + if (!page) + return 0; + return (unsigned long) page_address(page); +} +EXPORT_SYMBOL(__get_free_pages_node); + +unsigned long get_zeroed_page_node(unsigned int nid, gfp_t gfp_mask) +{ + return __get_free_pages_node(nid, gfp_mask | __GFP_ZERO, 0); +} +EXPORT_SYMBOL(get_zeroed_page_node); +#endif /* CONFIG_KERNEL_REPLICATION */ + void __free_pages(struct page *page, unsigned int order) { /* get PageHead before we drop reference */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4a2c6ce0ad568..01bfe4131a53a 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -10,6 +10,7 @@ #include <linux/vmalloc.h> #include <linux/mm.h> +#include <linux/numa_replication.h> 
#include <linux/module.h> #include <linux/highmem.h> #include <linux/sched/signal.h> @@ -400,29 +401,26 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, } while (p4d++, addr = next, addr != end); } -/** - * unmap_kernel_range_noflush - unmap kernel VM area - * @start: start of the VM area to unmap - * @size: size of the VM area to unmap - * - * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify - * should have been allocated using get_vm_area() and its friends. +/* + * vunmap_range_noflush_pgd is similar to vunmap_range, but does not + * flush caches or TLBs, and is able to work at pgd granularity. * - * NOTE: - * This function does NOT do any cache flushing. The caller is responsible - * for calling flush_cache_vunmap() on to-be-mapped areas before calling this - * function and flush_tlb_kernel_range() after. + * The caller is responsible for calling flush_cache_vunmap() before calling + * this function, and flush_tlb_kernel_range after it has returned + * successfully (and before the addresses are expected to cause a page fault + * or be re-mapped for something else, if TLB flushes are being delayed or + * coalesced). */ -void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +static void vunmap_range_noflush_pgd(pgd_t *pgtable, + unsigned long start, unsigned long end) { - unsigned long end = start + size; unsigned long next; pgd_t *pgd; unsigned long addr = start; pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) @@ -436,6 +434,47 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size) arch_sync_kernel_mappings(start, end); } +/* + * __unmap_kernel_range_noflush() is similar to vunmap_range_noflush_pgd(), + * but works only on init_mm.pgd. + * + * This is an internal function only. Do not use outside mm/. + */ +void __unmap_kernel_range_noflush(unsigned long start, unsigned long end) +{ + vunmap_range_noflush_pgd(init_mm.pgd, start, end); +} + +/** + * unmap_kernel_range_noflush - unmap kernel VM area + * @start: start of the VM area to unmap + * @size: size of the VM area to unmap + * + * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size specify + * should have been allocated using get_vm_area() and its friends. + * + * NOTE: + * This function does NOT do any cache flushing. The caller is responsible + * for calling flush_cache_vunmap() on to-be-mapped areas before calling this + * function and flush_tlb_kernel_range() after.
+ */ +void unmap_kernel_range_noflush(unsigned long start, unsigned long size) +{ + __unmap_kernel_range_noflush(start, start + size); +} + +#if defined(CONFIG_KERNEL_REPLICATION) && defined(CONFIG_ARM64) +void vunmap_range_replicas(unsigned long addr, unsigned long end) +{ + int nid; + + flush_cache_vunmap(addr, end); + for_each_memory_node(nid) + vunmap_range_noflush_pgd(init_mm.pgd_numa[nid], addr, end); + flush_tlb_kernel_range(addr, end); +} +#endif /*CONFIG_KERNEL_REPLICATION*/ + static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, int *nr, pgtbl_mod_mask *mask) @@ -518,7 +557,8 @@ static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr, return 0; } -static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, +static int vmap_small_pages_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages) { unsigned long start = addr; @@ -529,12 +569,13 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, pgtbl_mod_mask mask = 0; BUG_ON(addr >= end); - pgd = pgd_offset_k(addr); + pgd = pgd_offset_pgd(pgtable, addr); do { next = pgd_addr_end(addr, end); if (pgd_bad(*pgd)) mask |= PGTBL_PGD_MODIFIED; - err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask); + err = vmap_pages_p4d_range(pgd, addr, next, + prot, pages, &nr, &mask); if (err) return err; } while (pgd++, addr = next, addr != end); @@ -545,8 +586,49 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } -static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, - pgprot_t prot, struct page **pages, unsigned int page_shift) +static int vmap_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int max_page_shift) +{ + pgd_t *pgd; + unsigned long start; + unsigned long next; + int err; + pgtbl_mod_mask mask = 0; + + might_sleep(); + BUG_ON(addr >= end); + + start = addr; + pgd = pgd_offset_pgd(pgtable, addr); + do { + next = pgd_addr_end(addr, end); + err = vmap_p4d_range(pgd, addr, next, phys_addr, prot, + max_page_shift, &mask); + if (err) + break; + } while (pgd++, phys_addr += (next - addr), addr = next, addr != end); + + if (mask & ARCH_PAGE_TABLE_SYNC_MASK) + arch_sync_kernel_mappings(start, end); + + return err; +} + +/* + * vmap_pages_range_noflush_pgd is similar to vmap_pages_range, but does not + * flush caches. + * + * The caller is responsible for calling flush_cache_vmap() after this + * function returns successfully and before the addresses are accessed. + * + * This is an internal function only. Do not use outside mm/. 
+ */ +static int vmap_pages_range_noflush_pgd(pgd_t *pgtable, + unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -556,13 +638,14 @@ static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) || page_shift == PAGE_SHIFT) - return vmap_small_pages_range_noflush(addr, end, prot, pages); + return vmap_small_pages_range_noflush_pgd(pgtable, addr, end, + prot, pages); for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) { int err; - err = vmap_range_noflush(addr, addr + (1UL << page_shift), - __pa(page_address(pages[i])), prot, + err = vmap_range_noflush_pgd(pgtable, addr, addr + (1UL << page_shift), + page_to_phys(pages[i]), prot, page_shift); if (err) return err; @@ -573,14 +656,10 @@ static int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } -static int vmap_pages_range(unsigned long addr, unsigned long end, +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { - int err; - - err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift); - flush_cache_vmap(addr, end); - return err; + return vmap_pages_range_noflush_pgd(init_mm.pgd, addr, end, prot, pages, page_shift); } #ifdef CONFIG_EXTEND_HUGEPAGE_MAPPING @@ -674,58 +753,12 @@ int is_vmalloc_or_module_addr(const void *x) */ struct page *vmalloc_to_page(const void *vmalloc_addr) { - unsigned long addr = (unsigned long) vmalloc_addr; - struct page *page = NULL; - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for * architectures that do not vmalloc module space */ VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); - - if (pgd_none(*pgd)) - return NULL; - if (WARN_ON_ONCE(pgd_leaf(*pgd))) - return NULL; /* XXX: no allowance for huge pgd */ - if (WARN_ON_ONCE(pgd_bad(*pgd))) - return NULL; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) - return NULL; - if (p4d_leaf(*p4d)) - return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(p4d_bad(*p4d))) - return NULL; - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) - return NULL; - if (pud_leaf(*pud)) - return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pud_bad(*pud))) - return NULL; - - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) - return NULL; - if (pmd_leaf(*pmd)) - return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); - if (WARN_ON_ONCE(pmd_bad(*pmd))) - return NULL; - - ptep = pte_offset_map(pmd, addr); - pte = *ptep; - if (pte_present(pte)) - page = pte_page(pte); - pte_unmap(ptep); - - return page; + return walk_to_page_node(0, vmalloc_addr); } EXPORT_SYMBOL(vmalloc_to_page); @@ -1784,7 +1817,22 @@ static void free_vmap_area_noflush(struct vmap_area *va) static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica((void *)va->va_start)) { + int node; + /** + * In some scenarios we might clear + * empty entries here, which is totally fine + */ + for_each_memory_node(node) + vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + va->va_start, va->va_end); + } else { + unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); + } +#else unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start); +#endif 
/* CONFIG_KERNEL_REPLICATION */ if (debug_pagealloc_enabled_static()) flush_tlb_kernel_range(va->va_start, va->va_end); @@ -2559,18 +2607,74 @@ struct vm_struct *remove_vm_area(const void *addr) return NULL; } +#ifdef CONFIG_KERNEL_REPLICATION +static inline void set_direct_map_page_replicas(const struct vm_struct *area, + struct page *page, + int (*set_direct_map)(struct page *page)) +{ + if (area->replicated) { + struct page *cursor; + + list_for_each_entry(cursor, &page->lru, lru) { + if (page_address(cursor)) + set_direct_map(cursor); + } + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static inline void set_area_direct_map(const struct vm_struct *area, int (*set_direct_map)(struct page *page)) { int i; /* HUGE_VMALLOC passes small pages to set_direct_map */ - for (i = 0; i < area->nr_pages; i++) + for (i = 0; i < area->nr_pages; i++) { if (page_address(area->pages[i])) set_direct_map(area->pages[i]); +#ifdef CONFIG_KERNEL_REPLICATION + set_direct_map_page_replicas(area, + area->pages[i], set_direct_map); +#endif /* CONFIG_KERNEL_REPLICATION */ + } } -/* Handle removing and resetting vm mappings related to the vm_struct. */ +#ifdef CONFIG_KERNEL_REPLICATION +static void vm_account_replicated_range(struct vm_struct *area, + struct page *page, + unsigned long *s, + unsigned long *e, + int *flush) +{ + int flush_dmap = 0; + unsigned long start = ULONG_MAX, end = 0; + unsigned int page_order = vm_area_page_order(area); + + if (area->replicated) { + struct page *cursor; + + list_for_each_entry(cursor, &page->lru, lru) { + unsigned long addr = (unsigned long)page_address(cursor); + + if (addr) { + unsigned long page_size; + + page_size = PAGE_SIZE << page_order; + start = min(addr, start); + end = max(addr + page_size, end); + flush_dmap = 1; + } + } + } + + if (flush_dmap) + *flush = flush_dmap; + + *s = start; + *e = end; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) { unsigned long start = ULONG_MAX, end = 0; @@ -2595,12 +2699,12 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) } /* - * If execution gets here, flush the vm mapping and reset the direct - * map. Find the start and end range of the direct mappings to make sure + * Find the start and end range of the direct mappings to make sure that * the vm_unmap_aliases() flush includes the direct map. 
*/ for (i = 0; i < area->nr_pages; i += 1U << page_order) { unsigned long addr = (unsigned long)page_address(area->pages[i]); + if (addr) { unsigned long page_size; @@ -2609,6 +2713,10 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) end = max(addr + page_size, end); flush_dmap = 1; } +#ifdef CONFIG_KERNEL_REPLICATION + vm_account_replicated_range(area, area->pages[i], + &start, &end, &flush_dmap); +#endif /* CONFIG_KERNEL_REPLICATION */ } /* @@ -2621,6 +2729,22 @@ static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages) set_area_direct_map(area, set_direct_map_default_noflush); } +#ifdef CONFIG_KERNEL_REPLICATION +static void vfree_page_replicas(struct vm_struct *area, struct page *page, unsigned int page_order) +{ + if (area->replicated) { + struct page *cursor, *tmp; + + list_for_each_entry_safe(cursor, tmp, &page->lru, lru) { + BUG_ON(!cursor); + + list_del(&cursor->lru); + __free_pages(cursor, page_order); + } + } +} +#endif /* CONFIG_KERNEL_REPLICATION */ + static void __vunmap(const void *addr, int deallocate_pages) { struct vm_struct *area; @@ -2659,6 +2783,9 @@ static void __vunmap(const void *addr, int deallocate_pages) for (i = 0; i < area->nr_pages; i += 1U << page_order) { struct page *page = area->pages[i]; +#ifdef CONFIG_KERNEL_REPLICATION + vfree_page_replicas(area, page, page_order); +#endif /* CONFIG_KERNEL_REPLICATION */ BUG_ON(!page); __free_pages(page, page_order); } @@ -2896,6 +3023,107 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot) EXPORT_SYMBOL_GPL(vmap_pfn); #endif /* CONFIG_VMAP_PFN */ +static inline unsigned int +vm_area_alloc_pages(gfp_t gfp, int nid, + unsigned int order, unsigned int nr_pages, struct page **pages) +{ + unsigned int nr_allocated = 0; + struct page *page; + int i; + + + while (nr_allocated < nr_pages) { + if (fatal_signal_pending(current)) + break; + + if (nid == NUMA_NO_NODE) + page = alloc_pages(gfp, order); + else + page = alloc_pages_node(nid, gfp, order); + if (unlikely(!page)) + break; + /* + * Higher order allocations must be able to be treated as + * indepdenent small pages by callers (as they can with + * small-page vmallocs). Some drivers do their own refcounting + * on vmalloc_to_page() pages, some use page->mapping, + * page->lru, etc. + */ + if (order) + split_page(page, order); + + /* + * Careful, we allocate and map page-order pages, but + * tracking is done per PAGE_SIZE page so as to keep the + * vm_struct APIs independent of the physical/mapped size. 
+ */ + for (i = 0; i < (1U << order); i++) + pages[nr_allocated + i] = page + i; + + if (gfpflags_allow_blocking(gfp)) + cond_resched(); + + nr_allocated += 1U << order; + } + + return nr_allocated; +} + +static int vmalloc_map_area_pages_pgd(unsigned long addr, + struct page **pages, unsigned long size, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift, pgd_t *pgd) +{ + int ret = 0; + bool nofail = gfp_mask & __GFP_NOFAIL; + + do { + + ret = vmap_pages_range_noflush_pgd(pgd, addr, addr + size, + prot, pages, page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if (ret < 0) { + warn_alloc(gfp_mask, NULL, + "vmalloc error: size %lu, failed to map pages", + size); + } + + return ret; +} + +static int vmalloc_map_area_pages(unsigned long addr, unsigned long size, + struct vm_struct *area, + gfp_t gfp_mask, pgprot_t prot, + unsigned int page_shift) +{ + int ret; +#ifdef CONFIG_KERNEL_REPLICATION + int nid; + + if (area->flags & VM_NUMA_SHARED) { + for_each_memory_node(nid) { + pgd_t *pgd = per_node_pgd(&init_mm, nid); + + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, pgd); + if (ret) + return ret; + } + } else { + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, init_mm.pgd); + } +#else + ret = vmalloc_map_area_pages_pgd(addr, area->pages, size, + gfp_mask, prot, page_shift, init_mm.pgd); +#endif /* CONFIG_KERNEL_REPLICATION */ + return ret; +} + + static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, unsigned int page_shift, int node) @@ -2906,77 +3134,54 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; - struct page **pages; - unsigned int i; + int ret = 0; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) gfp_mask |= __GFP_HIGHMEM; - /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, area->caller); } else { - pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node(array_size, nested_gfp, node); } - if (!pages) { - free_vm_area(area); + if (!area->pages) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "page array size %lu allocation failed", - nr_small_pages * PAGE_SIZE, array_size); - return NULL; + "vmalloc error: size %lu, failed to allocated page array size %lu", + nr_small_pages * PAGE_SIZE, array_size); + goto fail; } - area->pages = pages; - area->nr_pages = nr_small_pages; set_vm_area_page_order(area, page_shift - PAGE_SHIFT); - page_order = vm_area_page_order(area); - /* - * Careful, we allocate and map page_order pages, but tracking is done - * per PAGE_SIZE page so as to keep the vm_struct APIs independent of - * the physical/mapped size. 
- */ - for (i = 0; i < area->nr_pages; i += 1U << page_order) { - struct page *page; - int p; - - /* Compound pages required for remap_vmalloc_page */ - page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order); - if (unlikely(!page)) { - /* Successfully allocated i pages, free them in __vfree() */ - area->nr_pages = i; - atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "page order %u allocation failed", - area->nr_pages * PAGE_SIZE, page_order); - goto fail; - } + area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, nr_small_pages, area->pages); - for (p = 0; p < (1U << page_order); p++) - area->pages[i + p] = page + p; - - if (gfpflags_allow_blocking(gfp_mask)) - cond_resched(); - } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (vmap_pages_range(addr, addr + size, prot, pages, page_shift) < 0) { + /* + * If not enough pages were obtained to accomplish an + * allocation request, free them via __vfree() if any. + */ + if (area->nr_pages != nr_small_pages) { warn_alloc(gfp_mask, NULL, - "vmalloc size %lu allocation failure: " - "failed to map pages", - area->nr_pages * PAGE_SIZE); + "vmalloc error: size %lu, page order %u, failed to allocate pages", + area->nr_pages * PAGE_SIZE, page_order); goto fail; } - return area->addr; + flush_cache_vmap(addr, addr + size); + + ret = vmalloc_map_area_pages(addr, size, area, gfp_mask, prot, page_shift); + if (ret) + goto fail; + flush_cache_vmap(addr, addr + size); + return area->addr; fail: __vfree(area->addr); return NULL; @@ -3006,7 +3211,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller) { struct vm_struct *area; - void *addr; + void *ret; unsigned long real_size = size; unsigned long real_align = align; unsigned int shift = PAGE_SHIFT; @@ -3057,8 +3262,14 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, goto fail; } - addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node); - if (!addr) +#ifdef CONFIG_KERNEL_REPLICATION + if (numa_addr_has_replica(area->addr)) + area->flags |= VM_NUMA_SHARED; + area->node = node; +#endif + /* Allocate physical pages and map them into vmalloc space. */ + ret = __vmalloc_area_node(area, gfp_mask, prot, shift, node); + if (!ret) goto fail; /* @@ -3067,11 +3278,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * Now, it is fully initialized, so remove this flag here. 
*/ clear_vm_uninitialized_flag(area); - size = PAGE_ALIGN(size); kmemleak_vmalloc(area, size, gfp_mask); - return addr; + return area->addr; fail: #ifdef CONFIG_EXTEND_HUGEPAGE_MAPPING @@ -3115,6 +3325,124 @@ void *__vmalloc_node(unsigned long size, unsigned long align, return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } + +#ifdef CONFIG_KERNEL_REPLICATION +static void numa_replicate_page_range(struct page **src, struct page **dst, int nr_pages) +{ + int i; + void *from, *to; + + for (i = 0; i < nr_pages; i++) { + from = kmap(src[i]); + to = kmap(dst[i]); + + copy_page(to, from); + + kunmap(src[i]); + kunmap(dst[i]); + } +} + +int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags) +{ + int i, ret, node = 0; + struct vm_struct *area; + unsigned int page_order; + unsigned int nr_allocated; + struct page **pages; + unsigned long area_start, area_end; + const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; + unsigned long array_size; + + gfp_mask |= __GFP_NOWARN; + if (!(gfp_mask & (GFP_DMA | GFP_DMA32))) + gfp_mask |= __GFP_HIGHMEM; + + if (unlikely(!numa_addr_has_replica(addr))) + return -EINVAL; + + area = find_vm_area(addr); + if (unlikely(!area)) + return -ENOENT; + + if (area->node == NUMA_NO_NODE) + return -EINVAL; + + array_size = sizeof(struct page *) * area->nr_pages; + if (array_size > PAGE_SIZE) + pages = __vmalloc(array_size, nested_gfp); + else + pages = kmalloc(array_size, nested_gfp); + + if (!pages) + return -ENOMEM; + + page_order = vm_area_page_order(area); + for (i = 0; i < area->nr_pages; i++) + INIT_LIST_HEAD(&area->pages[i]->lru); + + area_start = (unsigned long)area->addr; + area_end = (unsigned long)(area->addr + area->nr_pages * PAGE_SIZE); + + for_each_memory_node(node) { + if (area->node == node) + continue; + + nr_allocated = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN, + node, page_order, area->nr_pages, pages); + if (nr_allocated != area->nr_pages) + goto fail_alloc_pages; + + for (i = 0; i < area->nr_pages; i++) + list_add(&pages[i]->lru, &area->pages[i]->lru); + + vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + area_start, area_end); + + /* + * We can't fail here (hopefully) + * Possible errors: not enough memory for tables and not empty entries. + * Both unrealistic because we just cleared entries in existed tables. + */ + ret = vmalloc_map_area_pages_pgd(area_start, pages, + nr_allocated * PAGE_SIZE, + gfp_mask, prot, PAGE_SHIFT, + per_node_pgd(&init_mm, node)); + if (ret != 0) + goto fail_map_pages; + + atomic_long_add(nr_allocated, &nr_vmalloc_pages); + numa_replicate_page_range(area->pages, pages, area->nr_pages); + + for (i = 0; i < area->nr_pages; i++) + pages[i] = NULL; + } + kvfree(pages); + + flush_tlb_kernel_range(area_start, area_end); + area->replicated = true; + + return 0; +fail_alloc_pages: + for (i = 0; i < nr_allocated; i++) + __free_pages(pages[i], 0); + +fail_map_pages: + kfree(pages); + for (i = 0; i < area->nr_pages; i++) { + struct page *page, *tmp; + + list_for_each_entry_safe(page, tmp, &area->pages[i]->lru, lru) { + list_del(&page->lru); + __free_pages(page, 0); + } + } + + return ret; +} +#endif /* CONFIG_KERNEL_REPLICATION */ + /* * This is only for performance analysis of vmalloc and stress purpose. * It is required by vmalloc test module, therefore do not use it other -- 2.34.1
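For reference, a minimal usage sketch (not part of the patch) of the replication interface added in mm/vmalloc.c. The helper name demo_alloc_replicated() and the read-only protection for the replicas are assumptions for illustration only; whether a given address actually falls inside the replicated window depends on the arch-defined PAGE_TABLE_REPLICATION_LEFT/RIGHT bounds, hence the numa_addr_has_replica() check, and the area needs an explicit home node because __vmalloc_node_replicate_range() rejects areas allocated with NUMA_NO_NODE.

#include <linux/gfp.h>
#include <linux/printk.h>
#include <linux/topology.h>
#include <linux/vmalloc.h>
#include <linux/numa_replication.h>

static void *demo_alloc_replicated(unsigned long size)
{
	/* Allocate with an explicit home node; plain vmalloc() records NUMA_NO_NODE. */
	void *buf = __vmalloc_node(size, 1, GFP_KERNEL, numa_node_id(),
				   __builtin_return_address(0));

	if (!buf)
		return NULL;

	/* ... fill @buf with data that will stay read-mostly ... */

	/*
	 * One-shot copy to every other memory node: later writes through
	 * @buf are not propagated to the per-node replicas.
	 */
	if (numa_addr_has_replica(buf) &&
	    __vmalloc_node_replicate_range(buf, GFP_KERNEL, PAGE_KERNEL_RO, 0))
		pr_warn("demo: replication failed, keeping a single copy\n");

	return buf;
}

Because __vmalloc_node_replicate_range() links the replica pages to the original ones through page->lru, a later vfree(buf) also releases the per-node copies (see vfree_page_replicas() above).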