 
            From: Nikita Panov <panov.nikita@huawei.com> kunpeng inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/IBOJU2 ------------------------------------------------- In the current design, modifications of mutable kernel data do not require synchronization between translation tables, because on 64-bit platforms all physical memory is already mapped in kernel space and this mapping is persistent. In user space, translation table synchronization is quite rare, because the only case that requires it is the allocation of a new PUD/P4D. At the current stage, only the PGD layer is replicated for user space. Please refer to the diagrams below. TT overview: NODE 0 NODE 1 USER KERNEL USER KERNEL --------------------- --------------------- PGD | | | | | | | | |*| | | | | | | | | |*| --------------------- --------------------- | | ------------------- ------------------- | | --------------------- --------------------- PUD | | | | | | | |*|*| | | | | | | | |*|*| --------------------- --------------------- | | ------------------- ------------------- | | --------------------- --------------------- PMD |READ-ONLY|MUTABLE | |READ-ONLY|MUTABLE | --------------------- --------------------- | | | | | -------------------------- | | | -------- ------- -------- PHYS | | | | | | MEM -------- ------- -------- <------> <------> NODE 0 Shared NODE 1 between nodes * - entries unique in each table TT synchronization: NODE 0 NODE 1 USER KERNEL USER KERNEL --------------------- --------------------- PGD | | |0| | | | | | | | | |0| | | | | | | --------------------- --------------------- | | | | | PUD_ALLOC / P4D_ALLOC | | IN USERSPACE | \/ --------------------- --------------------- PGD | | |p| | | | | | | | | |p| | | | | | | --------------------- --------------------- | | | | --------------------------- | --------------------- PUD/P4D | | | | | | | | | | --------------------- Acked-by: Alexander Grubnikov <alexander.grubnikov@huawei.com> Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com> 
Acked-by: Denis Darvish <darvish.denis@huawei.com> Co-developed-by: Artem Kuzin <artem.kuzin@huawei.com> Signed-off-by: Artem Kuzin <artem.kuzin@huawei.com> Co-developed-by: Nikita Panov <panov.nikita@huawei.com> Signed-off-by: Nikita Panov <panov.nikita@huawei.com> --- include/linux/mm_types.h | 4 + include/linux/numa_kernel_replication.h | 198 +++++++ mm/Makefile | 1 + mm/memory.c | 148 ++++- mm/numa_kernel_replication.c | 759 ++++++++++++++++++++++++ 5 files changed, 1095 insertions(+), 15 deletions(-) create mode 100644 include/linux/numa_kernel_replication.h create mode 100644 mm/numa_kernel_replication.c diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4b9a8723d3eb..6a0015a55211 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -981,7 +981,11 @@ struct mm_struct { #endif } __randomize_layout; +#ifdef CONFIG_KERNEL_REPLICATION + KABI_USE(1, pgd_t **pgd_numa) +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h new file mode 100644 index 000000000000..ee1ab0f111c7 --- /dev/null +++ b/include/linux/numa_kernel_replication.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef _LINUX_NUMA_REPLICATION_H +#define _LINUX_NUMA_REPLICATION_H + +#ifdef CONFIG_KERNEL_REPLICATION + +#include <linux/kabi.h> + +/* + * Why? Because linux is defined to 1 for some reason, + * and linux/mm.h converted to 1/mm.h. Perhaps compiler? + * Do not ask me, I have no idea. 
+ */ +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mm_types.h>) +#include KABI_HIDE_INCLUDE(<linux/nodemask.h>) +#include KABI_HIDE_INCLUDE(<linux/module.h>) +#include KABI_HIDE_INCLUDE(<linux/mm.h>) +#include KABI_HIDE_INCLUDE(<asm/numa_replication.h>) + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +typedef enum { + NONE = 0, + PMD_PROPAGATION = 1, + PUD_PROPAGATION = 2, + P4D_PROPAGATION = 3, + PGD_PROPAGATION = 4 +} propagation_level_t; + +extern nodemask_t replica_nodes; + +#define for_each_memory_node(nid) \ + for (nid = first_node(replica_nodes); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes)) + +#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()]) +#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid]) + +static inline bool numa_addr_has_replica(const void *addr) +{ + return ((unsigned long)addr >= PAGE_TABLE_REPLICATION_LEFT) && + ((unsigned long)addr <= PAGE_TABLE_REPLICATION_RIGHT); +} + +void __init numa_replication_init(void); +void __init numa_replicate_kernel_text(void); +void numa_replicate_kernel_rodata(void); +void numa_replication_fini(void); + +bool is_text_replicated(void); +propagation_level_t get_propagation_level(void); +void numa_setup_pgd(void); +void __init_or_module *numa_get_replica(void *vaddr, int nid); +int numa_get_memory_node(int nid); +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end); + +/* Macro to walk over mm->pgd_numa and cast it to appropriate level type */ +#define for_each_pgtable_replica(table, mm, replica, nid, offset) \ + for (nid = first_node(replica_nodes), offset = ((unsigned long)table) & (~PAGE_MASK), \ + replica = (typeof(table))(((unsigned long)mm->pgd_numa[nid]) + offset); \ + nid != MAX_NUMNODES; \ + nid = next_node(nid, replica_nodes), \ + replica = (typeof(table))(((unsigned long)mm->pgd_numa[nid]) + offset)) + +static inline void 
pgd_populate_replicated(struct mm_struct *mm, pgd_t *pgdp, p4d_t *p4dp) +{ + int nid; + pgd_t *curr_pgd; + unsigned long offset; + + if (get_propagation_level() == PGD_PROPAGATION) { + for_each_pgtable_replica(pgdp, mm, curr_pgd, nid, offset) { + pgd_populate(mm, curr_pgd, p4dp); + } + } else { + pgd_populate(mm, pgdp, p4dp); + } +} + +static inline void p4d_populate_replicated(struct mm_struct *mm, p4d_t *p4dp, pud_t *pudp) +{ + int nid; + p4d_t *curr_p4d; + unsigned long offset; + + if (get_propagation_level() == P4D_PROPAGATION) { + for_each_pgtable_replica(p4dp, mm, curr_p4d, nid, offset) { + p4d_populate(mm, curr_p4d, pudp); + } + } else { + p4d_populate(mm, p4dp, pudp); + } +} + +static inline void pud_populate_replicated(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) +{ + int nid; + pud_t *curr_pud; + unsigned long offset; + + if (get_propagation_level() == PUD_PROPAGATION) { + for_each_pgtable_replica(pudp, mm, curr_pud, nid, offset) { + pud_populate(mm, curr_pud, pmdp); + } + } else { + pud_populate(mm, pudp, pmdp); + } +} + +static inline void pmd_populate_replicated(struct mm_struct *mm, pmd_t *pmdp, pgtable_t ptep) +{ + int nid; + pmd_t *curr_pmd; + unsigned long offset; + + if (get_propagation_level() == PMD_PROPAGATION) { + for_each_pgtable_replica(pmdp, mm, curr_pmd, nid, offset) { + pmd_populate(mm, curr_pmd, ptep); + } + } else { + pmd_populate(mm, pmdp, ptep); + } +} + +#else + +#if defined(linux) +#define tmp_linux_value linux +#undef linux +#endif + +#include KABI_HIDE_INCLUDE(<linux/mm.h>) + +#if defined(tmp_linux_value) +#define linux tmp_linux_value +#undef tmp_linux_value +#endif + +#define this_node_pgd(mm) ((mm)->pgd) +#define per_node_pgd(mm, nid) ((mm)->pgd) + +static inline void numa_setup_pgd(void) +{ +} + +static inline void __init numa_replication_init(void) +{ +} + +static inline void __init numa_replicate_kernel_text(void) +{ +} + +static inline void numa_replicate_kernel_rodata(void) +{ +} + +static inline void 
numa_replication_fini(void) +{ +} + +static inline bool numa_addr_has_replica(const void *addr) +{ + return false; +} + +static inline bool is_text_replicated(void) +{ + return false; +} + +static inline void *numa_get_replica(void *vaddr, int nid) +{ + return lm_alias(vaddr); +} + +static inline void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + +#define pgd_populate_replicated pgd_populate +#define p4d_populate_replicated p4d_populate +#define pud_populate_replicated pud_populate +#define pmd_populate_replicated pmd_populate + +#endif /*CONFIG_KERNEL_REPLICATION*/ +#endif /*_LINUX_NUMA_REPLICATION_H*/ diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe..45058cdf65d8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -140,6 +140,7 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_KERNEL_REPLICATION) += numa_kernel_replication.o obj-$(CONFIG_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o obj-$(CONFIG_ETMEM) += etmem.o diff --git a/mm/memory.c b/mm/memory.c index 6f89c44b3dfa..247eec2ad417 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -79,6 +79,7 @@ #include <linux/sched/sysctl.h> #include <linux/userswap.h> #include <linux/dynamic_pool.h> +#include <linux/numa_kernel_replication.h> #include <trace/events/kmem.h> @@ -185,6 +186,96 @@ void mm_trace_rss_stat(struct mm_struct *mm, int member) trace_rss_stat(mm, member); } +#ifdef CONFIG_KERNEL_REPLICATION + +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) +{ + unsigned long offset; + int nid; + pmd_t *curr_pmd; + pgtable_t token = pmd_pgtable(*pmd); + + if (get_propagation_level() == PMD_PROPAGATION) { + for_each_pgtable_replica(pmd, tlb->mm, curr_pmd, nid, offset) { + pmd_clear(curr_pmd); + } + } else { + pmd_clear(pmd); + } + + pte_free_tlb(tlb, token, 
addr); + mm_dec_nr_ptes(tlb->mm); + (void)token; +} + +static inline void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + unsigned long offset; + int nid; + pud_t *curr_pud; + pmd_t *pmd = pmd_offset(pud, addr); + + if (get_propagation_level() == PUD_PROPAGATION) { + for_each_pgtable_replica(pud, tlb->mm, curr_pud, nid, offset) { + pud_clear(curr_pud); + } + } else { + pud_clear(pud); + } + + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); + (void)pmd; +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + unsigned long offset; + int nid; + p4d_t *curr_p4d; + pud_t *pud = pud_offset(p4d, addr); + + if (get_propagation_level() == P4D_PROPAGATION) { + for_each_pgtable_replica(p4d, tlb->mm, curr_p4d, nid, offset) { + p4d_clear(curr_p4d); + } + } else { + p4d_clear(p4d); + } + + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); + (void)pud; +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + unsigned long offset; + int nid; + pgd_t *curr_pgd; + p4d_t *p4d = p4d_offset(pgd, addr); + + if (get_propagation_level() == PGD_PROPAGATION) { + for_each_pgtable_replica(pgd, tlb->mm, curr_pgd, nid, offset) { + pgd_clear(curr_pgd); + } + } else { + pgd_clear(pgd); + } + p4d_free_tlb(tlb, p4d, addr); + /* + * Why? If 4-level paging is enabled via kconfig, + * all functions execept p4d_offset are empty, + * and we get unused variable error + */ + (void)p4d; +} +#else + /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. 
@@ -196,8 +287,43 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); + (void)token; } +static void __free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr) +{ + pmd_t *pmd = pmd_offset(pud, addr); + + pud_clear(pud); + pmd_free_tlb(tlb, pmd, addr); + mm_dec_nr_pmds(tlb->mm); + (void)pmd; +} + +static inline void __free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, + unsigned long addr) +{ + pud_t *pud = pud_offset(p4d, addr); + + p4d_clear(p4d); + pud_free_tlb(tlb, pud, addr); + mm_dec_nr_puds(tlb->mm); + (void)pud; +} + +static inline void __free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr) +{ + p4d_t *p4d = p4d_offset(pgd, addr); + + pgd_clear(pgd); + p4d_free_tlb(tlb, p4d, addr); + (void)p4d; +} + +#endif + static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) @@ -226,10 +352,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, if (end - 1 > ceiling - 1) return; - pmd = pmd_offset(pud, start); - pud_clear(pud); - pmd_free_tlb(tlb, pmd, start); - mm_dec_nr_pmds(tlb->mm); + __free_pmd_range(tlb, pud, start); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, @@ -260,10 +383,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, if (end - 1 > ceiling - 1) return; - pud = pud_offset(p4d, start); - p4d_clear(p4d); - pud_free_tlb(tlb, pud, start); - mm_dec_nr_puds(tlb->mm); + __free_pud_range(tlb, p4d, start); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -294,9 +414,7 @@ static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, if (end - 1 > ceiling - 1) return; - p4d = p4d_offset(pgd, start); - pgd_clear(pgd); - p4d_free_tlb(tlb, p4d, start); + __free_p4d_range(tlb, pgd, start); } /* @@ -440,7 +558,7 @@ void pmd_install(struct mm_struct 
*mm, pmd_t *pmd, pgtable_t *pte) * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - pmd_populate(mm, pmd, *pte); + pmd_populate_replicated(mm, pmd, *pte); *pte = NULL; } spin_unlock(ptl); @@ -6050,7 +6168,7 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) p4d_free(mm, new); } else { smp_wmb(); /* See comment in pmd_install() */ - pgd_populate(mm, pgd, new); + pgd_populate_replicated(mm, pgd, new); } spin_unlock(&mm->page_table_lock); return 0; @@ -6094,7 +6212,7 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) if (!p4d_present(*p4d)) { mm_inc_nr_puds(mm); smp_wmb(); /* See comment in pmd_install() */ - p4d_populate(mm, p4d, new); + p4d_populate_replicated(mm, p4d, new); } else /* Another has populated it */ pud_free(mm, new); spin_unlock(&mm->page_table_lock); @@ -6118,7 +6236,7 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) if (!pud_present(*pud)) { mm_inc_nr_pmds(mm); smp_wmb(); /* See comment in pmd_install() */ - pud_populate(mm, pud, new); + pud_populate_replicated(mm, pud, new); } else { /* Another has populated it */ pmd_free(mm, new); } diff --git a/mm/numa_kernel_replication.c b/mm/numa_kernel_replication.c new file mode 100644 index 000000000000..c2d289b7b9df --- /dev/null +++ b/mm/numa_kernel_replication.c @@ -0,0 +1,759 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/kernel.h> +#include <linux/pagewalk.h> +#include <linux/numa_kernel_replication.h> +#include <linux/memblock.h> +#include <linux/pgtable.h> +#include <linux/hugetlb.h> +#include <linux/kobject.h> +#include <linux/debugfs.h> + +#include <asm/sections.h> +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> + +#define KERNEL_TEXT_START ((unsigned long)&_stext) +#define KERNEL_TEXT_END ((unsigned long)&_etext) + +#define KERNEL_RODATA_START ((unsigned long)&__start_rodata) +#define KERNEL_RODATA_END ((unsigned long)&__end_rodata) 
+ +#define PMD_ALLOC_ORDER (PMD_SHIFT-PAGE_SHIFT) +#define PAGES_PER_PMD (1 << PMD_ALLOC_ORDER) + +#define replication_log(data, fmt, args...) \ +({ \ + if (data && data->m) \ + seq_printf(data->m, fmt, ##args); \ + else \ + pr_info(KERN_CONT fmt, ##args); \ +}) + +struct numa_node_desc { + pgd_t *pgd; + void *text_vaddr; + void *rodata_vaddr; +}; + +static struct numa_node_desc __initdata_or_module node_desc[MAX_NUMNODES]; + +struct dump_data { + struct seq_file *m; +}; + +struct dump_config { + int pgd_extra_info:1; + int p4d_extra_info:1; + int pud_extra_info:1; + int pmd_extra_info:1; + int pte_extra_info:1; + struct dump_data *data; +}; + +static bool text_replicated; +static propagation_level_t prop_level = NONE; +/* + * The first ready NUMA node, used as a source node + * for kernel text and rodata replication + */ +static unsigned int master_node = INT_MAX; +/* + * The case when machine has memoryless nodes is rare + * but possible. To handle memoryless nodes properly + * kernel replication maintains mapping node -> node with memory + * for all NUMA nodes. + */ +static int node_to_memory_node[MAX_NUMNODES]; + +static bool pgtables_extra; +static DEFINE_SPINLOCK(debugfs_lock); + +propagation_level_t get_propagation_level(void) +{ + return prop_level; +} + +bool is_text_replicated(void) +{ + return text_replicated; +} + +static void binary_dump(struct dump_data *data, unsigned long value) +{ + int i; + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "%-9d", i); + } + replication_log(data, "%d\n", 0); + + for (i = BITS_PER_LONG - 1; i >= 0; i--) { + if ((BITS_PER_LONG - 1 - i) % BITS_PER_BYTE == 0) + replication_log(data, "|"); + + replication_log(data, "%d", (1UL << i) & value ? 
1 : 0); + } + replication_log(data, "|\n"); +} + +static int pgd_callback(pgd_t *pgd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pgd_val(*pgd); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PGDIR_MASK; + next = (addr & PGDIR_MASK) - 1 + PGDIR_SIZE; + + replication_log(c->data, + "PGD ADDR: 0x%p PGD VAL: 0x%016lx [%p --- %p]\n", + pgd, val, (void *)addr, (void *)next); + + if (c->pgd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int p4d_callback(p4d_t *p4d, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = p4d_val(*p4d); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & P4D_MASK; + next = (addr & P4D_MASK) - 1 + P4D_SIZE; + + replication_log(c->data, + "P4D ADDR: 0x%p P4D VAL: 0x%016lx [%p --- %p]\n", + p4d, val, (void *)addr, (void *)next); + + if (c->p4d_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pud_callback(pud_t *pud, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pud_val(*pud); + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PUD_MASK; + next = (addr & PUD_MASK) - 1 + PUD_SIZE; + + replication_log(c->data, + "PUD ADDR: 0x%p PUD VAL: 0x%016lx huge(%d) [%p --- %p]\n", + pud, val, pud_huge(*pud), (void *)addr, (void *)next); + + if (c->pud_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pmd_callback(pmd_t *pmd, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pmd_val(*pmd); + unsigned long paddr = pmd_pfn(*pmd) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PMD_MASK; + next = (addr & PMD_MASK) - 1 + PMD_SIZE; + + replication_log(c->data, + "PMD ADDR: 0x%p PMD VAL: 
0x%016lx huge(%d) [%p --- %p] to %p\n", + pmd, val, pmd_huge(*pmd), (void *)addr, (void *)next, (void *)paddr); + + if (c->pmd_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_callback(pte_t *pte, + unsigned long addr, unsigned long next, + struct mm_walk *walk) +{ + unsigned long val = pte_val(*pte); + unsigned long paddr = pte_pfn(*pte) << PAGE_SHIFT; + struct dump_config *c = (struct dump_config *)walk->private; + + if (!val) + return 0; + + addr = addr & PAGE_MASK; + next = (addr & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(c->data, + "PTE ADDR: 0x%p PTE VAL: 0x%016lx [%p --- %p] to %p\n", + pte, val, (void *)addr, (void *)next, (void *)paddr); + + if (c->pte_extra_info) + binary_dump(c->data, val); + + return 0; +} + +static int pte_hole_callback(unsigned long addr, unsigned long next, + int depth, struct mm_walk *walk) +{ + struct dump_config *c = (struct dump_config *)walk->private; + + replication_log(c->data, "%*chole\n", depth * 2, ' '); + + return 0; +} + +static void dump_pgtables(struct mm_struct *mm, + struct dump_data *data, + unsigned long start, unsigned long end) +{ + int nid = 0; + int extra = pgtables_extra ? 
1 : 0; + bool locked = false; + struct dump_config conf = { + .pgd_extra_info = extra, + .p4d_extra_info = extra, + .pud_extra_info = extra, + .pmd_extra_info = extra, + .pte_extra_info = extra, + .data = data, + }; + + const struct mm_walk_ops ops = { + .pgd_entry = pgd_callback, + .p4d_entry = p4d_callback, + .pud_entry = pud_callback, + .pmd_entry = pmd_callback, + .pte_entry = pte_callback, + .pte_hole = pte_hole_callback + }; + + BUG_ON(data && data->m == NULL); + + start = start & PAGE_MASK; + end = (end & PAGE_MASK) - 1 + PAGE_SIZE; + + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); + + if (rwsem_is_locked(&mm->mmap_lock)) + locked = true; + else + mmap_read_lock(mm); + + for_each_memory_node(nid) { + replication_log(data, "NUMA node id #%d\n", nid); + replication_log(data, "PGD: %p PGD phys: %p\n", + mm->pgd_numa[nid], (void *)virt_to_phys(mm->pgd_numa[nid])); + walk_page_range_novma(mm, start, end, &ops, mm->pgd_numa[nid], &conf); + } + + if (!locked) + mmap_read_unlock(mm); + + replication_log(data, + "----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n"); +} + +static void dump_kernel_pgtables(struct dump_data *data, + unsigned long start, unsigned long end) +{ + dump_pgtables(&init_mm, data, start, end); +} + +void dump_mm_pgtables(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + dump_pgtables(mm, NULL, start, end); +} + +static void cpu_dump(void *info) +{ + struct dump_data *data = (struct dump_data *)info; + + spin_lock(&debugfs_lock); + numa_cpu_dump(data->m); + spin_unlock(&debugfs_lock); +} + +static int stats_show(struct seq_file *m, void *v) +{ + int cpu; + struct dump_data data = { + .m = m, + }; + + for_each_online_cpu(cpu) + smp_call_function_single(cpu, cpu_dump, &data, 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(stats); + +static int pgtables_show(struct seq_file *m, void *v) +{ + struct dump_data data = { + .m = m, + }; + + dump_kernel_pgtables(&data, + KERNEL_TEXT_START, 
KERNEL_RODATA_END - 1); + + return 0; +} + +DEFINE_SHOW_ATTRIBUTE(pgtables); + +void debugfs_init(void) +{ + struct dentry *dir; + static struct dentry *debugfs_dir; + + debugfs_dir = debugfs_create_dir("numa_replication", NULL); + if (IS_ERR(debugfs_dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication: %ld\n", + PTR_ERR(debugfs_dir)); + return; + } + dir = debugfs_create_file("stats", 0400, debugfs_dir, + NULL, &stats_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication stats: %ld\n", + PTR_ERR(dir)); + return; + } + + dir = debugfs_create_file("pgtables_kernel", 0400, debugfs_dir, + NULL, &pgtables_fops); + if (IS_ERR(dir)) { + pr_err("Failed to create debugfs entry for NUMA" + " replication pgtables: %ld\n", + PTR_ERR(dir)); + return; + } + + debugfs_create_bool("pgtables_kernel_extra", 0600, debugfs_dir, + &pgtables_extra); +} + +/* + * The case, when machine has memoryless NUMA nodes + * should be handled in a special way. To do this we + * create node<->memory mapping to have an information + * about the node with memory that memoryless node can use. + */ +static void init_node_to_memory_mapping(void) +{ + int nid; + + for_each_online_node(nid) { + int memory_nid; + int min_dist = INT_MAX; + + node_to_memory_node[nid] = nid; + for_each_memory_node(memory_nid) { + int dist = node_distance(nid, memory_nid); + + if (dist < min_dist) { + min_dist = dist; + node_to_memory_node[nid] = memory_nid; + } + } + pr_info("For node %d memory is on the node - %d\n", + nid, node_to_memory_node[nid]); + } +} + +int numa_get_memory_node(int nid) +{ + return node_to_memory_node[nid]; +} + +/* + * The function creates replica of particular memory area + * and install replicated memory in translation table of + * required NUMA node. 
+ */ +static void replicate_memory(void *dst, unsigned long start, unsigned long end, int nid) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + pgprot_t prot; + unsigned int offset_in_pages = 0; + unsigned long vaddr = start; + struct page *pages = virt_to_page(dst); + + memcpy(dst, lm_alias(start), end - start); + while (vaddr < end) { + pgd = pgd_offset_pgd(node_desc[nid].pgd, vaddr); + p4d = p4d_offset(pgd, vaddr); + pud = pud_offset(p4d, vaddr); + pmd = pmd_offset(pud, vaddr); + + if (pmd_leaf(*pmd)) { + prot = pmd_pgprot(*pmd); + + set_pmd(pmd, pfn_pmd(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages += PAGES_PER_PMD; + vaddr += PMD_SIZE; + continue; + } + pte = pte_offset_kernel(pmd, vaddr); + prot = pte_pgprot(*pte); + __set_pte(pte, pfn_pte(page_to_pfn(pages) + offset_in_pages, prot)); + offset_in_pages++; + vaddr += PAGE_SIZE; + } +} + +static void __init replicate_kernel_text(int nid) +{ + replicate_memory(node_desc[nid].text_vaddr, + KERNEL_TEXT_START, KERNEL_TEXT_END, nid); + numa_sync_text_replicas((unsigned long)node_desc[nid].text_vaddr, + (unsigned long)node_desc[nid].text_vaddr + (KERNEL_TEXT_END - KERNEL_TEXT_START)); +} + +static void replicate_kernel_rodata(int nid) +{ + replicate_memory(node_desc[nid].rodata_vaddr, + KERNEL_RODATA_START, KERNEL_RODATA_END, nid); +} + +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pte(pud_t *dst, pud_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PMD_MASK; + unsigned long right = (end & PMD_MASK) - 1 + PMD_SIZE; + unsigned long addr; + + pmd_t *clone_pmd = pmd_offset(dst, left); + pmd_t *orig_pmd = pmd_offset(src, left); + + for (addr = left; + (addr >= left && addr < right); addr += PMD_SIZE) { + pgtable_t new_pte; + + if (pmd_none(*orig_pmd) || pmd_huge(*orig_pmd) || + pmd_val(*orig_pmd) == 0) + goto skip; + + pmd_clear(clone_pmd); + new_pte = 
pte_alloc_one_node(nid, &init_mm); + pmd_populate_kernel(&init_mm, clone_pmd, page_to_virt(new_pte)); + BUG_ON(new_pte == NULL); + + copy_page(page_to_virt(pmd_pgtable(*clone_pmd)), + page_to_virt(pmd_pgtable(*orig_pmd))); +skip: + clone_pmd++; + orig_pmd++; + } +} + +//'-1' in next functions have only one purpose - prevent unsgined long overflow +static void replicate_pgt_pmd(p4d_t *dst, p4d_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PUD_MASK; + unsigned long right = (end & PUD_MASK) - 1 + PUD_SIZE; + + pud_t *clone_pud = pud_offset(dst, left); + pud_t *orig_pud = pud_offset(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += PUD_SIZE) { + pmd_t *new_pmd; + + if (pud_none(*orig_pud) || pud_huge(*orig_pud) || + pud_val(*orig_pud) == 0) + goto skip; + + pud_clear(clone_pud); + new_pmd = pmd_alloc_node(nid, &init_mm, clone_pud, addr); + BUG_ON(new_pmd == NULL); + + copy_page(pud_pgtable(*clone_pud), pud_pgtable(*orig_pud)); + + replicate_pgt_pte(clone_pud, orig_pud, max(addr, start), + min(addr - 1 + PUD_SIZE, end), nid); +skip: + clone_pud++; + orig_pud++; + } +} + +static void replicate_pgt_pud(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & P4D_MASK; + unsigned long right = (end & P4D_MASK) - 1 + P4D_SIZE; + + p4d_t *clone_p4d = p4d_offset(dst, left); + p4d_t *orig_p4d = p4d_offset(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += P4D_SIZE) { + pud_t *new_pud; + + if (p4d_none(*orig_p4d) || p4d_huge(*orig_p4d) || + p4d_val(*orig_p4d) == 0) + goto skip; + + p4d_clear(clone_p4d); + new_pud = pud_alloc_node(nid, &init_mm, clone_p4d, addr); + BUG_ON(new_pud == NULL); + + copy_page(p4d_pgtable(*clone_p4d), p4d_pgtable(*orig_p4d)); + /* + * start and end passed to the next function must be in + * range of p4ds, so min and max are used here + */ + 
replicate_pgt_pmd(clone_p4d, orig_p4d, max(addr, start), + min(addr - 1 + P4D_SIZE, end), nid); +skip: + clone_p4d++; + orig_p4d++; + } +} + +static void replicate_pgt_p4d(pgd_t *dst, pgd_t *src, + unsigned long start, unsigned long end, + unsigned int nid) +{ + unsigned long left = start & PGDIR_MASK; + unsigned long right = (end & PGDIR_MASK) - 1 + PGDIR_SIZE; + + pgd_t *clone_pgd = pgd_offset_pgd(dst, left); + pgd_t *orig_pgd = pgd_offset_pgd(src, left); + + for (unsigned long addr = left; + (addr >= left && addr < right); addr += PGDIR_SIZE) { + p4d_t *new_p4d; + + /* TODO: remove last condition and do something better + * In the case of a folded P4D level, pgd_none and pgd_huge + * always return 0, so we might start to replicate empty entries. + * We obviously want to avoid this, so the last check is performed here. + */ + if (pgd_none(*orig_pgd) || pgd_huge(*orig_pgd) || + pgd_val(*orig_pgd) == 0) + goto skip; + + pgd_clear(clone_pgd); + new_p4d = p4d_alloc_node(nid, &init_mm, clone_pgd, addr); + BUG_ON(new_p4d == NULL); + + copy_page((void *)pgd_page_vaddr(*clone_pgd), + (void *)pgd_page_vaddr(*orig_pgd)); + replicate_pgt_pud(clone_pgd, orig_pgd, max(addr, start), + min(addr - 1 + PGDIR_SIZE, end), nid); +skip: + clone_pgd++; + orig_pgd++; + } +} + +static void replicate_pgt(int nid, unsigned long start, unsigned long end) +{ + replicate_pgt_p4d(node_desc[nid].pgd, init_mm.pgd, start, end, nid); +} + +/* + * Page tables replication works in a way when first + * pgd level replicated and then the replication of the + * left part if done. The only part of pagetable that + * contains text and rodata is replicated. Obviously a + * part of upper layer entries of page table should be + * replicated too. As result, the pgd, p4d, pud and pmd + * layers are touched by replication. In particular, the + * page table sub-tree that cover kernel text and rodata. 
+ */ +static void replicate_pgtables(void) +{ + int nid; + + init_mm.pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL); + BUG_ON(!init_mm.pgd_numa); + + for_each_memory_node(nid) { + node_desc[nid].pgd = numa_replicate_pgt_pgd(nid); + replicate_pgt(nid, PAGE_TABLE_REPLICATION_LEFT, + PAGE_TABLE_REPLICATION_RIGHT); + } + + init_mm.pgd = node_desc[numa_get_memory_node(0)].pgd; + + for_each_online_node(nid) { + int memory_nid = numa_get_memory_node(nid); + + init_mm.pgd_numa[nid] = node_desc[memory_nid].pgd; + } +} + +/* + * Kernel text replication includes two steps: + * 1. page tables replication for init_mm + * 2. kernel text pages replication and + * corresponding page table update. + * 3. setup page table, related to + * current NUMA node on current cpu, + * for other NUMA cpus page tables will + * be updated later, during cpu initialization. + * Master node - the first NUMA node, used as + * a source for replicas. Memory for master node + * is expected to be already local. 
+ */ +void __init numa_replicate_kernel_text(void) +{ + int nid; + + replicate_pgtables(); + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_text(nid); + } + + text_replicated = true; + + if (!mm_p4d_folded(&init_mm)) + prop_level = PGD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && !mm_pud_folded(&init_mm)) + prop_level = P4D_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && !mm_pmd_folded(&init_mm)) + prop_level = PUD_PROPAGATION; + if (mm_p4d_folded(&init_mm) && mm_pud_folded(&init_mm) && mm_pmd_folded(&init_mm)) + prop_level = PMD_PROPAGATION; + + BUG_ON(prop_level == NONE); + + numa_setup_pgd(); +} + +void numa_replicate_kernel_rodata(void) +{ + int nid; + + for_each_memory_node(nid) { + if (nid == master_node) + continue; + replicate_kernel_rodata(nid); + } + + flush_tlb_all(); +} + +void numa_setup_pgd(void) +{ + numa_load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]); +} + +void __init_or_module *numa_get_replica(void *vaddr, int nid) +{ + unsigned long addr = (unsigned long)vaddr; + unsigned long offset = addr - KERNEL_TEXT_START; + + BUG_ON(addr < KERNEL_TEXT_START || addr >= KERNEL_TEXT_END); + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(numa_get_memory_node(nid) != nid); + + return node_desc[nid].text_vaddr + offset; +} + +nodemask_t __ro_after_init replica_nodes = { { [0] = 1UL } }; + +void __init numa_replication_init(void) +{ + int nid; + + unsigned long align = PAGE_SIZE; +#ifdef CONFIG_ARM64_4K_PAGES + align = HPAGE_SIZE; +#else + align = CONT_PTE_SIZE; +#endif + nodes_clear(replica_nodes); + + for_each_node_state(nid, N_MEMORY) { + __node_set(nid, &replica_nodes); + } + + for_each_memory_node(nid) + pr_info("Memory node: %d\n", nid); + + init_node_to_memory_mapping(); + master_node = page_to_nid(virt_to_page(lm_alias((void *)KERNEL_TEXT_START))); + + pr_info("Master Node: #%d\n", master_node); + for_each_memory_node(nid) { + if (nid == master_node) { + 
node_desc[nid].text_vaddr = lm_alias((void *)KERNEL_TEXT_START); + node_desc[nid].rodata_vaddr = lm_alias((void *)KERNEL_RODATA_START); + } else { + node_desc[nid].text_vaddr = memblock_alloc_try_nid( + (KERNEL_TEXT_END - KERNEL_TEXT_START), + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + + node_desc[nid].rodata_vaddr = memblock_alloc_try_nid( + (KERNEL_RODATA_END - KERNEL_RODATA_START), + align, 0, MEMBLOCK_ALLOC_ANYWHERE, nid); + } + + BUG_ON(node_desc[nid].text_vaddr == NULL); + BUG_ON(node_desc[nid].rodata_vaddr == NULL); + } +} + +void numa_replication_fini(void) +{ + int nid; + + /* + * Clear addresses form linear space + */ + for_each_memory_node(nid) { + node_desc[nid].text_vaddr = NULL; + node_desc[nid].rodata_vaddr = NULL; + } + + debugfs_init(); + + pr_info("Replicated page table : [%p --- %p]\n", + (void *)PAGE_TABLE_REPLICATION_LEFT, + (void *)PAGE_TABLE_REPLICATION_RIGHT); + + dump_kernel_pgtables(NULL, KERNEL_TEXT_START, KERNEL_RODATA_END - 1); +} -- 2.34.1