
kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBPRS0

-------------------------------------------------

The new cmdline option allows enabling or disabling the kernel
replication feature. By default, replication is disabled. This makes it
possible to set CONFIG_KERNEL_REPLICATION=y by default while leaving
the feature off until it is explicitly enabled on the kernel cmdline.

Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
---
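
A quick usage sketch (illustrative, assuming a kernel built with
CONFIG_KERNEL_REPLICATION=y): appending

    kernel_replication=on

to the boot cmdline enables replication; "off" (or omitting the option
entirely) keeps it disabled, and any other value is rejected with a
warning while the default (off) is kept.
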
 .../admin-guide/kernel-parameters.txt   |  7 ++
 arch/arm64/configs/openeuler_defconfig  |  2 +-
 arch/arm64/mm/pgd.c                     | 56 +++++++++----
 include/linux/numa_kernel_replication.h | 28 ++++++-
 kernel/module/main.c                    | 11 ++-
 mm/numa_kernel_replication.c            | 78 ++++++++++++++++++-
 mm/vmalloc.c                            |  6 +-
 7 files changed, 163 insertions(+), 25 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5bf51e171ee7..0f1f6143f53f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2534,6 +2534,13 @@
 			some extension. These two features are
 			alternatives. Current only arm64 is supported.
 
+	kernel_replication=
+			[ARM64]
+			Format: [on|off]
+			If CONFIG_KERNEL_REPLICATION is set, this allows
+			enabling/disabling the kernel replication feature
+			via the cmdline. The default value is off.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 28de20ca2d71..0f3e72a3c2b6 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1212,7 +1212,7 @@ CONFIG_ARM64_HAFT=y
 CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y
 CONFIG_PER_VMA_LOCK=y
 CONFIG_LOCK_MM_AND_FIND_VMA=y
-# CONFIG_KERNEL_REPLICATION is not set
+CONFIG_KERNEL_REPLICATION=y
 CONFIG_IOMMU_MM_DATA=y
 # CONFIG_ASCEND_FEATURES is not set
 CONFIG_PAGE_CACHE_LIMIT=y
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index adf642eba4e7..8326bd693b29 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -18,6 +18,24 @@
 
 static struct kmem_cache *pgd_cache __ro_after_init;
 
+static pgd_t *pgd_alloc_orig(struct mm_struct *mm)
+{
+	gfp_t gfp = GFP_PGTABLE_USER;
+
+	if (PGD_SIZE == PAGE_SIZE)
+		return (pgd_t *)__get_free_page(gfp);
+	else
+		return kmem_cache_alloc(pgd_cache, gfp);
+}
+
+static void pgd_free_orig(struct mm_struct *mm, pgd_t *pgd)
+{
+	if (PGD_SIZE == PAGE_SIZE)
+		free_page((unsigned long)pgd);
+	else
+		kmem_cache_free(pgd_cache, pgd);
+}
+
 #ifdef CONFIG_KERNEL_REPLICATION
 pgd_t *page_pgd_alloc(struct mm_struct *mm)
 {
@@ -37,11 +55,11 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm)
 
 		WARN_ON_ONCE(page_to_nid(page) != nid);
 
-		per_node_pgd(mm, nid) = (pgd_t *)page_address(page);
+		*per_node_pgd_ptr(mm, nid) = (pgd_t *)page_address(page);
 	}
 
 	for_each_online_node(nid)
-		per_node_pgd(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid));
+		*per_node_pgd_ptr(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid));
 
 	mm->pgd = per_node_pgd(mm, numa_get_memory_node(0));/*!!!*/
 
@@ -53,7 +71,7 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm)
 	return NULL;
 }
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
+static pgd_t *pgd_alloc_replica(struct mm_struct *mm)
 {
 	pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL);
 
@@ -81,34 +99,44 @@ static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	}
 
 	for_each_online_node(nid)
-		per_node_pgd(mm, nid) = NULL;
+		*per_node_pgd_ptr(mm, nid) = NULL;
 }
 
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_free_replica(struct mm_struct *mm, pgd_t *pgd)
 {
 	page_pgd_free(mm, pgd);
 	kfree(mm->pgd_numa);
 }
 
-#else /* !CONFIG_KERNEL_REPLICATION */
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	gfp_t gfp = GFP_PGTABLE_USER;
-
-	if (PGD_SIZE == PAGE_SIZE)
-		return (pgd_t *)__get_free_page(gfp);
+	mm->pgd_numa = NULL;
+	if (is_text_replicated())
+		return pgd_alloc_replica(mm);
 	else
-		return kmem_cache_alloc(pgd_cache, gfp);
+		return pgd_alloc_orig(mm);
+
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	if (PGD_SIZE == PAGE_SIZE)
-		free_page((unsigned long)pgd);
+	if (is_text_replicated())
+		pgd_free_replica(mm, pgd);
 	else
-		kmem_cache_free(pgd_cache, pgd);
+		pgd_free_orig(mm, pgd);
+}
+
+#else /* !CONFIG_KERNEL_REPLICATION */
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	return pgd_alloc_orig(mm);
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	pgd_free_orig(mm, pgd);
 }
 #endif /* CONFIG_KERNEL_REPLICATION */
diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h
index ee1ab0f111c7..37e7b56b5aa9 100644
--- a/include/linux/numa_kernel_replication.h
+++ b/include/linux/numa_kernel_replication.h
@@ -42,8 +42,31 @@ extern nodemask_t replica_nodes;
 		nid != MAX_NUMNODES;				\
 		nid = next_node(nid, replica_nodes))
 
-#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()])
-#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid])
+bool is_text_replicated(void);
+
+static inline pgd_t *this_node_pgd(struct mm_struct *mm)
+{
+	if (is_text_replicated())
+		return mm->pgd_numa[numa_node_id()];
+	else
+		return mm->pgd;
+}
+
+static inline pgd_t *per_node_pgd(struct mm_struct *mm, int nid)
+{
+	if (is_text_replicated())
+		return mm->pgd_numa[nid];
+	else
+		return mm->pgd;
+}
+
+static inline pgd_t **per_node_pgd_ptr(struct mm_struct *mm, int nid)
+{
+	if (is_text_replicated())
+		return &mm->pgd_numa[nid];
+	else
+		return &mm->pgd;
+}
 
 static inline bool numa_addr_has_replica(const void *addr)
 {
@@ -56,7 +79,6 @@
 void __init numa_replicate_kernel_text(void);
 void numa_replicate_kernel_rodata(void);
 void numa_replication_fini(void);
-bool is_text_replicated(void);
 propagation_level_t get_propagation_level(void);
 void numa_setup_pgd(void);
 void __init_or_module *numa_get_replica(void *vaddr, int nid);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 3aa696b127ca..c5448bc9589a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1217,6 +1217,9 @@ static void module_replicate_sections(struct module *mod)
 {
 	int i;
 
+	if (!is_text_replicated())
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++)
 		module_replicate(mod->mem[sections_to_replicate[i]].base);
 }
@@ -1228,9 +1231,11 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		return vzalloc(size);
 
-	for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) {
-		if (type == sections_to_replicate[i])
-			return module_alloc_replica(size);
+	if (is_text_replicated()) {
+		for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) {
+			if (type == sections_to_replicate[i])
+				return module_alloc_replica(size);
+		}
 	}
 	return module_alloc(size);
 }
diff --git a/mm/numa_kernel_replication.c b/mm/numa_kernel_replication.c
index c2d289b7b9df..17664a1300b8 100644
--- a/mm/numa_kernel_replication.c
+++ b/mm/numa_kernel_replication.c
@@ -65,6 +65,8 @@ static unsigned int master_node = INT_MAX;
  */
 static int node_to_memory_node[MAX_NUMNODES];
 
+static bool kernel_replication_enabled;
+
 static bool pgtables_extra;
 static DEFINE_SPINLOCK(debugfs_lock);
 
@@ -254,6 +256,9 @@ static void dump_pgtables(struct mm_struct *mm,
 	start = start & PAGE_MASK;
 	end = (end & PAGE_MASK) - 1 + PAGE_SIZE;
 
+	if (!mm->pgd_numa)
+		return;
+
 	replication_log(data,
 			"----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n");
 
@@ -622,6 +627,18 @@ static void replicate_pgtables(void)
 	}
 }
 
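+/*
+ * Disabled-mode fallback: leave the text unreplicated and make every
+ * node's pgd_numa slot alias the ordinary init_mm.pgd, so callers that
+ * index pgd_numa by node keep working without real replicas.
+ */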
+static void __init numa_replicate_kernel_text_disabled(void)
+{
+	int nid;
+
+	init_mm.pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL);
+	BUG_ON(!init_mm.pgd_numa);
+	for_each_online_node(nid) {
+		init_mm.pgd_numa[nid] = init_mm.pgd;
+	}
+}
+
+
 /*
  * Kernel text replication includes two steps:
  * 1. page tables replication for init_mm
@@ -639,6 +656,11 @@ void __init numa_replicate_kernel_text(void)
 {
 	int nid;
 
+	if (!kernel_replication_enabled) {
+		numa_replicate_kernel_text_disabled();
+		return;
+	}
+
 	replicate_pgtables();
 
 	for_each_memory_node(nid) {
@@ -667,6 +689,10 @@ void numa_replicate_kernel_rodata(void)
 {
 	int nid;
 
+	if (!kernel_replication_enabled) {
+		return;
+	}
+
 	for_each_memory_node(nid) {
 		if (nid == master_node)
 			continue;
@@ -678,7 +704,7 @@ void numa_replicate_kernel_rodata(void)
 
 void numa_setup_pgd(void)
 {
-	numa_load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]);
+	numa_load_replicated_pgd(this_node_pgd(&init_mm));
 }
 
 void __init_or_module *numa_get_replica(void *vaddr, int nid)
@@ -693,8 +719,48 @@ void __init_or_module *numa_get_replica(void *vaddr, int nid)
 	return node_desc[nid].text_vaddr + offset;
 }
 
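+/*
+ * Early parser for the "kernel_replication=" boot option. Returning 1
+ * tells the __setup() machinery the option was consumed; returning 0
+ * lets it be reported as an unknown boot parameter.
+ */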
WARNING!\n"); + else + pr_info("Kernel replication disabled\n"); + + if (!kernel_replication_enabled) { + numa_replication_init_disabled(); + return; + } + for_each_node_state(nid, N_MEMORY) { __node_set(nid, &replica_nodes); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a136e86e6480..b170cc1ef447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -493,7 +493,7 @@ void vunmap_range_replicas(unsigned long addr, unsigned long end) flush_cache_vunmap(addr, end); for_each_memory_node(nid) - vunmap_range_noflush_pgd(init_mm.pgd_numa[nid], addr, end); + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, nid), addr, end); flush_tlb_kernel_range(addr, end); } #endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ @@ -2378,7 +2378,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) * empty entries here, which is totally fine */ for_each_memory_node(node) - vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, node), va->va_start, va->va_end); } else { vunmap_range_noflush(va->va_start, va->va_end); @@ -4123,7 +4123,7 @@ int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) list_add(&pages[i]->lru, &area->pages[i]->lru); - vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, node), area_start, area_end); /* -- 2.34.1