
kunpeng inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IBPRS0

-------------------------------------------------

The new cmdline option allows enabling or disabling the kernel
replication feature. By default, replication is disabled. This makes it
possible to set CONFIG_KERNEL_REPLICATION=y by default while leaving
the feature off until it is explicitly enabled on the kernel cmdline.

Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
---
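
A quick usage sketch (illustrative, assuming a kernel built with
CONFIG_KERNEL_REPLICATION=y): appending

    kernel_replication=on

to the boot cmdline enables replication; "off" (or omitting the option
entirely) keeps it disabled, and any other value is rejected with a
warning while the default (off) is kept.
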
 .../admin-guide/kernel-parameters.txt   |  7 ++
 arch/arm64/configs/openeuler_defconfig  |  2 +-
 arch/arm64/mm/pgd.c                     | 56 +++++++++----
 include/linux/numa_kernel_replication.h | 28 ++++++-
 kernel/module/main.c                    | 11 ++-
 mm/numa_kernel_replication.c            | 78 ++++++++++++++++++-
 mm/vmalloc.c                            |  6 +-
 7 files changed, 163 insertions(+), 25 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5bf51e171ee7..0f1f6143f53f 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -2534,6 +2534,13 @@
 			some extension. These two features are
 			alternatives. Current only arm64 is supported.
 
+	kernel_replication=
+			[ARM64]
+			Format: [on|off]
+			If CONFIG_KERNEL_REPLICATION is set, this allows
+			enabling/disabling the kernel replication feature
+			via the cmdline. The default value is off.
+
 	kgdbdbgp=	[KGDB,HW] kgdb over EHCI usb debug port.
 			Format: <Controller#>[,poll interval]
 			The controller # is the number of the ehci usb debug
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 28de20ca2d71..0f3e72a3c2b6 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1212,7 +1212,7 @@ CONFIG_ARM64_HAFT=y
 CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y
 CONFIG_PER_VMA_LOCK=y
 CONFIG_LOCK_MM_AND_FIND_VMA=y
-# CONFIG_KERNEL_REPLICATION is not set
+CONFIG_KERNEL_REPLICATION=y
 CONFIG_IOMMU_MM_DATA=y
 # CONFIG_ASCEND_FEATURES is not set
 CONFIG_PAGE_CACHE_LIMIT=y
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index adf642eba4e7..8326bd693b29 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -18,6 +18,24 @@
 
 static struct kmem_cache *pgd_cache __ro_after_init;
 
+static pgd_t *pgd_alloc_orig(struct mm_struct *mm)
+{
+	gfp_t gfp = GFP_PGTABLE_USER;
+
+	if (PGD_SIZE == PAGE_SIZE)
+		return (pgd_t *)__get_free_page(gfp);
+	else
+		return kmem_cache_alloc(pgd_cache, gfp);
+}
+
+static void pgd_free_orig(struct mm_struct *mm, pgd_t *pgd)
+{
+	if (PGD_SIZE == PAGE_SIZE)
+		free_page((unsigned long)pgd);
+	else
+		kmem_cache_free(pgd_cache, pgd);
+}
+
 #ifdef CONFIG_KERNEL_REPLICATION
 pgd_t *page_pgd_alloc(struct mm_struct *mm)
 {
@@ -37,11 +55,11 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm)
 
 		WARN_ON_ONCE(page_to_nid(page) != nid);
 
-		per_node_pgd(mm, nid) = (pgd_t *)page_address(page);
+		*per_node_pgd_ptr(mm, nid) = (pgd_t *)page_address(page);
 	}
 
 	for_each_online_node(nid)
-		per_node_pgd(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid));
+		*per_node_pgd_ptr(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid));
 
 	mm->pgd = per_node_pgd(mm, numa_get_memory_node(0));/*!!!*/
 
@@ -53,7 +71,7 @@ pgd_t *page_pgd_alloc(struct mm_struct *mm)
 	return NULL;
 }
 
-pgd_t *pgd_alloc(struct mm_struct *mm)
+static pgd_t *pgd_alloc_replica(struct mm_struct *mm)
 {
 	pgd_t **pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL);
 
@@ -81,34 +99,44 @@ static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	}
 
 	for_each_online_node(nid)
-		per_node_pgd(mm, nid) = NULL;
+		*per_node_pgd_ptr(mm, nid) = NULL;
 }
 
-void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+static void pgd_free_replica(struct mm_struct *mm, pgd_t *pgd)
 {
 	page_pgd_free(mm, pgd);
 	kfree(mm->pgd_numa);
 }
 
-#else /* !CONFIG_KERNEL_REPLICATION */
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
-	gfp_t gfp = GFP_PGTABLE_USER;
-
-	if (PGD_SIZE == PAGE_SIZE)
-		return (pgd_t *)__get_free_page(gfp);
+	mm->pgd_numa = NULL;
+	if (is_text_replicated())
+		return pgd_alloc_replica(mm);
 	else
-		return kmem_cache_alloc(pgd_cache, gfp);
+		return pgd_alloc_orig(mm);
+
 }
 
 void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 {
-	if (PGD_SIZE == PAGE_SIZE)
-		free_page((unsigned long)pgd);
+	if (is_text_replicated())
+		pgd_free_replica(mm, pgd);
 	else
-		kmem_cache_free(pgd_cache, pgd);
+		pgd_free_orig(mm, pgd);
+}
+
+#else /* !CONFIG_KERNEL_REPLICATION */
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	return pgd_alloc_orig(mm);
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	pgd_free_orig(mm, pgd);
 }
 #endif /* CONFIG_KERNEL_REPLICATION */
diff --git a/include/linux/numa_kernel_replication.h b/include/linux/numa_kernel_replication.h
index ee1ab0f111c7..37e7b56b5aa9 100644
--- a/include/linux/numa_kernel_replication.h
+++ b/include/linux/numa_kernel_replication.h
@@ -42,8 +42,31 @@ extern nodemask_t replica_nodes;
 		nid != MAX_NUMNODES;				\
 		nid = next_node(nid, replica_nodes))
 
-#define this_node_pgd(mm) ((mm)->pgd_numa[numa_node_id()])
-#define per_node_pgd(mm, nid) ((mm)->pgd_numa[nid])
+bool is_text_replicated(void);
+
+static inline pgd_t *this_node_pgd(struct mm_struct *mm)
+{
+	if (is_text_replicated())
+		return mm->pgd_numa[numa_node_id()];
+	else
+		return mm->pgd;
+}
+
+static inline pgd_t *per_node_pgd(struct mm_struct *mm, int nid)
+{
+	if (is_text_replicated())
+		return mm->pgd_numa[nid];
+	else
+		return mm->pgd;
+}
+
+static inline pgd_t **per_node_pgd_ptr(struct mm_struct *mm, int nid)
+{
+	if (is_text_replicated())
+		return &mm->pgd_numa[nid];
+	else
+		return &mm->pgd;
+}
 
 static inline bool numa_addr_has_replica(const void *addr)
 {
@@ -56,7 +79,6 @@
 void __init numa_replicate_kernel_text(void);
 void numa_replicate_kernel_rodata(void);
 void numa_replication_fini(void);
-bool is_text_replicated(void);
 propagation_level_t get_propagation_level(void);
 void numa_setup_pgd(void);
 void __init_or_module *numa_get_replica(void *vaddr, int nid);
diff --git a/kernel/module/main.c b/kernel/module/main.c
index 3aa696b127ca..c5448bc9589a 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1217,6 +1217,9 @@ static void module_replicate_sections(struct module *mod)
 {
 	int i;
 
+	if (!is_text_replicated())
+		return;
+
 	for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++)
 		module_replicate(mod->mem[sections_to_replicate[i]].base);
 }
@@ -1228,9 +1231,11 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type)
 	if (mod_mem_use_vmalloc(type))
 		return vzalloc(size);
 
-	for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) {
-		if (type == sections_to_replicate[i])
-			return module_alloc_replica(size);
+	if (is_text_replicated()) {
+		for (i = 0; i < ARRAY_SIZE(sections_to_replicate); i++) {
+			if (type == sections_to_replicate[i])
+				return module_alloc_replica(size);
+		}
 	}
 	return module_alloc(size);
 }
diff --git a/mm/numa_kernel_replication.c b/mm/numa_kernel_replication.c
index c2d289b7b9df..17664a1300b8 100644
--- a/mm/numa_kernel_replication.c
+++ b/mm/numa_kernel_replication.c
@@ -65,6 +65,8 @@ static unsigned int master_node = INT_MAX;
  */
 static int node_to_memory_node[MAX_NUMNODES];
 
+static bool kernel_replication_enabled;
+
 static bool pgtables_extra;
 static DEFINE_SPINLOCK(debugfs_lock);
 
@@ -254,6 +256,9 @@ static void dump_pgtables(struct mm_struct *mm,
 	start = start & PAGE_MASK;
 	end = (end & PAGE_MASK) - 1 + PAGE_SIZE;
 
+	if (!mm->pgd_numa)
+		return;
+
 	replication_log(data,
 			"----PER-NUMA NODE KERNEL REPLICATION ENABLED----\n");
 
@@ -622,6 +627,18 @@ static void replicate_pgtables(void)
 	}
 }
 
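+/*
+ * Disabled-mode fallback: leave the text unreplicated and make every
+ * node's pgd_numa slot alias the ordinary init_mm.pgd, so callers that
+ * index pgd_numa by node keep working without real replicas.
+ */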
+static void __init numa_replicate_kernel_text_disabled(void)
+{
+	int nid;
+
+	init_mm.pgd_numa = (pgd_t **)kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL);
+	BUG_ON(!init_mm.pgd_numa);
+	for_each_online_node(nid) {
+		init_mm.pgd_numa[nid] = init_mm.pgd;
+	}
+}
+
+
 /*
  * Kernel text replication includes two steps:
  * 1. page tables replication for init_mm
@@ -639,6 +656,11 @@ void __init numa_replicate_kernel_text(void)
 {
 	int nid;
 
+	if (!kernel_replication_enabled) {
+		numa_replicate_kernel_text_disabled();
+		return;
+	}
+
 	replicate_pgtables();
 
 	for_each_memory_node(nid) {
@@ -667,6 +689,10 @@ void numa_replicate_kernel_rodata(void)
 {
 	int nid;
 
+	if (!kernel_replication_enabled) {
+		return;
+	}
+
 	for_each_memory_node(nid) {
 		if (nid == master_node)
 			continue;
@@ -678,7 +704,7 @@ void numa_replicate_kernel_rodata(void)
 
 void numa_setup_pgd(void)
 {
-	numa_load_replicated_pgd(init_mm.pgd_numa[numa_node_id()]);
+	numa_load_replicated_pgd(this_node_pgd(&init_mm));
 }
 
 void __init_or_module *numa_get_replica(void *vaddr, int nid)
@@ -693,8 +719,48 @@ void __init_or_module *numa_get_replica(void *vaddr, int nid)
 	return node_desc[nid].text_vaddr + offset;
 }
 
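+/*
+ * Early parser for the "kernel_replication=" boot option. Returning 1
+ * tells the __setup() machinery the option was consumed; returning 0
+ * lets it be reported as an unknown boot parameter.
+ */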
WARNING!\n"); + else + pr_info("Kernel replication disabled\n"); + + if (!kernel_replication_enabled) { + numa_replication_init_disabled(); + return; + } + for_each_node_state(nid, N_MEMORY) { __node_set(nid, &replica_nodes); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a136e86e6480..b170cc1ef447 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -493,7 +493,7 @@ void vunmap_range_replicas(unsigned long addr, unsigned long end) flush_cache_vunmap(addr, end); for_each_memory_node(nid) - vunmap_range_noflush_pgd(init_mm.pgd_numa[nid], addr, end); + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, nid), addr, end); flush_tlb_kernel_range(addr, end); } #endif /* CONFIG_KERNEL_REPLICATION && CONFIG_ARM64 */ @@ -2378,7 +2378,7 @@ static void free_unmap_vmap_area(struct vmap_area *va) * empty entries here, which is totally fine */ for_each_memory_node(node) - vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, node), va->va_start, va->va_end); } else { vunmap_range_noflush(va->va_start, va->va_end); @@ -4123,7 +4123,7 @@ int __vmalloc_node_replicate_range(const void *addr, gfp_t gfp_mask, for (i = 0; i < area->nr_pages; i++) list_add(&pages[i]->lru, &area->pages[i]->lru); - vunmap_range_noflush_pgd(init_mm.pgd_numa[node], + vunmap_range_noflush_pgd(per_node_pgd(&init_mm, node), area_start, area_end); /* -- 2.34.1