
From: Nikita Panov <panov.nikita@huawei.com>

During boot, memory for the replicas is allocated, per-node translation
tables are created, the original text and rodata are copied into the
replicas, and the replicas are mapped into the local tables.

When a secondary CPU starts up, after minimal initialization, the local
node's page table is loaded into ttbr1. For the user-space part, the
page table from the local node is loaded on every mm switch.

Acked-by: Artem Kuzin <artem.kuzin@huawei.com>
Acked-by: Alexander Grubnikov <alexander.grubnikov@huawei.com>
Acked-by: Ilya Hanov <ilya.hanov@huawei-partners.com>
Acked-by: Denis Darvish <darvish.denis@huawei.com>
Signed-off-by: Nikita Panov <panov.nikita@huawei.com>
---
Note for reviewers: a short stand-alone model of the per-node pgd
handling is appended below the diff, for illustration only.

 arch/arm64/include/asm/pgtable.h |  4 ++
 arch/arm64/kernel/smp.c          |  8 ++++
 arch/arm64/mm/context.c          |  3 +-
 arch/arm64/mm/init.c             | 49 ++++++++++++++++++++
 arch/arm64/mm/mmu.c              | 41 ++++++++++++++++-
 arch/arm64/mm/pgd.c              | 77 ++++++++++++++++++++++++++++++++
 6 files changed, 179 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index d3bf5143b250..b30e43b84a64 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -21,7 +21,11 @@
  * VMALLOC_END: extends to the available space below vmemmap, PCI I/O space
  *	and fixed mappings
  */
+#ifdef CONFIG_KERNEL_REPLICATION
+#define VMALLOC_START		((MODULES_END & PGDIR_MASK) + PGDIR_SIZE)
+#else /* !CONFIG_KERNEL_REPLICATION */
 #define VMALLOC_START		(MODULES_END)
+#endif /* CONFIG_KERNEL_REPLICATION */
 #define VMALLOC_END		(VMEMMAP_START - SZ_256M)
 
 #define vmemmap			((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT))
diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c
index 50f6576f1b31..2cadcfd15814 100644
--- a/arch/arm64/kernel/smp.c
+++ b/arch/arm64/kernel/smp.c
@@ -34,6 +34,7 @@
 #include <linux/kexec.h>
 #include <linux/crash_dump.h>
 #include <linux/kvm_host.h>
+#include <linux/numa_kernel_replication.h>
 
 #include <asm/alternative.h>
 #include <asm/atomic.h>
@@ -209,6 +210,13 @@ asmlinkage notrace void secondary_start_kernel(void)
 	mmgrab(mm);
 	current->active_mm = mm;
 
+	/*
+	 * Set up the per-NUMA node page table if kernel
+	 * replication is enabled. The option is supported
+	 * only in 64-bit mode.
+	 */
+	numa_setup_pgd();
+
 	/*
 	 * TTBR0 is only used for the identity mapping at this stage. Make it
 	 * point to zero page to avoid speculatively fetching new entries.
diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 188197590fc9..ae3e3dc0d2fc 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -11,6 +11,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/numa_kernel_replication.h>
 
 #include <asm/cpufeature.h>
 #include <asm/mmu_context.h>
@@ -267,7 +268,7 @@ void check_and_switch_context(struct mm_struct *mm)
 	 * emulating PAN.
 	 */
 	if (!system_uses_ttbr0_pan())
-		cpu_switch_mm(mm->pgd, mm);
+		cpu_switch_mm(this_node_pgd(mm), mm);
 }
 
 unsigned long arm64_mm_context_get(struct mm_struct *mm)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 66a7fff9f373..8d955787e030 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -597,6 +597,47 @@ void __init bootmem_init(void)
 	memblock_dump_all();
 }
 
+#ifdef CONFIG_KERNEL_REPLICATION
+/*
+ * The vmalloc page tables must be preallocated up front,
+ * otherwise the replicated page tables can end up incomplete.
+ */
+void __init preallocate_vmalloc_pages(void)
+{
+	unsigned long addr;
+
+	for (addr = MODULES_VADDR; addr <= VMALLOC_END;
+	     addr = ALIGN(addr + 1, PGDIR_SIZE)) {
+		pgd_t *pgd = pgd_offset_k(addr);
+		p4d_t *p4d;
+		pud_t *pud;
+		pmd_t *pmd;
+		int pte;
+
+		p4d = p4d_alloc(&init_mm, pgd, addr);
+		/*
+		 * No need to check p4d here: at most a 4-level page
+		 * table is possible, so the p4d level is always folded.
+		 */
+		pud = pud_alloc(&init_mm, p4d, addr);
+		if (!pud)
+			panic("Failed to pre-allocate pud pages for vmalloc area\n");
+		if (!mm_pud_folded(&init_mm))
+			continue;
+
+		pmd = pmd_alloc(&init_mm, pud, addr);
+		if (!pmd)
+			panic("Failed to pre-allocate pmd pages for vmalloc area\n");
+		if (!mm_pmd_folded(&init_mm))
+			continue;
+
+		pte = pte_alloc(&init_mm, pmd);
+		if (pte)
+			panic("Failed to pre-allocate pte pages for vmalloc area\n");
+	}
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 /*
  * mem_init() marks the free areas in the mem_map and tells us how much memory
  * is free.  This is done after various parts of the system have claimed their
@@ -651,7 +692,15 @@ void free_initmem(void)
 	 * prevents the region from being reused for kernel modules, which
 	 * is not supported by kallsyms.
 	 */
+#ifdef CONFIG_KERNEL_REPLICATION
+	/*
+	 * With a replicated kernel the per-NUMA node vmalloc
+	 * mappings must be released as well.
+	 */
+	vunmap_range_replicas((u64)__init_begin, (u64)__init_end);
+#else
 	vunmap_range((u64)__init_begin, (u64)__init_end);
+#endif /* CONFIG_KERNEL_REPLICATION */
 }
 
 void dump_mem_limit(void)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index c846cc54e9ce..f19d8b8ab382 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -25,6 +25,7 @@
 #include <linux/vmalloc.h>
 #include <linux/set_memory.h>
 #include <linux/kfence.h>
+#include <linux/numa_kernel_replication.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -477,6 +478,23 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 			     pgd_pgtable_alloc, flags);
 }
 
+static void populate_mappings_prot(phys_addr_t phys, unsigned long virt,
+				   phys_addr_t size, pgprot_t prot)
+{
+#ifdef CONFIG_KERNEL_REPLICATION
+	int nid;
+
+	for_each_memory_node(nid) {
+		__create_pgd_mapping(per_node_pgd(&init_mm, nid),
+				     page_to_phys(walk_to_page_node(nid, (void *)virt)),
+				     virt, size, prot, NULL, NO_CONT_MAPPINGS);
+	}
+#else
+	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
+			     NO_CONT_MAPPINGS);
+#endif /* CONFIG_KERNEL_REPLICATION */
+}
+
 static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 				phys_addr_t size, pgprot_t prot)
 {
@@ -486,8 +504,7 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt,
 		return;
 	}
 
-	__create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL,
-			     NO_CONT_MAPPINGS);
+	populate_mappings_prot(phys, virt, size, prot);
 
 	/* flush the TLBs after updating live kernel mappings */
 	flush_tlb_kernel_range(virt, virt + size);
@@ -676,6 +693,22 @@ static pgprot_t kernel_exec_prot(void)
 }
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
+
+#ifdef CONFIG_KERNEL_REPLICATION
+static void __init populate_trampoline_mappings(void)
+{
+	int nid;
+
+	/* Copy the trampoline mappings into the replicated tables */
+	for_each_memory_node(nid) {
+		memcpy(per_node_pgd(&init_mm, nid) - (PAGE_SIZE * 2 / sizeof(pgd_t)),
+		       tramp_pg_dir, PGD_SIZE);
+	}
+	/* Make sure the replicated page tables are observable before use */
+	dsb(ishst);
+}
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 static int __init map_entry_trampoline(void)
 {
 	int i;
@@ -701,6 +734,10 @@ static int __init map_entry_trampoline(void)
 		__set_fixmap(FIX_ENTRY_TRAMP_TEXT1 - i,
 			     pa_start + i * PAGE_SIZE, PAGE_KERNEL_RO);
 
+#ifdef CONFIG_KERNEL_REPLICATION
+	populate_trampoline_mappings();
+#endif /* CONFIG_KERNEL_REPLICATION */
+
 	return 0;
 }
 core_initcall(map_entry_trampoline);
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index 4a64089e5771..adf642eba4e7 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -10,6 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/highmem.h>
 #include <linux/slab.h>
+#include <linux/numa_kernel_replication.h>
 
 #include <asm/pgalloc.h>
 #include <asm/page.h>
@@ -17,6 +18,81 @@
 
 static struct kmem_cache *pgd_cache __ro_after_init;
 
+#ifdef CONFIG_KERNEL_REPLICATION
+pgd_t *page_pgd_alloc(struct mm_struct *mm)
+{
+	int nid;
+	gfp_t gfp = GFP_PGTABLE_USER | __GFP_THISNODE;
+	/*
+	 * Kernel replication is not supported for a non-page-size pgd.
+	 * It could be supported in principle, but that requires significant
+	 * changes to page table allocation, so it is left unsupported for now.
+	 */
+	for_each_memory_node(nid) {
+		struct page *page;
+
+		page = alloc_pages_node(nid, gfp, 0);
+		if (!page)
+			goto fail;
+
+		WARN_ON_ONCE(page_to_nid(page) != nid);
+
+		per_node_pgd(mm, nid) = (pgd_t *)page_address(page);
+	}
+
+	for_each_online_node(nid)
+		per_node_pgd(mm, nid) = per_node_pgd(mm, numa_get_memory_node(nid));
+
+	mm->pgd = per_node_pgd(mm, numa_get_memory_node(0));
+
+	return mm->pgd;
+
+fail:
+	pgd_free(mm, mm->pgd);
+
+	return NULL;
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+	pgd_t **pgd_numa = kmalloc(sizeof(pgd_t *) * MAX_NUMNODES, GFP_PGTABLE_KERNEL);
+
+	if (!pgd_numa)
+		return NULL;
+
+	mm->pgd_numa = pgd_numa;
+
+	return page_pgd_alloc(mm);
+}
+
+static void page_pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	int nid;
+	/*
+	 * Kernel replication is not supported for a non-page-size pgd.
+	 * It could be supported in principle, but that requires significant
+	 * changes to page table allocation, so it is left unsupported for now.
+	 */
+	for_each_memory_node(nid) {
+		if (per_node_pgd(mm, nid) == NULL)
+			break;
+		WARN_ON_ONCE(page_to_nid(virt_to_page(per_node_pgd(mm, nid))) != nid);
+		free_page((unsigned long)per_node_pgd(mm, nid));
+	}
+
+	for_each_online_node(nid)
+		per_node_pgd(mm, nid) = NULL;
+
+}
+
+void pgd_free(struct mm_struct *mm, pgd_t *pgd)
+{
+	page_pgd_free(mm, pgd);
+
+	kfree(mm->pgd_numa);
+}
+
+#else /* !CONFIG_KERNEL_REPLICATION */
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	gfp_t gfp = GFP_PGTABLE_USER;
@@ -34,6 +110,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	else
 		kmem_cache_free(pgd_cache, pgd);
 }
+#endif /* CONFIG_KERNEL_REPLICATION */
 
 void __init pgtable_cache_init(void)
 {
-- 
2.34.1
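
For reviewers who do not have the rest of the series at hand, below is a
rough, self-contained user-space model of the per-node pgd bookkeeping the
diff above relies on. It is only a sketch of the assumed behaviour of the
helpers from <linux/numa_kernel_replication.h> (per_node_pgd(),
numa_get_memory_node(), this_node_pgd()), not part of the patch; MAX_NODES
and memory_node[] are illustrative stand-ins for MAX_NUMNODES and the
node-to-memory-node mapping.

/*
 * Illustrative model only -- NOT kernel code and NOT part of the patch.
 * It mimics how the per-node pgd replicas are assumed to be set up and
 * selected on an mm switch.
 */
#include <stdio.h>

#define MAX_NODES 4			/* stand-in for MAX_NUMNODES */

typedef unsigned long pgd_t;		/* stand-in for a pgd page */

struct mm_model {
	pgd_t *pgd;			/* default table (first memory node) */
	pgd_t *pgd_numa[MAX_NODES];	/* per-node replicas */
};

/* Stand-in for numa_get_memory_node(): nodes 1 and 3 are CPU-only. */
static const int memory_node[MAX_NODES] = { 0, 0, 2, 2 };

static pgd_t replicas[MAX_NODES];	/* pretend per-node pgd pages */

/* Model of page_pgd_alloc(): one replica per memory node, then let
 * memoryless nodes share the replica of their designated memory node. */
static void model_pgd_alloc(struct mm_model *mm)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		if (memory_node[nid] == nid)
			mm->pgd_numa[nid] = &replicas[nid];

	for (int nid = 0; nid < MAX_NODES; nid++)
		mm->pgd_numa[nid] = mm->pgd_numa[memory_node[nid]];

	mm->pgd = mm->pgd_numa[memory_node[0]];
}

/* Model of this_node_pgd(): what check_and_switch_context() now loads. */
static pgd_t *model_this_node_pgd(struct mm_model *mm, int nid)
{
	return mm->pgd_numa[nid];
}

int main(void)
{
	struct mm_model mm = { 0 };

	model_pgd_alloc(&mm);
	for (int nid = 0; nid < MAX_NODES; nid++)
		printf("node %d -> replica %p\n",
		       nid, (void *)model_this_node_pgd(&mm, nid));
	return 0;
}

The two loops in model_pgd_alloc() mirror page_pgd_alloc() above: only nodes
with memory get their own replica, every other node aliases the replica of
its designated memory node, and the mm-switch path simply loads whichever
replica belongs to the node the CPU is running on.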