HVO was previously disabled on arm64 [1] due to the lack of necessary BBM(break-before-make) logic when changing page tables. This set of patches fix this by adding necessary BBM sequence when changing page table, and supporting vmemmap page fault handling to fixup kernel address translation fault if vmemmap is concurrently accessed.
I have tested this patch set with concurrently accessing the vmemmap address when do BBM and can recover by vmemmap fault handler. Also tested under the config of 2/3/4 pgtable levels with 4K/64K page size and all works well.
[1] commit 060a2c92d1b6 ("arm64: mm: hugetlb: Disable HUGETLB_PAGE_OPTIMIZE_VMEMMAP")
Mark Rutland (1): arm64: mm: kfence: only handle translation faults
Nanyong Sun (2): mm: HVO: introduce helper function to update and flush pgtable arm64: mm: HVO: support BBM of vmemmap pgtable safely
arch/arm64/include/asm/esr.h | 4 ++ arch/arm64/include/asm/pgtable.h | 6 ++ arch/arm64/include/asm/tlbflush.h | 16 +++++ arch/arm64/mm/fault.c | 98 +++++++++++++++++++++++++++++-- arch/arm64/mm/mmu.c | 29 +++++++++ mm/sparse-vmemmap.c | 49 +++++++++++++--- 6 files changed, 189 insertions(+), 13 deletions(-)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/9164 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/9164 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/F...
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8YLWW
--------------------------------
Add pmd/pte update and tlb flush helper function to update page table. This refactoring patch is designed to facilitate each architecture to implement its own special logic in preparation for the arm64 architecture to follow the necessary break-before-make sequence when updating page tables.
Signed-off-by: Nanyong Sun sunnanyong@huawei.com Reviewed-by: Muchun Song songmuchun@bytedance.com --- mm/sparse-vmemmap.c | 45 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 38 insertions(+), 7 deletions(-)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 6803c89c5d21..bf5bb7872e9c 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -54,6 +54,37 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; };
+#ifndef vmemmap_update_pmd +static inline void vmemmap_update_pmd(unsigned long addr, + pmd_t *pmdp, pte_t *ptep) +{ + pmd_populate_kernel(&init_mm, pmdp, ptep); +} +#endif + +#ifndef vmemmap_update_pte +static inline void vmemmap_update_pte(unsigned long addr, + pte_t *ptep, pte_t pte) +{ + set_pte_at(&init_mm, addr, ptep, pte); +} +#endif + +#ifndef vmemmap_flush_tlb_all +static inline void vmemmap_flush_tlb_all(void) +{ + flush_tlb_all(); +} +#endif + +#ifndef vmemmap_flush_tlb_range +static inline void vmemmap_flush_tlb_range(unsigned long start, + unsigned long end) +{ + flush_tlb_kernel_range(start, end); +} +#endif + static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; @@ -88,8 +119,8 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
/* Make pte visible before pmd. See comment in __pte_alloc(). */ smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - flush_tlb_kernel_range(start, start + PMD_SIZE); + vmemmap_update_pmd(start, pmd, pgtable); + vmemmap_flush_tlb_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } @@ -221,7 +252,7 @@ static int vmemmap_remap_range(unsigned long start, unsigned long end, return ret; } while (pgd++, addr = next, addr != end);
- flush_tlb_kernel_range(start, end); + vmemmap_flush_tlb_range(start, end);
return 0; } @@ -269,15 +300,15 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
/* * Makes sure that preceding stores to the page contents from - * vmemmap_remap_free() become visible before the set_pte_at() - * write. + * vmemmap_remap_free() become visible before the + * vmemmap_update_pte() write. */ smp_wmb(); }
entry = mk_pte(walk->reuse_page, pgprot); list_add_tail(&page->lru, walk->vmemmap_pages); - set_pte_at(&init_mm, addr, pte, entry); + vmemmap_update_pte(addr, pte, entry); }
/* @@ -315,7 +346,7 @@ static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to);
- set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); + vmemmap_update_pte(addr, pte, mk_pte(page, pgprot)); }
/**
From: Mark Rutland mark.rutland@arm.com
mainline inclusion from mainline-v6.2-rc1 commit 0bb1fbffc631064db567ccaeb9ed6b6df6342b66 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8YLWW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------------
Alexander noted that KFENCE only expects to handle faults from invalid page table entries (i.e. translation faults), but arm64's fault handling logic will call kfence_handle_page_fault() for other types of faults, including alignment faults caused by unaligned atomics. This has the unfortunate property of causing those other faults to be reported as "KFENCE: use-after-free", which is misleading and hinders debugging.
Fix this by only forwarding unhandled translation faults to the KFENCE code, similar to what x86 does already.
Alexander has verified that this passes all the tests in the KFENCE test suite and avoids bogus reports on misaligned atomics.
Link: https://lore.kernel.org/all/20221102081620.1465154-1-zhongbaisong@huawei.com... Fixes: 840b23986344 ("arm64, kfence: enable KFENCE for ARM64") Signed-off-by: Mark Rutland mark.rutland@arm.com Reviewed-by: Alexander Potapenko glider@google.com Tested-by: Alexander Potapenko glider@google.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Marco Elver elver@google.com Cc: Will Deacon will@kernel.org Link: https://lore.kernel.org/r/20221114104411.2853040-1-mark.rutland@arm.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Nanyong Sun sunnanyong@huawei.com --- arch/arm64/mm/fault.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9bd10909861a..629b55672eb8 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -299,6 +299,11 @@ static void die_kernel_fault(const char *msg, unsigned long addr, do_exit(SIGKILL); }
+static bool is_translation_fault(unsigned long esr) +{ + return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; +} + static void __do_kernel_fault(unsigned long addr, unsigned int esr, struct pt_regs *regs) { @@ -325,7 +330,8 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) + if (is_translation_fault(esr) && + kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) return;
msg = "paging request";
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8YLWW
--------------------------------
Implement vmemmap_update_pmd and vmemmap_update_pte on arm64 to do BBM(break-before-make) logic when change the page table of vmemmap address, they will under the init_mm.page_table_lock. If a translation fault of vmemmap address concurrently happened after pte/pmd cleared, vmemmap page fault handler will acquire the init_mm.page_table_lock to wait for vmemmap update to complete, by then the virtual address is valid again, so PF can return and access can continue. In other case, do the traditional kernel fault.
Implement vmemmap_flush_tlb_all/range on arm64 with nothing to do because tlb already flushed in every single BBM.
Fixes: 1e63ac088f20 ("arm64: mm: hugetlb: enable HUGETLB_PAGE_FREE_VMEMMAP for arm64") Signed-off-by: Nanyong Sun sunnanyong@huawei.com --- arch/arm64/include/asm/esr.h | 4 ++ arch/arm64/include/asm/pgtable.h | 6 ++ arch/arm64/include/asm/tlbflush.h | 16 ++++++ arch/arm64/mm/fault.c | 94 +++++++++++++++++++++++++++++-- arch/arm64/mm/mmu.c | 29 ++++++++++ mm/sparse-vmemmap.c | 4 +- 6 files changed, 146 insertions(+), 7 deletions(-)
diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index 34b009cd6004..20c6f7f51761 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -112,6 +112,10 @@ #define ESR_ELx_FSC_SERROR (0x11) #define ESR_ELx_FSC_ACCESS (0x08) #define ESR_ELx_FSC_FAULT (0x04) +#define ESR_ELx_FSC_FAULT_L0 (0x04) +#define ESR_ELx_FSC_FAULT_L1 (0x05) +#define ESR_ELx_FSC_FAULT_L2 (0x06) +#define ESR_ELx_FSC_FAULT_L3 (0x07) #define ESR_ELx_FSC_PERM (0x0C)
/* ISS field definitions for Data Aborts */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 4e7e047d6fb9..f914c30b7487 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1038,6 +1038,12 @@ static inline pgprot_t arch_filter_pgprot(pgprot_t prot) return PAGE_READONLY_EXEC; }
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep); +#define vmemmap_update_pmd vmemmap_update_pmd +void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte); +#define vmemmap_update_pte vmemmap_update_pte +#endif
#endif /* !__ASSEMBLY__ */
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index 4c28c6c4acba..353ec955915e 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -441,6 +441,22 @@ static inline void __flush_tlb_kernel_pgtable(unsigned long kaddr) dsb(ish); isb(); } + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline void vmemmap_flush_tlb_all(void) +{ + /* do nothing, already flushed tlb in every single BBM */ +} +#define vmemmap_flush_tlb_all vmemmap_flush_tlb_all + +static inline void vmemmap_flush_tlb_range(unsigned long start, + unsigned long end) +{ + /* do nothing, already flushed tlb in every single BBM */ +} +#define vmemmap_flush_tlb_range vmemmap_flush_tlb_range +#endif + #endif
#endif diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 629b55672eb8..a2c61725c176 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -299,6 +299,85 @@ static void die_kernel_fault(const char *msg, unsigned long addr, do_exit(SIGKILL); }
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +static inline bool vmemmap_fault_may_fixup(unsigned long addr, + unsigned long esr) +{ + if (!hugetlb_optimize_vmemmap_enabled()) + return false; + + if (addr < VMEMMAP_START || addr >= VMEMMAP_END) + return false; + + /* + * Only try to handle translation fault level 2 or level 3, + * because hugetlb vmemmap optimize only clear pmd or pte. + */ + switch (esr & ESR_ELx_FSC) { + case ESR_ELx_FSC_FAULT_L2: + case ESR_ELx_FSC_FAULT_L3: + return true; + default: + return false; + } +} + +/* + * PMD mapped vmemmap should has been split as PTE mapped + * by HVO now, here we only check this case, other cases + * should fail. + * Also should check the addr is healthy enough that will not cause + * a level2 or level3 translation fault again after page fault + * handled with success, so we need check both bits[1:0] of PMD and + * PTE as ARM Spec mentioned below: + * A Translation fault is generated if bits[1:0] of a translation + * table descriptor identify the descriptor as either a Fault + * encoding or a reserved encoding. + */ +static inline bool vmemmap_addr_healthy(unsigned long addr) +{ + pmd_t *pmdp, pmd; + pte_t *ptep, pte; + + pmdp = pmd_off_k(addr); + pmd = READ_ONCE(*pmdp); + if (!pmd_table(pmd)) + return false; + + ptep = pte_offset_kernel(pmdp, addr); + pte = ptep_get(ptep); + return (pte_val(pte) & PTE_TYPE_MASK) == PTE_TYPE_PAGE; +} + +static bool vmemmap_handle_page_fault(unsigned long addr, + unsigned long esr) +{ + bool ret; + unsigned long flags; + + if (likely(!vmemmap_fault_may_fixup(addr, esr))) + return false; + + spin_lock_irqsave(&init_mm.page_table_lock, flags); + ret = vmemmap_addr_healthy(addr); + spin_unlock_irqrestore(&init_mm.page_table_lock, flags); + + return ret; +} +#else +static inline bool vmemmap_fault_may_fixup(unsigned long addr, + unsigned long esr) +{ + return false; +} + +static inline bool vmemmap_handle_page_fault(unsigned long addr, + unsigned long esr) +{ + return false; +} +#endif /* CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP */ + static bool is_translation_fault(unsigned long esr) { return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT; @@ -316,9 +395,11 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, if (!is_el1_instruction_abort(esr) && fixup_exception(regs)) return;
- if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs), - "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr)) + if (is_spurious_el1_translation_fault(addr, esr, regs)) { + WARN_RATELIMIT(!vmemmap_fault_may_fixup(addr, esr), + "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr); return; + }
if (is_el1_permission_fault(addr, esr, regs)) { if (esr & ESR_ELx_WNR) @@ -330,9 +411,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned int esr, } else if (addr < PAGE_SIZE) { msg = "NULL pointer dereference"; } else { - if (is_translation_fault(esr) && - kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) - return; + if (is_translation_fault(esr)) { + if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs)) + return; + if (vmemmap_handle_page_fault(addr, esr)) + return; + }
msg = "paging request"; } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 095c192c729b..adaca1fd5a27 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1148,6 +1148,35 @@ static void free_empty_tables(unsigned long addr, unsigned long end, #endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP + +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP +/* + * In the window between the page table entry is cleared and filled + * with a new value, other threads have the opportunity to concurrently + * access the vmemmap area then page translation fault occur. + * Therefore, we need to ensure that the init_mm.page_table_lock is held + * to synchronize the vmemmap page fault handling which will wait for + * this lock to be released to ensure that the page table entry has been + * refreshed with a new valid value. + */ +void vmemmap_update_pmd(unsigned long addr, pmd_t *pmdp, pte_t *ptep) +{ + lockdep_assert_held(&init_mm.page_table_lock); + pmd_clear(pmdp); + flush_tlb_kernel_range(addr, addr + PMD_SIZE); + pmd_populate_kernel(&init_mm, pmdp, ptep); +} + +void vmemmap_update_pte(unsigned long addr, pte_t *ptep, pte_t pte) +{ + spin_lock_irq(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, ptep); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + set_pte_at(&init_mm, addr, ptep, pte); + spin_unlock_irq(&init_mm.page_table_lock); +} +#endif + #if !ARM64_SWAPPER_USES_SECTION_MAPS int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, struct vmem_altmap *altmap) diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index bf5bb7872e9c..a47b027af1f7 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -107,7 +107,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) set_pte_at(&init_mm, addr, pte, entry); }
- spin_lock(&init_mm.page_table_lock); + spin_lock_irq(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to @@ -124,7 +124,7 @@ static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) } else { pte_free_kernel(&init_mm, pgtable); } - spin_unlock(&init_mm.page_table_lock); + spin_unlock_irq(&init_mm.page_table_lock);
return 0; }