[PATCH v4 OLK-6.6 0/4] arm64/ras: Firmware-first SEI error handling with ESB synchronization
This series introduces firmware-first RAS (Reliability, Availability, and Serviceability) error handling for ARM64 SEI (SError Interrupt) errors, with support for ESB (Error Synchronization Barrier) synchronization. The implementation includes: - Runtime sysctl to enable/disable ESB synchronization for SEI - Entry code patching to use ESB instruction for SEI handling - Vendor-specific SEI error handling via APEI/GHES - Sysctl interface for runtime control of vendor SEI handling Liao Chang (1): arm64/entry: Add support to synchronize SEI at the exception boundary Wupeng Ma (2): ACPI/APEI/arm64: add vendor SEI handling for firmware-first RAS arm64: openeuler_defconfig: enable CONFIG_ARM64_SYNC_SEI by default Zheng Chuan (1): arm64: Add runtime switch to control ESB for SEI synchronization Documentation/admin-guide/sysctl/kernel.rst | 27 ++++ arch/arm64/Kconfig | 14 ++ arch/arm64/configs/openeuler_defconfig | 1 + arch/arm64/include/asm/acpi.h | 2 + arch/arm64/include/asm/setup.h | 11 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/arm64_sync_sei.c | 124 +++++++++++++++ arch/arm64/kernel/entry.S | 64 ++++++++ arch/arm64/kernel/traps.c | 16 ++ arch/arm64/kernel/vmlinux.lds.S | 9 ++ arch/arm64/kernel/xcall/entry.S | 6 +- drivers/acpi/apei/apei-internal.h | 2 + drivers/acpi/apei/ghes-vendor-info.c | 162 +++++++++++++++++++- drivers/acpi/apei/ghes.c | 5 + 14 files changed, 441 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/kernel/arm64_sync_sei.c -- 2.43.0
From: Liao Chang <liaochang1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9116 ------------------------------------------ In order to support hisilicon TF firmware which delegates SEI to lower exception software by jumping to exception table directly, it needs some changes to the standard Arm64 exception handling. Signed-off-by: Liao Chang <liaochang1@huawei.com> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/Kconfig | 9 ++++++ arch/arm64/kernel/entry.S | 52 +++++++++++++++++++++++++++++++++ arch/arm64/kernel/xcall/entry.S | 6 ++-- 3 files changed, 65 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 74e4639776de..86fbcc277e56 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2505,6 +2505,15 @@ config ARM64_PSEUDO_NMI If unsure, say N +config ARM64_SYNC_SEI + bool "Use ESB to Synchronize SEI At Exception Boundary(EXPERIMENTAL)" + depends on ARM64_RAS_EXTN + help + For Firmware-First, Use the ESB to synchronize SEI occurs before + exception entry from EL0 and exit to EL0. + + if unsure, say N + if ARM64_PSEUDO_NMI config ARM64_DEBUG_PRIORITY_MASKING bool "Debug interrupt priority masking" diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 039ec8d40899..72135826baad 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,6 +29,29 @@ #include <asm/asm-uaccess.h> #include <asm/unistd.h> + .macro sync_sei, label = esb +#ifdef CONFIG_ARM64_SYNC_SEI + .if \label != xcall + /* Use ESB to synchronize SEI at the entry and exit of exception */ + esb + .endif + .endm + + .macro sei_restore_sp_el0, tmp1:req, tmp2:req + /* + * It must restore SP_EL0 from per-cpu variable __entry_task, since TF + * firmware clobbers the SP_EL0 before SEI is delegated back. 
+ */ + mov \tmp1, (1UL << VA_BITS) + mrs \tmp2, sp_el0 + cmp \tmp2, \tmp1 + b.cs .Lskip_sp_el0_restore + ldr_this_cpu \tmp2, __entry_task, \tmp1 + msr sp_el0, \tmp2 +.Lskip_sp_el0_restore: +#endif /* CONFIG_ARM64_SYNC_SEI */ + .endm + .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 mov x\n, xzr @@ -39,6 +62,7 @@ .align 7 .Lventry_start\@: .if \el == 0 + sync_sei \label /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. @@ -75,7 +99,15 @@ tbnz x0, #THREAD_SHIFT, 0f sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp +#ifdef CONFIG_FAST_SYSCALL + .if \label == xcall + b el\el\ht\()_\regsize\()_sync + .else + b el\el\ht\()_\regsize\()_\label + .endif +#else b el\el\ht\()_\regsize\()_\label +#endif 0: /* @@ -107,7 +139,15 @@ sub sp, sp, x0 mrs x0, tpidrro_el0 #endif +#ifdef CONFIG_FAST_SYSCALL + .if \label == xcall + b el\el\ht\()_\regsize\()_sync + .else + b el\el\ht\()_\regsize\()_\label + .endif +#else b el\el\ht\()_\regsize\()_\label +#endif .org .Lventry_start\@ + 128 // Did we overflow the ventry slot? 
.endm @@ -715,6 +755,13 @@ SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) .endif #endif kernel_entry \el, \regsize + +#ifdef CONFIG_ARM64_SYNC_SEI + .if \el == 1 && \ht == h && \label == error + sei_restore_sp_el0 x20, x21 + .endif +#endif + mov x0, sp bl el\el\ht\()_\regsize\()_\label\()_handler .if \el == 0 @@ -832,6 +879,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: + sync_sei .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif @@ -906,6 +954,10 @@ alternative_endif .endif // \bhb == BHB_MITIGATION_FW add x30, x30, #(1b - \vector_start + 4) +#ifdef CONFIG_ARM64_SYNC_SEI + /* Skip the 'ESB' and 'B' at default vector entry */ + add x30, x30, #4 +#endif ret .org 1b + 128 // Did we overflow the ventry slot? .endm diff --git a/arch/arm64/kernel/xcall/entry.S b/arch/arm64/kernel/xcall/entry.S index d5ed68db1547..460922506e62 100644 --- a/arch/arm64/kernel/xcall/entry.S +++ b/arch/arm64/kernel/xcall/entry.S @@ -209,13 +209,13 @@ SYM_CODE_START_LOCAL(el0t_fast_syscall) SYM_CODE_END(el0t_fast_syscall) SYM_CODE_START_LOCAL(el0t_64_sync_ventry) - kernel_ventry 0, t, 64, sync + kernel_ventry 0, t, 64, xcall SYM_CODE_END(el0t_64_sync_ventry) SYM_CODE_START_LOCAL(el0t_64_sync_ventry_vector) ldp x20, x21, [sp, #16 * 10] add sp, sp, #PT_REGS_SIZE - kernel_ventry 0, t, 64, sync + kernel_ventry 0, t, 64, xcall SYM_CODE_END(el0t_64_sync_ventry_vector) SYM_CODE_START_LOCAL(el0t_64_sync_table) @@ -240,6 +240,7 @@ SYM_CODE_END(el0t_64_sync_table) .macro xcall_ventry .align 7 .Lventry_start\@: + sync_sei /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. @@ -266,6 +267,7 @@ SYM_CODE_END(el0t_64_sync_table) .macro sync_ventry .align 7 .Lventry_start\@: + sync_sei /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. -- 2.43.0
From: Zheng Chuan <zhengchuan@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9116 ------------------------------------------ Background: ESB (Error Synchronization Barrier) is used to synchronize SEI (SError Interrupt) at exception boundaries in firmware-first RAS model. However, ESB has performance impact and may not be needed on all platforms. Design: Add runtime sysctl (/proc/sys/kernel/arm64_sync_sei) that allows dynamic toggling of ESB (Error Synchronization Barrier) instruction patching at exception entry boundaries without rebooting. Implementation: - Default is disabled (arm64_sync_sei = false) to minimize impact - When enabled via 'arm64_sync_sei', ESB is inserted at: - Exception entry from EL0 - Exception return to EL0 (before ERET) - Use linker section (.esb_patch_table) to collect ESB patch point addresses at link time, then aarch64_insn_patch_text() to atomically swap ESB/NOP instructions across all CPUs at runtime. - Simplify arm64_sync_sei_cb() (alternative_cb for sei_restore_sp_el0) to depend solely on ARM64_HAS_RAS_EXTN capability rather than the dynamic toggle, since SP_EL0 restoration in the el1h_64_error handler is a firmware-first correctness requirement independent of whether ESB is active at exception entry. 
Signed-off-by: Zheng Chuan <zhengchuan@huawei.com> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> Signed-off-by: Deng Guangxing <dengguangxing@huawei.com> --- Documentation/admin-guide/sysctl/kernel.rst | 27 +++++ arch/arm64/include/asm/setup.h | 11 ++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/arm64_sync_sei.c | 124 ++++++++++++++++++++ arch/arm64/kernel/entry.S | 18 ++- arch/arm64/kernel/vmlinux.lds.S | 9 ++ 6 files changed, 187 insertions(+), 3 deletions(-) create mode 100644 arch/arm64/kernel/arm64_sync_sei.c diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b8953c49183..0df2d1e151fc 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -238,6 +238,33 @@ Note: to decide what to do with it. +arm64_sync_sei (ARM64 only) +=========================== + +Enable or disable ESB (Error Synchronization Barrier) instruction at +exception entry from EL0 and exception return to EL0 (before ERET), as +well as the vendor-specific SEI handler (apei_claim_sei) for +Uncorrected Recoverable (UER) errors. + +When enabled: + +- ESB synchronizes SEI (SError Interrupt) at exception boundaries in the + firmware-first RAS model, ensuring that any pending SError is handled + at a known point. +- The vendor SEI handler is invoked for UER-type SError, allowing + platform-specific error recovery instead of panicking. + +This has a performance impact on system call throughput and is disabled +by default. + +Requires the CPU to support the RAS extension (ARM64_HAS_RAS_EXTN). +Writing 1 on a CPU without RAS extension has no effect. + +This sysctl does not affect the SP_EL0 restoration in the SError +handler (``sei_restore_sp_el0``), which is always active on RAS-capable +platforms for firmware-first correctness. 
+ + dmesg_restrict ============== diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index 2e4d7da74fb8..ac3bd8fc6b3e 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -44,4 +44,15 @@ static inline bool arch_parse_debug_rodata(char *arg) } #define arch_parse_debug_rodata arch_parse_debug_rodata +#ifdef CONFIG_ARM64_SYNC_SEI +bool arm64_sync_sei_enabled(void); +extern unsigned long __start_esb_patch_table[]; +extern unsigned long __stop_esb_patch_table[]; +#else +static inline bool arm64_sync_sei_enabled(void) +{ + return false; +} +#endif + #endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 300bfcb8a890..5fed5f7d6868 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o obj-$(CONFIG_ARM64_ILP32) += vdso-ilp32/ obj-$(CONFIG_FAST_SYSCALL) += xcall/ obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o +obj-$(CONFIG_ARM64_SYNC_SEI) += arm64_sync_sei.o obj-$(CONFIG_IPI_AS_NMI) += ipi_nmi.o obj-$(CONFIG_HISI_VIRTCCA_GUEST) += virtcca_cvm_guest.o virtcca_cvm_tsi.o obj-$(CONFIG_HISI_VIRTCCA_HOST) += virtcca_cvm_host.o diff --git a/arch/arm64/kernel/arm64_sync_sei.c b/arch/arm64/kernel/arm64_sync_sei.c new file mode 100644 index 000000000000..fc556768a26f --- /dev/null +++ b/arch/arm64/kernel/arm64_sync_sei.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/cpu.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/sysctl.h> +#include <asm/alternative.h> +#include <asm/cpufeature.h> +#include <asm/insn.h> +#include <asm/patching.h> +#include <asm/setup.h> + +static bool arm64_sync_sei __read_mostly; + +/* + * alternative_cb callback for sei_restore_sp_el0 in entry.S. + * + * The SP_EL0 restoration is needed on firmware-first RAS platforms where + * the trusted firmware clobbers SP_EL0 before delegating SEI back to the + * kernel. 
This is a correctness requirement in the el1h_64_error handler + * path, not a performance decision, so it depends solely on the CPU having + * the RAS extension (ARM64_HAS_RAS_EXTN). It is independent of the + * dynamic arm64_sync_sei sysctl which only controls ESB at exception + * boundaries for performance reasons. + */ +void noinstr arm64_sync_sei_cb(struct alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + int i; + + if (cpus_have_cap(ARM64_HAS_RAS_EXTN)) + return; + + /* Keep as NOP */ + for (i = 0; i < nr_inst; i++) + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); +} + +bool arm64_sync_sei_enabled(void) +{ + return arm64_sync_sei; +} + +static int arm64_sync_sei_toggle(bool enable) +{ + unsigned long *table = __start_esb_patch_table; + int count = __stop_esb_patch_table - __start_esb_patch_table; + void **addrs; + u32 *insns; + u32 target_insn; + int i, ret; + + if (!count) + return -ENODEV; + + if (!cpus_have_cap(ARM64_HAS_RAS_EXTN)) + return -ENODEV; + + target_insn = enable + ? 
aarch64_insn_gen_hint(AARCH64_INSN_HINT_ESB) + : aarch64_insn_gen_nop(); + + addrs = kmalloc_array(count, sizeof(void *), GFP_KERNEL); + insns = kmalloc_array(count, sizeof(u32), GFP_KERNEL); + if (!addrs || !insns) { + kfree(addrs); + kfree(insns); + return -ENOMEM; + } + + for (i = 0; i < count; i++) { + addrs[i] = (void *)table[i]; + insns[i] = target_insn; + } + + cpus_read_lock(); + ret = aarch64_insn_patch_text(addrs, insns, count); + cpus_read_unlock(); + + kfree(addrs); + kfree(insns); + + return ret; +} + +static int arm64_sync_sei_sysctl(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + bool old_val = arm64_sync_sei; + + ret = proc_dobool(table, write, buffer, lenp, ppos); + if (ret) + return ret; + + if (write && arm64_sync_sei != old_val) { + ret = arm64_sync_sei_toggle(arm64_sync_sei); + if (ret) + arm64_sync_sei = old_val; + } + + return ret; +} + +static struct ctl_table arm64_sync_sei_sysctl_table[] = { + { + .procname = "arm64_sync_sei", + .data = &arm64_sync_sei, + .maxlen = sizeof(bool), + .mode = 0644, + .proc_handler = arm64_sync_sei_sysctl, + }, +}; + +static int __init arm64_sync_sei_late_init(void) +{ + if (read_cpuid_id() != MIDR_HISI_HIP12) + return 0; + + register_sysctl("kernel", arm64_sync_sei_sysctl_table); + return 0; +} +late_initcall(arm64_sync_sei_late_init); diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 72135826baad..e140b7defa20 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,15 +29,26 @@ #include <asm/asm-uaccess.h> #include <asm/unistd.h> +#ifdef CONFIG_ARM64_SYNC_SEI +/* External symbols from arm64_sync_sei.c */ + .extern arm64_sync_sei_cb +#endif + .macro sync_sei, label = esb #ifdef CONFIG_ARM64_SYNC_SEI .if \label != xcall - /* Use ESB to synchronize SEI at the entry and exit of exception */ - esb + .pushsection ".esb_patch_table", "a" + .quad .Lesb_insn\@ + .popsection +.Lesb_insn\@: + nop .endif +#endif .endm .macro 
sei_restore_sp_el0, tmp1:req, tmp2:req +#ifdef CONFIG_ARM64_SYNC_SEI +alternative_cb ARM64_ALWAYS_SYSTEM, arm64_sync_sei_cb /* * It must restore SP_EL0 from per-cpu variable __entry_task, since TF * firmware clobbers the SP_EL0 before SEI is delegated back. @@ -49,6 +60,7 @@ ldr_this_cpu \tmp2, __entry_task, \tmp1 msr sp_el0, \tmp2 .Lskip_sp_el0_restore: +alternative_cb_end #endif /* CONFIG_ARM64_SYNC_SEI */ .endm @@ -955,7 +967,7 @@ alternative_endif add x30, x30, #(1b - \vector_start + 4) #ifdef CONFIG_ARM64_SYNC_SEI - /* Skip the 'ESB' and 'B' at default vector entry */ + /* Skip the sync_sei NOP/ESB and the 'B' at default vector entry */ add x30, x30, #4 #endif ret diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index d4353741f331..c2edcef563d0 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -190,6 +190,15 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) +#ifdef CONFIG_ARM64_SYNC_SEI + . = ALIGN(8); + .esb_patch_table : { + PROVIDE(__start_esb_patch_table = .); + *(.esb_patch_table) + PROVIDE(__stop_esb_patch_table = .); + } +#endif + HYPERVISOR_DATA_SECTIONS .got : { *(.got) } -- 2.43.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9116 ------------------------------------------ Background: SEI (SError Interrupt) is an asynchronous exception on ARM64 systems. In firmware-first RAS model, firmware handles SEI errors before the OS, recording error information in vendor-specific registers. Problem: When SEI occurs in userspace with Uncorrected Recoverable/Unrecoverable AET (Asynchronous Error Type), the kernel currently cannot recover the error because it doesn't have access to the error information stored by firmware in vendor-specific registers. Solution: Add a vendor handler interface (apei_claim_sei) allowing each SoC vendor to implement platform-specific SEI handling. Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/Kconfig | 7 +- arch/arm64/include/asm/acpi.h | 2 + arch/arm64/kernel/traps.c | 16 +++ drivers/acpi/apei/apei-internal.h | 2 + drivers/acpi/apei/ghes-vendor-info.c | 162 ++++++++++++++++++++++++++- drivers/acpi/apei/ghes.c | 5 + 6 files changed, 192 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 86fbcc277e56..81d2baafdcd6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2507,11 +2507,16 @@ config ARM64_PSEUDO_NMI config ARM64_SYNC_SEI bool "Use ESB to Synchronize SEI At Exception Boundary(EXPERIMENTAL)" - depends on ARM64_RAS_EXTN + depends on ARM64_RAS_EXTN && ACPI_APEI_GHES_ARMP_VENDOR_INFO help For Firmware-First, Use the ESB to synchronize SEI occurs before exception entry from EL0 and exit to EL0. + SEI is an asynchronous exception that can occur on ARMv8 systems. + When firmware handles SEI first, it analyzes and records the error. + This option allows the OS to retrieve the error record and take + appropriate action, such as killing the affected task. 
+ if unsure, say N if ARM64_PSEUDO_NMI diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index c07a58b96329..693dc4c7b817 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -134,9 +134,11 @@ static inline int get_cpu_for_acpi_id(u32 uid) static inline void arch_fix_phys_package_id(int num, u32 slot) { } void __init acpi_init_cpus(void); int apei_claim_sea(struct pt_regs *regs); +int apei_claim_sei(struct pt_regs *regs); #else static inline void acpi_init_cpus(void) { } static inline int apei_claim_sea(struct pt_regs *regs) { return -ENOENT; } +static inline int apei_claim_sei(struct pt_regs *regs) { return -ENOENT; } #endif /* CONFIG_ACPI */ #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 08a5e3d30919..e8ef661715f0 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -30,6 +30,7 @@ #include <linux/kasan.h> #include <linux/ubsan.h> #include <linux/cfi.h> +#include <linux/acpi.h> #include <asm/atomic.h> #include <asm/bug.h> @@ -49,6 +50,8 @@ #include <asm/stacktrace.h> #include <asm/system_misc.h> #include <asm/sysreg.h> +#include <asm/acpi.h> +#include <asm/setup.h> static bool __kprobes __check_eq(unsigned long pstate) { @@ -971,6 +974,12 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { unsigned long aet = arm64_ras_serror_get_severity(esr); + pr_info_ratelimited( + "%s aet: %ld comm: %.20s tgid: %d pid: %d cpu: %d\n", + (regs && user_mode(regs)) ? "userspace" : "kernelspace", aet, + current->comm, current->tgid, current->pid, + raw_smp_processor_id()); + switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ @@ -989,7 +998,14 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) * Neoverse-N1 #1349291 means a non-KVM SError reported as * Unrecoverable should be treated as Uncontainable. 
We * call arm64_serror_panic() in both cases. + * + * Add a vendor handler interface (apei_claim_sei) allowing + * each SoC vendor to implement platform-specific SEI handling + * for UER. */ + if ((aet == ESR_ELx_AET_UER) && !apei_claim_sei(regs)) + return false; + return true; case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index f5519153e32d..50bf372637b6 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -142,11 +142,13 @@ static inline bool apei_page_should_offline(unsigned long pfn) #ifdef CONFIG_ACPI_APEI_GHES_ARMP_VENDOR_INFO bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync); +int ghes_armp_vendor_handle_sei(struct pt_regs *regs); #else static inline bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) { return false; } +static inline int ghes_armp_vendor_handle_sei(struct pt_regs *regs) { return -ENOENT; } #endif #endif diff --git a/drivers/acpi/apei/ghes-vendor-info.c b/drivers/acpi/apei/ghes-vendor-info.c index 55db619638a2..d8bad5be2502 100644 --- a/drivers/acpi/apei/ghes-vendor-info.c +++ b/drivers/acpi/apei/ghes-vendor-info.c @@ -4,10 +4,19 @@ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved. 
*/ +#define pr_fmt(fmt) "GHES: VENDOR: " fmt + #include <linux/init.h> #include <linux/acpi.h> +#include <linux/signal.h> +#include <linux/task_work.h> +#include <linux/genalloc.h> + #include <acpi/ghes.h> #include <acpi/apei.h> + +#include <asm/setup.h> + #include "apei-internal.h" #define HISI_OEM BIT(0) @@ -57,11 +66,152 @@ static bool ghes_hisi_critical_hw_error(struct cper_sec_proc_arm *err, bool sync return (bool)(vendor_info->err_flag & HISI_VENDOR_CRITICAL_ERR); } + +struct sei_task_work { + struct callback_head twork; + u64 pfn; +}; + +static struct gen_pool *hisi_sei_pool; + +static int ghes_hisi_sei_init(void) +{ + unsigned long addr, len = PAGE_SIZE; + int rc; + + if (!IS_ENABLED(CONFIG_ARM64_SYNC_SEI)) + return 0; + + hisi_sei_pool = gen_pool_create(ilog2(sizeof(struct sei_task_work)), -1); + if (!hisi_sei_pool) + return -ENOMEM; + + addr = (unsigned long)kzalloc(PAGE_ALIGN(len), GFP_KERNEL); + if (!addr) + goto err_pool_alloc; + + rc = gen_pool_add(hisi_sei_pool, addr, PAGE_ALIGN(len), -1); + if (rc) + goto err_pool_add; + + return 0; + +err_pool_add: + kfree((void *)addr); + +err_pool_alloc: + gen_pool_destroy(hisi_sei_pool); + hisi_sei_pool = NULL; + + pr_warn("%s init failed\n", __func__); + return -ENOMEM; +} + +static void hisi_sei_kill_task_work(struct callback_head *twork) +{ + struct sei_task_work *ctx = container_of(twork, struct sei_task_work, twork); + + kill_accessing_process(ctx->pfn, MF_ACTION_REQUIRED, true); + gen_pool_free(hisi_sei_pool, (unsigned long)ctx, sizeof(*ctx)); +} + +/* + * Read SEI error address from HiSilicon RAS registers. + * - s3_3_c15_c0_1 lower 16 bits combined with s3_3_c15_c0_0 form the error physical address. + * - Clear the registers after reading to acknowledge the error. 
+ */ +static inline u64 hisi_sei_get_error_pa(void) +{ + u64 sw_res_reg0 = read_sysreg(s3_3_c15_c0_0); + u64 sw_res_reg1 = read_sysreg(s3_3_c15_c0_1); + u64 pa = ((sw_res_reg1 & 0xFFFFUL) << 32) | (sw_res_reg0 & 0xFFFFFFFFUL); + + write_sysreg(0, s3_3_c15_c0_0); + write_sysreg(sw_res_reg1 & ~0xFFFFUL, s3_3_c15_c0_1); + + return pa; +} + +/* + * Handle fatal SEI error by scheduling a task work to kill the affected process. + * @err_pa: The physical address that triggered the SEI. + * + * This function allocates a task work structure from a pre-allocated pool and + * schedules it to run on the current task. The task work will invoke + * kill_accessing_process() to send a SIGKILL to the process that has the + * error address mapped. This mechanism is used for memory errors in user-space + * accessible regions managed by drivers. + * + * Return: true if task work is successfully scheduled, false otherwise. + */ +static bool hisi_sei_kill_task(void) +{ + struct sei_task_work *ctx; + unsigned long err_pa; + + if (!hisi_sei_pool) + return false; + + ctx = (void *)gen_pool_alloc(hisi_sei_pool, sizeof(*ctx)); + if (!ctx) { + pr_warn_ratelimited("alloc task work failed\n"); + return false; + } + + err_pa = hisi_sei_get_error_pa(); + if (!err_pa) { + pr_warn_ratelimited("err pa is not valid\n"); + gen_pool_free(hisi_sei_pool, (unsigned long)ctx, sizeof(*ctx)); + return false; + } + + ctx->pfn = PHYS_PFN(err_pa); + init_task_work(&ctx->twork, hisi_sei_kill_task_work); + if (task_work_add(current, &ctx->twork, TWA_RESUME)) { + pr_warn_ratelimited("task work add failed\n"); + gen_pool_free(hisi_sei_pool, (unsigned long)ctx, sizeof(*ctx)); + return false; + } + return true; +} + +/* + * Handle HiSilicon specific Synchronous External Interrupt (SEI) errors. + * @regs: exception registers, NULL if from user space + * + * This function processes vendor-specific SEI errors for HiSilicon platforms. 
+ * For user space errors, it reads the error physical address from RAS registers + * and schedules a task work to kill the accessing task. If recovery fails or + * the error is from kernel space, the current process is terminated with SIGKILL. + * + * Return: 0 if SEI is handled, -ENOENT if not applicable or unsupported. + */ +static int ghes_hisi_handle_sei(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_ARM64_SYNC_SEI)) + return -ENOENT; + + if (!arm64_sync_sei_enabled()) + return -ENOENT; + + if (!current->mm) + return -ENOENT; + + if ((!regs || user_mode(regs)) && hisi_sei_kill_task()) + return 0; + + pr_err("Sending SIGKILL to comm: %s, pid: %d, tgid: %d due to sei not recovered\n", + current->comm, current->pid, current->tgid); + force_sig(SIGKILL); + return 0; +} #else static inline bool ghes_hisi_critical_hw_error(struct cper_sec_proc_arm *err, bool sync) { return false; } +static int ghes_hisi_sei_init(void) { return 0; } +static int ghes_hisi_handle_sei(struct pt_regs *regs) { return -ENOENT; } #endif bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) @@ -72,6 +222,14 @@ bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) return false; } +int ghes_armp_vendor_handle_sei(struct pt_regs *regs) +{ + if (vender_oem & HISI_OEM) + return ghes_hisi_handle_sei(regs); + + return -ENOENT; +} + static int __init ghes_check_oem_table(void) { struct acpi_table_header *tbl; @@ -81,8 +239,10 @@ static int __init ghes_check_oem_table(void) if (ACPI_FAILURE(status) || !tbl) return -ENODEV; - if (!memcmp(tbl->oem_id, "HISI ", ACPI_OEM_ID_SIZE)) + if (!memcmp(tbl->oem_id, "HISI ", ACPI_OEM_ID_SIZE)) { vender_oem |= HISI_OEM; + ghes_hisi_sei_init(); + } acpi_put_table(tbl); return 0; diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 34ab844bd013..1d8890405a87 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1699,3 +1699,8 @@ void ghes_unregister_report_chain(struct 
notifier_block *nb) atomic_notifier_chain_unregister(&ghes_report_chain, nb); } EXPORT_SYMBOL_GPL(ghes_unregister_report_chain); + +int apei_claim_sei(struct pt_regs *regs) +{ + return ghes_armp_vendor_handle_sei(regs); +} -- 2.43.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/9116 ------------------------------------------ Enable CONFIG_ARM64_SYNC_SEI by default. Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index c427eab333cd..5409b7435c38 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -604,6 +604,7 @@ CONFIG_ARM64_HISI_IPIV=y CONFIG_ARM64_SVE=y CONFIG_ARM64_SME=y CONFIG_ARM64_PSEUDO_NMI=y +CONFIG_ARM64_SYNC_SEI=y # CONFIG_ARM64_DEBUG_PRIORITY_MASKING is not set CONFIG_IPI_AS_NMI=y CONFIG_NON_NMI_IPI_BACKTRACE=y -- 2.43.0
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/22447 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/Q3A... FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/22447 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/Q3A...
participants (2)
-
patchwork bot -
Wupeng Ma