[PATCH OLK-6.6 0/5] arm64/ras: Firmware-first SEI error handling with ESB synchronization
This series introduces firmware-first RAS (Reliability, Availability, and Serviceability) error handling for ARM64 SEI (SError Interrupt) errors, with support for ESB (Error Synchronization Barrier) synchronization. The implementation includes: - Boot parameter to enable/disable ESB synchronization for SEI - Entry code patching to use ESB instruction for SEI handling - Vendor-specific SEI error handling via APEI/GHES - Sysctl interface for runtime control of vendor SEI handling Liao Chang (1): arm64/entry: Add support to synchronize SEI at the exception boundary Wupeng Ma (3): ACPI/APEI/arm64: add vendor SEI handling for firmware-first RAS ACPI: APEI: add runtime switch for HiSilicon vendor SEI handling arm64: openeuler_defconfig: enable CONFIG_ARM64_SYNC_SEI by default Zheng Chuan (1): arm64: Add boot parameter to control ESB for SEI synchronization .../admin-guide/kernel-parameters.txt | 8 + Documentation/admin-guide/sysctl/kernel.rst | 27 +++ arch/arm64/Kconfig | 14 ++ arch/arm64/configs/openeuler_defconfig | 1 + arch/arm64/include/asm/acpi.h | 2 + arch/arm64/include/asm/setup.h | 9 + arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/arm64_sync_sei.c | 45 +++++ arch/arm64/kernel/entry.S | 44 +++++ arch/arm64/kernel/traps.c | 15 ++ arch/arm64/kernel/xcall/entry.S | 2 + drivers/acpi/apei/apei-internal.h | 2 + drivers/acpi/apei/ghes-vendor-info.c | 186 +++++++++++++++++- drivers/acpi/apei/ghes.c | 5 + 14 files changed, 360 insertions(+), 1 deletion(-) create mode 100644 arch/arm64/kernel/arm64_sync_sei.c -- 2.43.0
From: Liao Chang <liaochang1@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/8918 ------------------------------------------ To support the HiSilicon TF firmware, which delegates SEI to lower-exception-level software by jumping directly to VBAR_EL2 + el1h_64_error offset, some changes to the standard arm64 exception handling are needed: - Place ESB at the exception vector entry from EL0 and before ERET returns to EL0. NOTICE: The exception vectors that trap from EL0 have four different versions depending on the kernel config. el0t_64_sync traps to the code generated by macro sync_ventry when CONFIG_FAST_SYSCALL is enabled. el0t_32_sync traps to the code generated by macro xcall_ventry when CONFIG_ACTLR_XCALL_XINT is enabled. All exceptions from EL0 trap to the code generated by macro tramp_ventry when KPTI or Spectre mitigation is enabled. Otherwise, they trap to the code generated by macro kernel_ventry by default. - Since the HiSilicon TF firmware clobbers SP_EL0 when delegating SError to the lower exception level, SP_EL0 must be restored from the per-cpu variable __entry_task for the el1h_64_error vector. Signed-off-by: Liao Chang <liaochang1@huawei.com> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/Kconfig | 9 +++++++++ arch/arm64/kernel/entry.S | 35 +++++++++++++++++++++++++++++++++ arch/arm64/kernel/xcall/entry.S | 2 ++ 3 files changed, 46 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 74e4639776de..86fbcc277e56 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2505,6 +2505,15 @@ config ARM64_PSEUDO_NMI If unsure, say N +config ARM64_SYNC_SEI + bool "Use ESB to Synchronize SEI At Exception Boundary(EXPERIMENTAL)" + depends on ARM64_RAS_EXTN + help + For Firmware-First, Use the ESB to synchronize SEI occurs before + exception entry from EL0 and exit to EL0. 
+ + if unsure, say N + if ARM64_PSEUDO_NMI config ARM64_DEBUG_PRIORITY_MASKING bool "Debug interrupt priority masking" diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 039ec8d40899..f9f358f41682 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,6 +29,27 @@ #include <asm/asm-uaccess.h> #include <asm/unistd.h> + .macro sync_sei +#ifdef CONFIG_ARM64_SYNC_SEI + /* Use ESB to synchronize SEI at the entry and exit of exception */ + esb + .endm + + .macro sei_restore_sp_el0, tmp1:req, tmp2:req + /* + * It must restore SP_EL0 from per-cpu variable __entry_task, since TF + * firmware clobbers the SP_EL0 before SEI is delegated back. + */ + mov \tmp1, (1UL << VA_BITS) + mrs \tmp2, sp_el0 + cmp \tmp2, \tmp1 + b.cs .Lskip_sp_el0_restore + ldr_this_cpu \tmp2, __entry_task, \tmp1 + msr sp_el0, \tmp2 +.Lskip_sp_el0_restore: +#endif /* CONFIG_ARM64_SYNC_SEI */ + .endm + .macro clear_gp_regs .irp n,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29 mov x\n, xzr @@ -39,6 +60,7 @@ .align 7 .Lventry_start\@: .if \el == 0 + sync_sei /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. 
@@ -482,6 +504,7 @@ alternative_else_nop_endif #endif ldr lr, [sp, #S_LR] + sync_sei add sp, sp, #PT_REGS_SIZE // restore sp /* This must be after the last explicit memory access */ @@ -715,6 +738,13 @@ SYM_CODE_START_LOCAL(el\el\ht\()_\regsize\()_\label) .endif #endif kernel_entry \el, \regsize + +#ifdef CONFIG_ARM64_SYNC_SEI + .if \el == 1 && \ht == h && \label == error + sei_restore_sp_el0 x20, x21 + .endif +#endif + mov x0, sp bl el\el\ht\()_\regsize\()_\label\()_handler .if \el == 0 @@ -832,6 +862,7 @@ alternative_else_nop_endif .macro tramp_ventry, vector_start, regsize, kpti, bhb .align 7 1: + sync_sei .if \regsize == 64 msr tpidrro_el0, x30 // Restored in kernel_ventry .endif @@ -906,6 +937,10 @@ alternative_endif .endif // \bhb == BHB_MITIGATION_FW add x30, x30, #(1b - \vector_start + 4) +#ifdef CONFIG_ARM64_SYNC_SEI + /* Skip the 'ESB' and 'B' at default vector entry */ + add x30, x30, #4 +#endif ret .org 1b + 128 // Did we overflow the ventry slot? .endm diff --git a/arch/arm64/kernel/xcall/entry.S b/arch/arm64/kernel/xcall/entry.S index d5ed68db1547..283a1191abab 100644 --- a/arch/arm64/kernel/xcall/entry.S +++ b/arch/arm64/kernel/xcall/entry.S @@ -240,6 +240,7 @@ SYM_CODE_END(el0t_64_sync_table) .macro xcall_ventry .align 7 .Lventry_start\@: + sync_sei /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. @@ -266,6 +267,7 @@ SYM_CODE_END(el0t_64_sync_table) .macro sync_ventry .align 7 .Lventry_start\@: + sync_sei /* * This must be the first instruction of the EL0 vector entries. It is * skipped by the trampoline vectors, to trigger the cleanup. -- 2.43.0
From: Zheng Chuan <zhengchuan@huawei.com> hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/8918 ------------------------------------------ Background: ESB (Error Synchronization Barrier) is used to synchronize SEI (SError Interrupt) at exception boundaries in firmware-first RAS model. However, ESB has performance impact and may not be needed on all platforms. Design: Add a boot parameter 'arm64_sync_sei' to dynamically control ESB insertion: - Default is disabled (arm64_sync_sei = false) to minimize impact - When enabled via 'arm64_sync_sei', ESB is inserted at: - Exception entry from EL0 - Exception return to EL0 (before ERET) - Uses alternative patching: fills NOP when disabled, keeps ESB when enabled - Checks for ARM64_HAS_RAS_EXTN capability before enabling Signed-off-by: Zheng Chuan <zhengchuan@huawei.com> Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- .../admin-guide/kernel-parameters.txt | 8 ++++ arch/arm64/include/asm/setup.h | 9 ++++ arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/arm64_sync_sei.c | 45 +++++++++++++++++++ arch/arm64/kernel/entry.S | 9 ++++ 5 files changed, 72 insertions(+) create mode 100644 arch/arm64/kernel/arm64_sync_sei.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a437aafa2946..cb4c720de67c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -435,6 +435,14 @@ arm64.nosve [ARM64] Unconditionally disable Scalable Vector Extension support + arm64_sync_sei [ARM64] Enable ESB (Error Synchronization Barrier) to + synchronize SEI (SError Interrupt) at exception boundaries + in firmware-first RAS model. This option has performance + impact and is disabled by default. When enabled via + arm64_sync_sei, ESB is inserted at exception entry from + EL0 and exception return to EL0 (before ERET). Requires + ARM64_HAS_RAS_EXTN capability. 
+ ataflop= [HW,M68k] atarimouse= [HW,MOUSE] Atari Mouse diff --git a/arch/arm64/include/asm/setup.h b/arch/arm64/include/asm/setup.h index 2e4d7da74fb8..a9039c35c813 100644 --- a/arch/arm64/include/asm/setup.h +++ b/arch/arm64/include/asm/setup.h @@ -44,4 +44,13 @@ static inline bool arch_parse_debug_rodata(char *arg) } #define arch_parse_debug_rodata arch_parse_debug_rodata +#ifdef CONFIG_ARM64_SYNC_SEI +bool arm64_sync_sei_enabled(void); +#else +static inline bool arm64_sync_sei_enabled(void) +{ + return false; +} +#endif + #endif diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index 300bfcb8a890..5fed5f7d6868 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_COMPAT_VDSO) += vdso32-wrap.o obj-$(CONFIG_ARM64_ILP32) += vdso-ilp32/ obj-$(CONFIG_FAST_SYSCALL) += xcall/ obj-$(CONFIG_UNWIND_PATCH_PAC_INTO_SCS) += patch-scs.o +obj-$(CONFIG_ARM64_SYNC_SEI) += arm64_sync_sei.o obj-$(CONFIG_IPI_AS_NMI) += ipi_nmi.o obj-$(CONFIG_HISI_VIRTCCA_GUEST) += virtcca_cvm_guest.o virtcca_cvm_tsi.o obj-$(CONFIG_HISI_VIRTCCA_HOST) += virtcca_cvm_host.o diff --git a/arch/arm64/kernel/arm64_sync_sei.c b/arch/arm64/kernel/arm64_sync_sei.c new file mode 100644 index 000000000000..d5592e53383d --- /dev/null +++ b/arch/arm64/kernel/arm64_sync_sei.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/kstrtox.h> +#include <asm/alternative.h> +#include <asm/cpufeature.h> +#include <asm/insn.h> +#include <asm/cache.h> + +static bool arm64_sync_sei __read_mostly; + +static int __init arm64_sync_sei_setup(char *str) +{ + arm64_sync_sei = true; + + pr_info("Enable RAS Extension Support with ESB to synchronize SEI\n"); + return 0; +} +early_param("arm64_sync_sei", arm64_sync_sei_setup); + +/* + * alternative_cb callback: Patch ESB instruction to synchronize SEI + * Called during boot to patch entry.S code + */ +void noinstr arm64_sync_sei_cb(struct 
alt_instr *alt, __le32 *origptr, + __le32 *updptr, int nr_inst) +{ + int i; + + if (arm64_sync_sei && cpus_have_cap(ARM64_HAS_RAS_EXTN)) + return; + + /* mark as invalid since ras extension is not supported */ + arm64_sync_sei = false; + + /* Keep as NOP */ + for (i = 0; i < nr_inst; i++) + updptr[i] = cpu_to_le32(aarch64_insn_gen_nop()); +} + +bool arm64_sync_sei_enabled(void) +{ + return arm64_sync_sei; +} diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index f9f358f41682..7e699f9f5b86 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,13 +29,21 @@ #include <asm/asm-uaccess.h> #include <asm/unistd.h> +#ifdef CONFIG_ARM64_SYNC_SEI +/* External symbols from ras_sync_serror.c */ +.extern arm64_sync_sei_cb +#endif + .macro sync_sei #ifdef CONFIG_ARM64_SYNC_SEI /* Use ESB to synchronize SEI at the entry and exit of exception */ +alternative_cb ARM64_ALWAYS_SYSTEM, arm64_sync_sei_cb esb +alternative_cb_end .endm .macro sei_restore_sp_el0, tmp1:req, tmp2:req +alternative_cb ARM64_ALWAYS_SYSTEM, arm64_sync_sei_cb /* * It must restore SP_EL0 from per-cpu variable __entry_task, since TF * firmware clobbers the SP_EL0 before SEI is delegated back. @@ -47,6 +55,7 @@ ldr_this_cpu \tmp2, __entry_task, \tmp1 msr sp_el0, \tmp2 .Lskip_sp_el0_restore: +alternative_cb_end #endif /* CONFIG_ARM64_SYNC_SEI */ .endm -- 2.43.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/8918 ------------------------------------------ Background: SEI (SError Interrupt) is an asynchronous exception on ARM64 systems. In the firmware-first RAS model, firmware intercepts and analyzes SEI errors before the OS, recording error information in vendor-specific registers. Problem: When SEI occurs in userspace with Uncorrected Recoverable/Unrecoverable AET (Asynchronous Error Type), the kernel currently cannot recover from the error because it doesn't have access to the error information stored by firmware in vendor-specific registers. Solution: Add a vendor handler interface (apei_claim_sei) allowing each SoC vendor to implement platform-specific SEI handling. Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/Kconfig | 7 +- arch/arm64/include/asm/acpi.h | 2 + arch/arm64/kernel/traps.c | 15 +++ drivers/acpi/apei/apei-internal.h | 2 + drivers/acpi/apei/ghes-vendor-info.c | 157 ++++++++++++++++++++++++++- drivers/acpi/apei/ghes.c | 5 + 6 files changed, 186 insertions(+), 2 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 86fbcc277e56..81d2baafdcd6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2507,11 +2507,16 @@ config ARM64_PSEUDO_NMI config ARM64_SYNC_SEI bool "Use ESB to Synchronize SEI At Exception Boundary(EXPERIMENTAL)" - depends on ARM64_RAS_EXTN + depends on ARM64_RAS_EXTN && ACPI_APEI_GHES_ARMP_VENDOR_INFO help For Firmware-First, Use the ESB to synchronize SEI occurs before exception entry from EL0 and exit to EL0. + SEI is an asynchronous exception that can occur on ARMv8 systems. + When firmware handles SEI first, it analyzes and records the error. + This option allows the OS to retrieve the error record and take + appropriate action, such as killing the affected task. 
+ if unsure, say N if ARM64_PSEUDO_NMI diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index c07a58b96329..693dc4c7b817 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -134,9 +134,11 @@ static inline int get_cpu_for_acpi_id(u32 uid) static inline void arch_fix_phys_package_id(int num, u32 slot) { } void __init acpi_init_cpus(void); int apei_claim_sea(struct pt_regs *regs); +int apei_claim_sei(struct pt_regs *regs); #else static inline void acpi_init_cpus(void) { } static inline int apei_claim_sea(struct pt_regs *regs) { return -ENOENT; } +static inline int apei_claim_sei(struct pt_regs *regs) { return -ENOENT; } #endif /* CONFIG_ACPI */ #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 08a5e3d30919..0ba6623f626c 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -30,6 +30,7 @@ #include <linux/kasan.h> #include <linux/ubsan.h> #include <linux/cfi.h> +#include <linux/acpi.h> #include <asm/atomic.h> #include <asm/bug.h> @@ -49,6 +50,7 @@ #include <asm/stacktrace.h> #include <asm/system_misc.h> #include <asm/sysreg.h> +#include <asm/acpi.h> static bool __kprobes __check_eq(unsigned long pstate) { @@ -971,6 +973,12 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) { unsigned long aet = arm64_ras_serror_get_severity(esr); + pr_info_ratelimited( + "%s aet: %ld comm: %.20s tgid: %d pid: %d cpu: %d\n", + (regs && user_mode(regs)) ? "userspace" : "kernelspace", aet, + current->comm, current->tgid, current->pid, + raw_smp_processor_id()); + switch (aet) { case ESR_ELx_AET_CE: /* corrected error */ case ESR_ELx_AET_UEO: /* restartable, not yet consumed */ @@ -989,7 +997,14 @@ bool arm64_is_fatal_ras_serror(struct pt_regs *regs, unsigned long esr) * Neoverse-N1 #1349291 means a non-KVM SError reported as * Unrecoverable should be treated as Uncontainable. We * call arm64_serror_panic() in both cases. 
+ * + * Add a vendor handler interface (apei_claim_sei) allowing + * each SoC vendor to implement platform-specific SEI handling + * for UER. */ + if ((aet == ESR_ELx_AET_UER) && !apei_claim_sei(regs)) + return false; + return true; case ESR_ELx_AET_UC: /* Uncontainable or Uncategorized error */ diff --git a/drivers/acpi/apei/apei-internal.h b/drivers/acpi/apei/apei-internal.h index f5519153e32d..50bf372637b6 100644 --- a/drivers/acpi/apei/apei-internal.h +++ b/drivers/acpi/apei/apei-internal.h @@ -142,11 +142,13 @@ static inline bool apei_page_should_offline(unsigned long pfn) #ifdef CONFIG_ACPI_APEI_GHES_ARMP_VENDOR_INFO bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync); +int ghes_armp_vendor_handle_sei(struct pt_regs *regs); #else static inline bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) { return false; } +static inline int ghes_armp_vendor_handle_sei(struct pt_regs *regs) { return -ENOENT; } #endif #endif diff --git a/drivers/acpi/apei/ghes-vendor-info.c b/drivers/acpi/apei/ghes-vendor-info.c index 55db619638a2..53be618b897e 100644 --- a/drivers/acpi/apei/ghes-vendor-info.c +++ b/drivers/acpi/apei/ghes-vendor-info.c @@ -4,10 +4,19 @@ * Copyright (c) Huawei Technologies Co., Ltd. 2026. All rights reserved. 
*/ +#define pr_fmt(fmt) "GHES: VENDOR: " fmt + #include <linux/init.h> #include <linux/acpi.h> +#include <linux/signal.h> +#include <linux/task_work.h> +#include <linux/genalloc.h> + #include <acpi/ghes.h> #include <acpi/apei.h> + +#include <asm/setup.h> + #include "apei-internal.h" #define HISI_OEM BIT(0) @@ -57,11 +66,147 @@ static bool ghes_hisi_critical_hw_error(struct cper_sec_proc_arm *err, bool sync return (bool)(vendor_info->err_flag & HISI_VENDOR_CRITICAL_ERR); } + +struct sei_task_work { + struct callback_head twork; + u64 pfn; +}; + +static struct gen_pool *hisi_sei_pool; + +static int ghes_hisi_sei_init(void) +{ + unsigned long addr, len = PAGE_SIZE; + int rc; + + if (!IS_ENABLED(CONFIG_ARM64_SYNC_SEI)) + return 0; + + if (!arm64_sync_sei_enabled()) + return 0; + + hisi_sei_pool = gen_pool_create(ilog2(sizeof(struct sei_task_work)), -1); + if (!hisi_sei_pool) + return -ENOMEM; + + addr = (unsigned long)kzalloc(PAGE_ALIGN(len), GFP_KERNEL); + if (!addr) + goto err_pool_alloc; + + rc = gen_pool_add(hisi_sei_pool, addr, PAGE_ALIGN(len), -1); + if (rc) + goto err_pool_add; + + return 0; + +err_pool_add: + kfree((void *)addr); + +err_pool_alloc: + gen_pool_destroy(hisi_sei_pool); + hisi_sei_pool = NULL; + + pr_warn("%s init failed\n", __func__); + return -ENOMEM; +} + +static void hisi_sei_kill_task_work(struct callback_head *twork) +{ + struct sei_task_work *ctx = container_of(twork, struct sei_task_work, twork); + + kill_accessing_process(ctx->pfn, MF_ACTION_REQUIRED, true); + gen_pool_free(hisi_sei_pool, (unsigned long)ctx, sizeof(*ctx)); +} + +/* + * Read SEI error address from HiSilicon RAS registers. + * - s3_3_c15_c0_1 lower 16 bits combined with s3_3_c15_c0_0 form the error physical address. + * - Clear the registers after reading to acknowledge the error. 
+ */ +static inline u64 hisi_sei_get_error_pa(void) +{ + u64 sw_res_reg0 = read_sysreg(s3_3_c15_c0_0); + u64 sw_res_reg1 = read_sysreg(s3_3_c15_c0_1); + u64 pa = ((sw_res_reg1 & 0xFFFFUL) << 32) | (sw_res_reg0 & 0xFFFFFFFFUL); + + write_sysreg(0, s3_3_c15_c0_0); + write_sysreg(sw_res_reg1 & ~0xFFFFUL, s3_3_c15_c0_1); + + return pa; +} + +/* + * Handle fatal SEI error by scheduling a task work to kill the affected process. + * @err_pa: The physical address that triggered the SEI. + * + * This function allocates a task work structure from a pre-allocated pool and + * schedules it to run on the current task. The task work will invoke + * kill_accessing_process() to send a SIGKILL to the process that has the + * error address mapped. This mechanism is used for memory errors in user-space + * accessible regions managed by drivers. + * + * Return: true if task work is successfully scheduled, false otherwise. + */ +static bool hisi_sei_kill_task(void) +{ + struct sei_task_work *ctx; + unsigned long err_pa; + + if (!hisi_sei_pool) + return false; + + ctx = (void *)gen_pool_alloc(hisi_sei_pool, sizeof(*ctx)); + if (!ctx) { + pr_warn_ratelimited("alloc task work failed\n"); + return false; + } + + err_pa = hisi_sei_get_error_pa(); + if (!err_pa) { + pr_warn_ratelimited("err pa is not valid\n"); + return false; + } + + ctx->pfn = PHYS_PFN(err_pa); + init_task_work(&ctx->twork, hisi_sei_kill_task_work); + task_work_add(current, &ctx->twork, TWA_RESUME); + return true; +} + +/* + * Handle HiSilicon specific Synchronous External Interrupt (SEI) errors. + * @regs: exception registers, NULL if from user space + * + * This function processes vendor-specific SEI errors for HiSilicon platforms. + * For user space errors, it reads the error physical address from RAS registers + * and schedules a task work to kill the accessing task. If recovery fails or + * the error is from kernel space, the current process is terminated with SIGKILL. 
+ * + * Return: 0 if SEI is handled, -ENOENT if not applicable or unsupported. + */ +static int ghes_hisi_handle_sei(struct pt_regs *regs) +{ + if (!IS_ENABLED(CONFIG_ARM64_SYNC_SEI)) + return -ENOENT; + + if (!current->mm) + return -ENOENT; + + if ((!regs || user_mode(regs)) && hisi_sei_kill_task()) + return 0; + + pr_err("Sending SIGKILL to comm: %s, pid: %d, tgid: %d due to sei not recovered", + current->comm, current->pid, current->tgid); + force_sig(SIGKILL); + return 0; +} #else static inline bool ghes_hisi_critical_hw_error(struct cper_sec_proc_arm *err, bool sync) { return false; } +static int ghes_hisi_sei_init(void) { return 0; } +static int ghes_hisi_handle_sei(struct pt_regs *regs) { return -ENOENT; } #endif bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) @@ -72,6 +217,14 @@ bool ghes_armp_vendor_critical_error(struct cper_sec_proc_arm *err, bool sync) return false; } +int ghes_armp_vendor_handle_sei(struct pt_regs *regs) +{ + if (vender_oem & HISI_OEM) + return ghes_hisi_handle_sei(regs); + + return -ENOENT; +} + static int __init ghes_check_oem_table(void) { struct acpi_table_header *tbl; @@ -81,8 +234,10 @@ static int __init ghes_check_oem_table(void) if (ACPI_FAILURE(status) || !tbl) return -ENODEV; - if (!memcmp(tbl->oem_id, "HISI ", ACPI_OEM_ID_SIZE)) + if (!memcmp(tbl->oem_id, "HISI ", ACPI_OEM_ID_SIZE)) { vender_oem |= HISI_OEM; + ghes_hisi_sei_init(); + } acpi_put_table(tbl); return 0; diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index f6b68b7b81e9..51cd04307ee4 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1695,3 +1695,8 @@ void ghes_unregister_report_chain(struct notifier_block *nb) atomic_notifier_chain_unregister(&ghes_report_chain, nb); } EXPORT_SYMBOL_GPL(ghes_unregister_report_chain); + +int apei_claim_sei(struct pt_regs *regs) +{ + return ghes_armp_vendor_handle_sei(regs); +} -- 2.43.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/8918 ------------------------------------------ HiSilicon platforms store SEI (SError Interrupt) error information in vendor-specific RAS registers. Accessing these registers incurs overhead and may have side effects on some platforms. In emergency scenarios where ghes_hisi_handle_sei exhibits defects, a runtime switch is needed to provide an escape mechanism without requiring a system reboot. Add a runtime switch via /proc/sys/kernel/ghes_hisi_sei to allow dynamic enable/disable of vendor SEI handling: Design: ------- 1. Runtime Control Mechanism - Feature is enabled by default on supported HiSilicon platforms - Sysctl interface allows immediate toggle without kernel rebuild - Early exit check in SEI handler: when disabled, returns -ENOENT immediately without accessing vendor RAS registers 2. Emergency Escape Path - When ghes_hisi_handle_sei encounters issues, operators can immediately disable the feature via: echo 0 > /proc/sys/kernel/ghes_hisi_sei - This bypasses all vendor-specific SEI processing, returning control to the standard error handling path - No memory allocation or resource cleanup required on toggle 3. 
Resource Lifecycle - Memory pool (hisi_sei_pool) allocated at init time - Simple lifecycle: allocated once, never freed during runtime - sysctl toggle only affects the hisi_sei_enabled flag - No race conditions: flag read in NMI context, safe for atomic access Benefits: --------- - Emergency recovery: Quickly disable problematic SEI handling - Performance testing: Measure overhead without reboot - Platform flexibility: Adapt to different firmware behaviors - Zero downtime: Enable/disable without system restart Usage: ------ # Disable vendor SEI handling (emergency escape) echo 0 > /proc/sys/kernel/ghes_hisi_sei # Re-enable vendor SEI handling echo 1 > /proc/sys/kernel/ghes_hisi_sei Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- Documentation/admin-guide/sysctl/kernel.rst | 27 ++++++++++++++++++ drivers/acpi/apei/ghes-vendor-info.c | 31 ++++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 3b8953c49183..86b2100d46f4 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -555,6 +555,33 @@ synchronous exception from memory copy. safely fail when accessing to hwpoison. = =================================================================== + +ghes_hisi_sei (arm64 only) +========================== + +This parameter controls whether HiSilicon vendor-specific SEI (SError +Interrupt) handling is enabled. On HiSilicon platforms, SEI error information +is stored in vendor-specific RAS registers. Accessing these registers incurs +overhead and may have side effects on some platforms. + +This runtime switch allows dynamic enable/disable of vendor SEI handling +without reboot. + += =============================================================== +0 Disable HiSilicon vendor SEI handling (reduces overhead). +1 Enable HiSilicon vendor SEI handling (default on supported platforms). 
+= =============================================================== + +Usage:: + + # Disable vendor SEI handling + echo 0 > /proc/sys/kernel/ghes_hisi_sei + + # Re-enable vendor SEI handling + echo 1 > /proc/sys/kernel/ghes_hisi_sei + +See Documentation/admin-guide/ras/arm64.rst for more information. + modprobe ======== diff --git a/drivers/acpi/apei/ghes-vendor-info.c b/drivers/acpi/apei/ghes-vendor-info.c index 53be618b897e..d0ca6b25b8c9 100644 --- a/drivers/acpi/apei/ghes-vendor-info.c +++ b/drivers/acpi/apei/ghes-vendor-info.c @@ -11,6 +11,7 @@ #include <linux/signal.h> #include <linux/task_work.h> #include <linux/genalloc.h> +#include <linux/sysctl.h> #include <acpi/ghes.h> #include <acpi/apei.h> @@ -22,6 +23,7 @@ #define HISI_OEM BIT(0) static int vender_oem __ro_after_init; +static int hisi_sei_enabled; #ifdef CONFIG_ARCH_HISI @@ -74,8 +76,22 @@ struct sei_task_work { static struct gen_pool *hisi_sei_pool; +static struct ctl_table hisi_sei_sysctl_table[] = { + { + .procname = "ghes_hisi_sei", + .data = &hisi_sei_enabled, + .maxlen = sizeof(hisi_sei_enabled), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + {} +}; + static int ghes_hisi_sei_init(void) { + struct ctl_table_header *sysctl_header; unsigned long addr, len = PAGE_SIZE; int rc; @@ -85,9 +101,15 @@ static int ghes_hisi_sei_init(void) if (!arm64_sync_sei_enabled()) return 0; + sysctl_header = register_sysctl("kernel", hisi_sei_sysctl_table); + if (!sysctl_header) { + pr_warn("failed to register sysctl\n"); + return -EINVAL; + } + hisi_sei_pool = gen_pool_create(ilog2(sizeof(struct sei_task_work)), -1); if (!hisi_sei_pool) - return -ENOMEM; + goto err_sysctl; addr = (unsigned long)kzalloc(PAGE_ALIGN(len), GFP_KERNEL); if (!addr) @@ -97,6 +119,7 @@ static int ghes_hisi_sei_init(void) if (rc) goto err_pool_add; + hisi_sei_enabled = 1; return 0; err_pool_add: @@ -106,6 +129,9 @@ static int ghes_hisi_sei_init(void) 
gen_pool_destroy(hisi_sei_pool); hisi_sei_pool = NULL; +err_sysctl: + unregister_sysctl_table(sysctl_header); + pr_warn("%s init failed\n", __func__); return -ENOMEM; } @@ -189,6 +215,9 @@ static int ghes_hisi_handle_sei(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_ARM64_SYNC_SEI)) return -ENOENT; + if (!hisi_sei_enabled) + return -ENOENT; + if (!current->mm) return -ENOENT; -- 2.43.0
hulk inclusion category: feature bugzilla: https://atomgit.com/openeuler/kernel/issues/8918 ------------------------------------------ Enable CONFIG_ARM64_SYNC_SEI by default. Signed-off-by: Wupeng Ma <mawupeng1@huawei.com> --- arch/arm64/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index e6d3b9b6788b..06b25fff089b 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -605,6 +605,7 @@ CONFIG_ARM64_HISI_IPIV=y CONFIG_ARM64_SVE=y CONFIG_ARM64_SME=y CONFIG_ARM64_PSEUDO_NMI=y +CONFIG_ARM64_SYNC_SEI=y # CONFIG_ARM64_DEBUG_PRIORITY_MASKING is not set CONFIG_IPI_AS_NMI=y CONFIG_NON_NMI_IPI_BACKTRACE=y -- 2.43.0
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://atomgit.com/openeuler/kernel/merge_requests/21756 邮件列表地址:https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/2OI... Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request! Pull request link: https://atomgit.com/openeuler/kernel/merge_requests/21756 Mailing list address: https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/2OI...
participants (2)
-
patchwork bot -
Wupeng Ma