From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4F3V1 CVE: NA
--------------------------------
Simplify the core lockup detection rules so that they are easier to understand.
Core suspension (lockup) detection is now performed in the hrtimer interrupt handler. A core is considered locked up when neither its hrtimer interrupt count nor its NMI interrupt count has been updated for multiple consecutive checks.
Signed-off-by: Xu Qiang xuqiang36@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/include/asm/barrier.h | 15 ------ kernel/watchdog_hld.c | 91 +++++++++----------------------- 2 files changed, 26 insertions(+), 80 deletions(-)
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 519a30346e176..3cae78c1ce33b 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -24,23 +24,8 @@ #define nops(n) asm volatile(__nops(n))
#define sev() asm volatile("sev" : : : "memory") -#ifdef CONFIG_CORELOCKUP_DETECTOR -extern unsigned int close_wfi_wfe; -#define wfe() \ - do { \ - if (likely(close_wfi_wfe == 0)) \ - asm volatile("wfe" : : : "memory"); \ - } while (0) -#define wfi() \ - do { \ - if (likely(close_wfi_wfe == 0)) \ - asm volatile("wfi" : : : "memory"); \ - } while (0) - -#else #define wfe() asm volatile("wfe" : : : "memory") #define wfi() asm volatile("wfi" : : : "memory") -#endif
#define isb() asm volatile("isb" : : : "memory") #define dmb(opt) asm volatile("dmb " #opt : : : "memory") diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index 51ffc8f90520d..c71036cb474b3 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -77,7 +77,6 @@ EXPORT_SYMBOL(arch_touch_nmi_watchdog); * nmi_cnt_missed: the nmi consecutive miss counts of detector_cpu * hrint_saved: saved hrtimer interrupts of detector_cpu * hrint_missed: the hrtimer consecutive miss counts of detector_cpu - * corelockup_cpumask/close_wfi_wfe: * the cpu mask is set if certain cpu maybe fall in suspend and close * wfi/wfe mode if any bit is set */ @@ -85,12 +84,9 @@ static DEFINE_PER_CPU(unsigned int, detector_cpu); static DEFINE_PER_CPU(unsigned long, nmi_interrupts); static DEFINE_PER_CPU(unsigned long, nmi_cnt_saved); static DEFINE_PER_CPU(unsigned long, nmi_cnt_missed); -static DEFINE_PER_CPU(bool, core_watchdog_warn); static DEFINE_PER_CPU(unsigned long, hrint_saved); static DEFINE_PER_CPU(unsigned long, hrint_missed); -struct cpumask corelockup_cpumask __read_mostly; -unsigned int close_wfi_wfe; -static bool pmu_based_nmi; +static unsigned long corelockup_allcpu_dumped; bool enable_corelockup_detector;
static int __init enable_corelockup_detector_setup(char *str) @@ -150,6 +146,11 @@ void watchdog_check_hrtimer(void) { unsigned int cpu = __this_cpu_read(detector_cpu); unsigned long hrint = watchdog_hrtimer_interrupts(cpu); + unsigned long nmi_int = per_cpu(nmi_interrupts, cpu); + + /* skip check if only one cpu online */ + if (cpu == smp_processor_id()) + return;
/* * The freq of hrtimer is fast than nmi interrupts and @@ -159,23 +160,31 @@ void watchdog_check_hrtimer(void) */ watchdog_nmi_interrupts();
- if (!pmu_based_nmi) - return; - if (__this_cpu_read(hrint_saved) != hrint) { __this_cpu_write(hrint_saved, hrint); __this_cpu_write(hrint_missed, 0); - cpumask_clear_cpu(cpu, &corelockup_cpumask); - } else { - __this_cpu_inc(hrint_missed); - if (__this_cpu_read(hrint_missed) > 2) - cpumask_set_cpu(cpu, &corelockup_cpumask); + return; + } + __this_cpu_inc(hrint_missed); + + if (__this_cpu_read(nmi_cnt_saved) != nmi_int) { + __this_cpu_write(nmi_cnt_saved, nmi_int); + __this_cpu_write(nmi_cnt_missed, 0); + return; } + __this_cpu_inc(nmi_cnt_missed);
- if (likely(cpumask_empty(&corelockup_cpumask))) - close_wfi_wfe = 0; - else - close_wfi_wfe = 1; + if ((__this_cpu_read(hrint_missed) > 5) && (__this_cpu_read(nmi_cnt_missed) > 5)) { + pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu); + + if (!test_and_set_bit(0, &corelockup_allcpu_dumped)) { + trigger_allbutself_cpu_backtrace(); + panic("Core LOCKUP"); + } else { + while (1) + cpu_relax(); + } + } }
/* @@ -206,9 +215,6 @@ void corelockup_detector_offline_cpu(unsigned int cpu) unsigned int prev = nr_cpu_ids; unsigned int i;
- /* clear bitmap */ - cpumask_clear_cpu(cpu, &corelockup_cpumask); - /* found prev cpu */ for_each_cpu_and(i, &watchdog_cpumask, cpu_online_mask) { if (per_cpu(detector_cpu, i) == cpu) { @@ -223,45 +229,6 @@ void corelockup_detector_offline_cpu(unsigned int cpu) /* prev->next */ corelockup_status_copy(cpu, prev); } - -static bool is_corelockup(unsigned int cpu) -{ - unsigned long nmi_int = per_cpu(nmi_interrupts, cpu); - - /* skip check if only one cpu online */ - if (cpu == smp_processor_id()) - return false; - - if (__this_cpu_read(nmi_cnt_saved) != nmi_int) { - __this_cpu_write(nmi_cnt_saved, nmi_int); - __this_cpu_write(nmi_cnt_missed, 0); - per_cpu(core_watchdog_warn, cpu) = false; - return false; - } - - __this_cpu_inc(nmi_cnt_missed); - if (__this_cpu_read(nmi_cnt_missed) > 2) - return true; - - return false; -} -NOKPROBE_SYMBOL(is_corelockup); - -static void watchdog_corelockup_check(struct pt_regs *regs) -{ - unsigned int cpu = __this_cpu_read(detector_cpu); - - if (is_corelockup(cpu)) { - if (per_cpu(core_watchdog_warn, cpu) == true) - return; - pr_emerg("Watchdog detected core LOCKUP on cpu %d\n", cpu); - - if (hardlockup_panic) - nmi_panic(regs, "Core LOCKUP"); - - per_cpu(core_watchdog_warn, cpu) = true; - } -} #endif
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP @@ -335,9 +302,6 @@ void watchdog_hardlockup_check(struct pt_regs *regs) if (enable_corelockup_detector) { /* Kick nmi interrupts */ watchdog_nmi_interrupts(); - - /* corelockup check */ - watchdog_corelockup_check(regs); } #endif
@@ -546,9 +510,6 @@ int __init hardlockup_detector_perf_init(void) perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); } -#ifdef CONFIG_CORELOCKUP_DETECTOR - pmu_based_nmi = true; -#endif return ret; } #endif /* CONFIG_HARDLOCKUP_DETECTOR_PERF */