Kernel
Threads by month
- ----- 2025 -----
- May
- April
- March
- February
- January
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- 3 participants
- 17953 discussions
openEuler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CIJQ
CVE: NA
----------------------------------------------------------------------
The default way of calculating CPU utilization is to check which task is
executed during the interval of two ticks. This leads to the inaccurate
results of CPU utilization.
This problem can be solved by counting the idle time via scheduler rather
than the tick interval. We can record the time before executing idle
process and calculate the execute time before quiting the idle process.
The idle time of each CPU is given in the /proc/stat2 file. This way can
give higher precision in accounting the CPU idle time compared with the
/proc/stat.
Signed-off-by: Hongyu Li <543306408(a)qq.com>
---
fs/proc/Kconfig | 7 ++++
fs/proc/Makefile | 1 +
fs/proc/stat2.c | 91 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/cputime.c | 34 ++++++++++++++++
kernel/sched/idle.c | 28 +++++++++++++
5 files changed, 161 insertions(+)
create mode 100644 fs/proc/stat2.c
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..33588a37579e 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -107,3 +107,10 @@ config PROC_PID_ARCH_STATUS
config PROC_CPU_RESCTRL
def_bool n
depends on PROC_FS
+
+config PROC_IDLE
+ bool "include /proc/stat2 file"
+ depends on PROC_FS
+ default y
+ help
+ Provide the CPU idle time in the /proc/stat2 file.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 8704d41dd67c..b0d5f2b347d7 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,5 +34,6 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
+proc-$(CONFIG_PROC_IDLE) += stat2.o
obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o
diff --git a/fs/proc/stat2.c b/fs/proc/stat2.c
new file mode 100644
index 000000000000..6036a946c71d
--- /dev/null
+++ b/fs/proc/stat2.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/proc/stat2.c
+ *
+ * Copyright (C) 2007
+ *
+ * cpu idle time accouting
+ */
+
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/stat.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/irqnr.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_PROC_IDLE
+
+#define PROC_NAME "stat2"
+
+extern u64 cal_idle_sum_exec_runtime(int cpu);
+
+static u64 get_idle_sum_exec_runtime(int cpu)
+{
+ u64 idle = cal_idle_sum_exec_runtime(cpu);
+
+ return idle;
+}
+
+static int show_idle(struct seq_file *p, void *v)
+{
+ int i;
+ u64 idle;
+
+ idle = 0;
+
+ for_each_possible_cpu(i) {
+
+ idle += get_idle_sum_exec_runtime(i);
+
+ }
+
+ seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+
+ for_each_online_cpu(i) {
+
+ idle = get_idle_sum_exec_runtime(i);
+
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+ }
+
+ return 0;
+}
+
+static int idle_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = 32 + 32 * num_online_cpus();
+
+ return single_open_size(file, show_idle, NULL, size);
+}
+
+static struct proc_ops idle_procs_ops = {
+ .proc_open = idle_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+static int __init kernel_module_init(void)
+{
+ proc_create(PROC_NAME, 0, NULL, &idle_procs_ops);
+ return 0;
+}
+
+fs_initcall(kernel_module_init);
+
+#endif /*CONFIG_PROC_IDLE*/
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5a55d2300452..25218a8f822f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -19,6 +19,8 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+extern struct static_key_true proc_idle;
+
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
@@ -1078,3 +1080,35 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+
+
+#ifdef CONFIG_PROC_IDLE
+
+
+u64 cal_idle_sum_exec_runtime(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 idle = idle_se->sum_exec_runtime;
+
+ if (!static_branch_likely(&proc_idle))
+ return 0ULL;
+
+ if (rq->curr == rq->idle) {
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return idle;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle += delta_exec;
+ }
+
+ return idle;
+}
+
+#endif /* CONFIG_PROC_IDLE */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 36b545f17206..15f076ab5823 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,8 @@
#include <trace/events/power.h>
+DEFINE_STATIC_KEY_TRUE(proc_idle);
+
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -424,6 +426,23 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_PROC_IDLE
+ if (!static_branch_likely(&proc_idle))
+ return;
+
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle_se->sum_exec_runtime += delta_exec;
+#endif
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -436,6 +455,15 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
+#ifdef CONFIG_PROC_IDLE
+ if (static_branch_likely(&proc_idle)) {
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+
+ idle_se->exec_start = now;
+ }
+#endif
+
set_next_task_idle(rq, next, true);
return next;
--
2.17.1
2
1

[PATCH openEuler-21.09 0/2] Improve the precisoin of accounuting CPU utilization rate
by Hongyu Li 30 Sep '21
by Hongyu Li 30 Sep '21
30 Sep '21
The current way of calculating the CPU utilization rate is not accurate.
The accounting system only works In the interval of two ticks. However,
a process can give up the CPU before the tick ending.
This can be fixed by counting the idel time via the scheduler. We can
use the sum_exe_runtime of the idle process of each CPU to calculate the
the CPU utilization rate. The idle time of each CPU is given in the
/proc/stat2 file. An example of using this file is also attached.
Hongyu Li (2):
eulerfs: add the /proc/stat2 file.
tools: add a tool to calculate the idle rate
fs/proc/Kconfig | 7 +++
fs/proc/Makefile | 1 +
fs/proc/stat2.c | 91 +++++++++++++++++++++++++++++++++++++
kernel/sched/cputime.c | 34 ++++++++++++++
kernel/sched/idle.c | 28 ++++++++++++
tools/accounting/Makefile | 2 +-
tools/accounting/idle_cal.c | 91 +++++++++++++++++++++++++++++++++++++
7 files changed, 253 insertions(+), 1 deletion(-)
create mode 100644 fs/proc/stat2.c
create mode 100644 tools/accounting/idle_cal.c
--
2.17.1
1
0
Hi Community:
Does Euler 2.9 support BPFTrace Or not?
It is very necessary for us now.
B.R.
2
1
openEuler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CIJQ
CVE: NA
----------------------------------------------------------------------
The default way of calculating CPU utilization is to check which task is
executed during the interval of two ticks. This leads to the inaccurate
results of CPU utilization.
This problem can be solved by counting the idle time via scheduler rather
than the tick interval. We can record the time before executing idle
process and calculate the execute time before quiting the idle process.
The idle time of each CPU is given in the /proc/stat2 file. This way can
give higher precision in accounting the CPU idle time compared with the
/proc/stat.
Signed-off-by: Hongyu Li <543306408(a)qq.com>
---
fs/proc/Kconfig | 7 ++++
fs/proc/Makefile | 1 +
fs/proc/stat2.c | 91 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/cputime.c | 34 ++++++++++++++++
kernel/sched/idle.c | 28 +++++++++++++
5 files changed, 161 insertions(+)
create mode 100644 fs/proc/stat2.c
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..33588a37579e 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -107,3 +107,10 @@ config PROC_PID_ARCH_STATUS
config PROC_CPU_RESCTRL
def_bool n
depends on PROC_FS
+
+config PROC_IDLE
+ bool "include /proc/stat2 file"
+ depends on PROC_FS
+ default y
+ help
+ Provide the CPU idle time in the /proc/stat2 file.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 8704d41dd67c..b0d5f2b347d7 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,5 +34,6 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
+proc-$(CONFIG_PROC_IDLE) += stat2.o
obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o
diff --git a/fs/proc/stat2.c b/fs/proc/stat2.c
new file mode 100644
index 000000000000..6036a946c71d
--- /dev/null
+++ b/fs/proc/stat2.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/proc/stat2.c
+ *
+ * Copyright (C) 2007
+ *
+ * cpu idle time accouting
+ */
+
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/stat.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/irqnr.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_PROC_IDLE
+
+#define PROC_NAME "stat2"
+
+extern u64 cal_idle_sum_exec_runtime(int cpu);
+
+static u64 get_idle_sum_exec_runtime(int cpu)
+{
+ u64 idle = cal_idle_sum_exec_runtime(cpu);
+
+ return idle;
+}
+
+static int show_idle(struct seq_file *p, void *v)
+{
+ int i;
+ u64 idle;
+
+ idle = 0;
+
+ for_each_possible_cpu(i) {
+
+ idle += get_idle_sum_exec_runtime(i);
+
+ }
+
+ seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+
+ for_each_online_cpu(i) {
+
+ idle = get_idle_sum_exec_runtime(i);
+
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+ }
+
+ return 0;
+}
+
+static int idle_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = 32 + 32 * num_online_cpus();
+
+ return single_open_size(file, show_idle, NULL, size);
+}
+
+static struct proc_ops idle_procs_ops = {
+ .proc_open = idle_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+static int __init kernel_module_init(void)
+{
+ proc_create(PROC_NAME, 0, NULL, &idle_procs_ops);
+ return 0;
+}
+
+fs_initcall(kernel_module_init);
+
+#endif /*CONFIG_PROC_IDLE*/
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5a55d2300452..25218a8f822f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -19,6 +19,8 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+extern struct static_key_true proc_idle;
+
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
@@ -1078,3 +1080,35 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+
+
+#ifdef CONFIG_PROC_IDLE
+
+
+u64 cal_idle_sum_exec_runtime(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 idle = idle_se->sum_exec_runtime;
+
+ if (!static_branch_likely(&proc_idle))
+ return 0ULL;
+
+ if (rq->curr == rq->idle) {
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return idle;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle += delta_exec;
+ }
+
+ return idle;
+}
+
+#endif /* CONFIG_PROC_IDLE */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 36b545f17206..15f076ab5823 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,8 @@
#include <trace/events/power.h>
+DEFINE_STATIC_KEY_TRUE(proc_idle);
+
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -424,6 +426,23 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_PROC_IDLE
+ if (!static_branch_likely(&proc_idle))
+ return;
+
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle_se->sum_exec_runtime += delta_exec;
+#endif
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -436,6 +455,15 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
+#ifdef CONFIG_PROC_IDLE
+ if (static_branch_likely(&proc_idle)) {
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+
+ idle_se->exec_start = now;
+ }
+#endif
+
set_next_task_idle(rq, next, true);
return next;
--
2.17.1
1
0
openEuler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CIJQ
CVE: NA
----------------------------------------------------------------------
The default way of calculating CPU utilization is to check which task is
executed during the interval of two ticks. This leads to the inaccurate
results of CPU utilization.
This problem can be solved by counting the idle time via scheduler rather
than the tick interval. We can record the time before executing idle
process and calculate the execute time before quiting the idle process.
The idle time of each CPU is given in the /proc/stat2 file. This way can
give higher precision in accounting the CPU idle time compared with the
/proc/stat.
Signed-off-by: Hongyu Li <543306408(a)qq.com>
---
fs/proc/Kconfig | 7 ++++
fs/proc/Makefile | 1 +
fs/proc/stat2.c | 91 ++++++++++++++++++++++++++++++++++++++++++
kernel/sched/cputime.c | 34 ++++++++++++++++
kernel/sched/idle.c | 28 +++++++++++++
5 files changed, 161 insertions(+)
create mode 100644 fs/proc/stat2.c
diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig
index c930001056f9..33588a37579e 100644
--- a/fs/proc/Kconfig
+++ b/fs/proc/Kconfig
@@ -107,3 +107,10 @@ config PROC_PID_ARCH_STATUS
config PROC_CPU_RESCTRL
def_bool n
depends on PROC_FS
+
+config PROC_IDLE
+ bool "include /proc/stat2 file"
+ depends on PROC_FS
+ default y
+ help
+ Provide the CPU idle time in the /proc/stat2 file.
diff --git a/fs/proc/Makefile b/fs/proc/Makefile
index 8704d41dd67c..b0d5f2b347d7 100644
--- a/fs/proc/Makefile
+++ b/fs/proc/Makefile
@@ -34,5 +34,6 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
proc-$(CONFIG_BOOT_CONFIG) += bootconfig.o
+proc-$(CONFIG_PROC_IDLE) += stat2.o
obj-$(CONFIG_ETMEM_SCAN) += etmem_scan.o
obj-$(CONFIG_ETMEM_SWAP) += etmem_swap.o
diff --git a/fs/proc/stat2.c b/fs/proc/stat2.c
new file mode 100644
index 000000000000..6036a946c71d
--- /dev/null
+++ b/fs/proc/stat2.c
@@ -0,0 +1,91 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * linux/fs/proc/stat2.c
+ *
+ * Copyright (C) 2007
+ *
+ * cpu idle time accouting
+ */
+
+#include <linux/cpumask.h>
+#include <linux/device.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/sched/stat.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/irqnr.h>
+#include <linux/sched/cputime.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_PROC_IDLE
+
+#define PROC_NAME "stat2"
+
+extern u64 cal_idle_sum_exec_runtime(int cpu);
+
+static u64 get_idle_sum_exec_runtime(int cpu)
+{
+ u64 idle = cal_idle_sum_exec_runtime(cpu);
+
+ return idle;
+}
+
+static int show_idle(struct seq_file *p, void *v)
+{
+ int i;
+ u64 idle;
+
+ idle = 0;
+
+ for_each_possible_cpu(i) {
+
+ idle += get_idle_sum_exec_runtime(i);
+
+ }
+
+ seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+
+ for_each_online_cpu(i) {
+
+ idle = get_idle_sum_exec_runtime(i);
+
+ seq_printf(p, "cpu%d", i);
+ seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
+ seq_putc(p, '\n');
+ }
+
+ return 0;
+}
+
+static int idle_open(struct inode *inode, struct file *file)
+{
+ unsigned int size = 32 + 32 * num_online_cpus();
+
+ return single_open_size(file, show_idle, NULL, size);
+}
+
+static struct proc_ops idle_procs_ops = {
+ .proc_open = idle_open,
+ .proc_read_iter = seq_read_iter,
+ .proc_lseek = seq_lseek,
+ .proc_release = single_release,
+};
+
+static int __init kernel_module_init(void)
+{
+ proc_create(PROC_NAME, 0, NULL, &idle_procs_ops);
+ return 0;
+}
+
+fs_initcall(kernel_module_init);
+
+#endif /*CONFIG_PROC_IDLE*/
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5a55d2300452..25218a8f822f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -19,6 +19,8 @@
*/
DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
+extern struct static_key_true proc_idle;
+
static int sched_clock_irqtime;
void enable_sched_clock_irqtime(void)
@@ -1078,3 +1080,35 @@ void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+
+
+#ifdef CONFIG_PROC_IDLE
+
+
+u64 cal_idle_sum_exec_runtime(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 idle = idle_se->sum_exec_runtime;
+
+ if (!static_branch_likely(&proc_idle))
+ return 0ULL;
+
+ if (rq->curr == rq->idle) {
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return idle;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle += delta_exec;
+ }
+
+ return idle;
+}
+
+#endif /* CONFIG_PROC_IDLE */
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 36b545f17206..15f076ab5823 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,8 @@
#include <trace/events/power.h>
+DEFINE_STATIC_KEY_TRUE(proc_idle);
+
/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];
@@ -424,6 +426,23 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_PROC_IDLE
+ if (!static_branch_likely(&proc_idle))
+ return;
+
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+ u64 delta_exec;
+
+ delta_exec = now - idle_se->exec_start;
+ if (unlikely((s64)delta_exec <= 0))
+ return;
+
+ schedstat_set(idle_se->statistics.exec_max,
+ max(delta_exec, idle_se->statistics.exec_max));
+
+ idle_se->sum_exec_runtime += delta_exec;
+#endif
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
@@ -436,6 +455,15 @@ struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
+#ifdef CONFIG_PROC_IDLE
+ if (static_branch_likely(&proc_idle)) {
+ struct sched_entity *idle_se = &rq->idle->se;
+ u64 now = sched_clock();
+
+ idle_se->exec_start = now;
+ }
+#endif
+
set_next_task_idle(rq, next, true);
return next;
--
2.17.1
1
0

30 Sep '21
From: Alexander Potapenko <glider(a)google.com>
Patch series "KFENCE: A low-overhead sampling-based memory safety error detector", v7.
This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors. This
series enables KFENCE for the x86 and arm64 architectures, and adds
KFENCE hooks to the SLAB and SLUB allocators.
KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.
KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error.
Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval,
the next allocation through the main allocator (SLAB or SLUB) returns a
guarded allocation from the KFENCE object pool. At this point, the timer
is reset, and the next allocation is set up after the expiration of the
interval.
To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE.
The KFENCE memory pool is of fixed size, and if the pool is exhausted no
further KFENCE allocations occur. The default config is conservative
with only 255 objects, resulting in a pool size of 2 MiB (with 4 KiB
pages).
We have verified by running synthetic benchmarks (sysbench I/O,
hackbench) and production server-workload benchmarks that a kernel with
KFENCE (using sample intervals 100-500ms) is performance-neutral
compared to a non-KFENCE baseline kernel.
KFENCE is inspired by GWP-ASan [1], a userspace tool with similar
properties. The name "KFENCE" is a homage to the Electric Fence Malloc
Debugger [2].
For more details, see Documentation/dev-tools/kfence.rst added in the
series -- also viewable here:
https://raw.githubusercontent.com/google/kasan/kfence/Documentation/dev-too…
[1] http://llvm.org/docs/GwpAsan.html
[2] https://linux.die.net/man/3/efence
This patch (of 9):
This adds the Kernel Electric-Fence (KFENCE) infrastructure. KFENCE is a
low-overhead sampling-based memory safety error detector of heap
use-after-free, invalid-free, and out-of-bounds access errors.
KFENCE is designed to be enabled in production kernels, and has near
zero performance overhead. Compared to KASAN, KFENCE trades performance
for precision. The main motivation behind KFENCE's design, is that with
enough total uptime KFENCE will detect bugs in code paths not typically
exercised by non-production test workloads. One way to quickly achieve a
large enough total uptime is when the tool is deployed across a large
fleet of machines.
KFENCE objects each reside on a dedicated page, at either the left or
right page boundaries. The pages to the left and right of the object
page are "guard pages", whose attributes are changed to a protected
state, and cause page faults on any attempted access to them. Such page
faults are then intercepted by KFENCE, which handles the fault
gracefully by reporting a memory access error. To detect out-of-bounds
writes to memory within the object's page itself, KFENCE also uses
pattern-based redzones. The following figure illustrates the page
layout:
---+-----------+-----------+-----------+-----------+-----------+---
| xxxxxxxxx | O : | xxxxxxxxx | : O | xxxxxxxxx |
| xxxxxxxxx | B : | xxxxxxxxx | : B | xxxxxxxxx |
| x GUARD x | J : RED- | x GUARD x | RED- : J | x GUARD x |
| xxxxxxxxx | E : ZONE | xxxxxxxxx | ZONE : E | xxxxxxxxx |
| xxxxxxxxx | C : | xxxxxxxxx | : C | xxxxxxxxx |
| xxxxxxxxx | T : | xxxxxxxxx | : T | xxxxxxxxx |
---+-----------+-----------+-----------+-----------+-----------+---
Guarded allocations are set up based on a sample interval (can be set
via kfence.sample_interval). After expiration of the sample interval, a
guarded allocation from the KFENCE object pool is returned to the main
allocator (SLAB or SLUB). At this point, the timer is reset, and the
next allocation is set up after the expiration of the interval.
To enable/disable a KFENCE allocation through the main allocator's
fast-path without overhead, KFENCE relies on static branches via the
static keys infrastructure. The static branch is toggled to redirect the
allocation to KFENCE. To date, we have verified by running synthetic
benchmarks (sysbench I/O, hackbench) that a kernel compiled with KFENCE
is performance-neutral compared to the non-KFENCE baseline.
For more details, see Documentation/dev-tools/kfence.rst (added later in
the series).
[elver(a)google.com: fix parameter description for kfence_object_start()]
Link: https://lkml.kernel.org/r/20201106092149.GA2851373@elver.google.com
[elver(a)google.com: avoid stalling work queue task without allocations]
Link: https://lkml.kernel.org/r/CADYN=9J0DQhizAGB0-jz4HOBBh+05kMBXb4c0cXMS7Qi5NAJ…
Link: https://lkml.kernel.org/r/20201110135320.3309507-1-elver@google.com
[elver(a)google.com: fix potential deadlock due to wake_up()]
Link: https://lkml.kernel.org/r/000000000000c0645805b7f982e4@google.com
Link: https://lkml.kernel.org/r/20210104130749.1768991-1-elver@google.com
[elver(a)google.com: add option to use KFENCE without static keys]
Link: https://lkml.kernel.org/r/20210111091544.3287013-1-elver@google.com
[elver(a)google.com: add missing copyright and description headers]
Link: https://lkml.kernel.org/r/20210118092159.145934-1-elver@google.com
Link: https://lkml.kernel.org/r/20201103175841.3495947-2-elver@google.com
Signed-off-by: Marco Elver <elver(a)google.com>
Signed-off-by: Alexander Potapenko <glider(a)google.com>
Reviewed-by: Dmitry Vyukov <dvyukov(a)google.com>
Reviewed-by: SeongJae Park <sjpark(a)amazon.de>
Co-developed-by: Marco Elver <elver(a)google.com>
Reviewed-by: Jann Horn <jannh(a)google.com>
Cc: "H. Peter Anvin" <hpa(a)zytor.com>
Cc: Paul E. McKenney <paulmck(a)kernel.org>
Cc: Andrey Konovalov <andreyknvl(a)google.com>
Cc: Andrey Ryabinin <aryabinin(a)virtuozzo.com>
Cc: Andy Lutomirski <luto(a)kernel.org>
Cc: Borislav Petkov <bp(a)alien8.de>
Cc: Catalin Marinas <catalin.marinas(a)arm.com>
Cc: Christopher Lameter <cl(a)linux.com>
Cc: Dave Hansen <dave.hansen(a)linux.intel.com>
Cc: David Rientjes <rientjes(a)google.com>
Cc: Eric Dumazet <edumazet(a)google.com>
Cc: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Cc: Hillf Danton <hdanton(a)sina.com>
Cc: Ingo Molnar <mingo(a)redhat.com>
Cc: Jonathan Corbet <corbet(a)lwn.net>
Cc: Joonsoo Kim <iamjoonsoo.kim(a)lge.com>
Cc: Joern Engel <joern(a)purestorage.com>
Cc: Kees Cook <keescook(a)chromium.org>
Cc: Mark Rutland <mark.rutland(a)arm.com>
Cc: Pekka Enberg <penberg(a)kernel.org>
Cc: Peter Zijlstra <peterz(a)infradead.org>
Cc: Thomas Gleixner <tglx(a)linutronix.de>
Cc: Vlastimil Babka <vbabka(a)suse.cz>
Cc: Will Deacon <will(a)kernel.org>
Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
Signed-off-by: Yingjie Shang <1415317271(a)qq.com>
Reviewed-by: Bixuan Cui <cuibixuan(a)huawei.com>
---
include/linux/kfence.h | 216 +++++++++++
init/main.c | 3 +
lib/Kconfig.debug | 1 +
lib/Kconfig.kfence | 67 ++++
mm/Makefile | 1 +
mm/kfence/Makefile | 3 +
mm/kfence/core.c | 840 +++++++++++++++++++++++++++++++++++++++++
mm/kfence/kfence.h | 113 ++++++
mm/kfence/report.c | 240 ++++++++++++
9 files changed, 1484 insertions(+)
create mode 100644 include/linux/kfence.h
create mode 100644 lib/Kconfig.kfence
create mode 100644 mm/kfence/Makefile
create mode 100644 mm/kfence/core.c
create mode 100644 mm/kfence/kfence.h
create mode 100644 mm/kfence/report.c
diff --git a/include/linux/kfence.h b/include/linux/kfence.h
new file mode 100644
index 000000000000..81f3911cb298
--- /dev/null
+++ b/include/linux/kfence.h
@@ -0,0 +1,216 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). Public interface for allocator and fault
+ * handler integration. For more info see Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef _LINUX_KFENCE_H
+#define _LINUX_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/types.h>
+
+#ifdef CONFIG_KFENCE
+
+/*
+ * We allocate an even number of pages, as it simplifies calculations to map
+ * address to metadata indices; effectively, the very first page serves as an
+ * extended guard page, but otherwise has no special purpose.
+ */
+#define KFENCE_POOL_SIZE ((CONFIG_KFENCE_NUM_OBJECTS + 1) * 2 * PAGE_SIZE)
+extern char *__kfence_pool;
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+#include <linux/static_key.h>
+DECLARE_STATIC_KEY_FALSE(kfence_allocation_key);
+#else
+#include <linux/atomic.h>
+extern atomic_t kfence_allocation_gate;
+#endif
+
+/**
+ * is_kfence_address() - check if an address belongs to KFENCE pool
+ * @addr: address to check
+ *
+ * Return: true or false depending on whether the address is within the KFENCE
+ * object range.
+ *
+ * KFENCE objects live in a separate page range and are not to be intermixed
+ * with regular heap objects (e.g. KFENCE objects must never be added to the
+ * allocator freelists). Failing to do so may and will result in heap
+ * corruptions, therefore is_kfence_address() must be used to check whether
+ * an object requires specific handling.
+ *
+ * Note: This function may be used in fast-paths, and is performance critical.
+ * Future changes should take this into account; for instance, we want to avoid
+ * introducing another load and therefore need to keep KFENCE_POOL_SIZE a
+ * constant (until immediate patching support is added to the kernel).
+ */
+static __always_inline bool is_kfence_address(const void *addr)
+{
+ /*
+ * The non-NULL check is required in case the __kfence_pool pointer was
+ * never initialized; keep it in the slow-path after the range-check.
+ */
+ return unlikely((unsigned long)((char *)addr - __kfence_pool) < KFENCE_POOL_SIZE && addr);
+}
+
+/**
+ * kfence_alloc_pool() - allocate the KFENCE pool via memblock
+ */
+void __init kfence_alloc_pool(void);
+
+/**
+ * kfence_init() - perform KFENCE initialization at boot time
+ *
+ * Requires that kfence_alloc_pool() was called before. This sets up the
+ * allocation gate timer, and requires that workqueues are available.
+ */
+void __init kfence_init(void);
+
+/**
+ * kfence_shutdown_cache() - handle shutdown_cache() for KFENCE objects
+ * @s: cache being shut down
+ *
+ * Before shutting down a cache, one must ensure there are no remaining objects
+ * allocated from it. Because KFENCE objects are not referenced from the cache
+ * directly, we need to check them here.
+ *
+ * Note that shutdown_cache() is internal to SL*B, and kmem_cache_destroy() does
+ * not return if allocated objects still exist: it prints an error message and
+ * simply aborts destruction of a cache, leaking memory.
+ *
+ * If the only such objects are KFENCE objects, we will not leak the entire
+ * cache, but instead try to provide more useful debug info by making allocated
+ * objects "zombie allocations". Objects may then still be used or freed (which
+ * is handled gracefully), but usage will result in showing KFENCE error reports
+ * which include stack traces to the user of the object, the original allocation
+ * site, and caller to shutdown_cache().
+ */
+void kfence_shutdown_cache(struct kmem_cache *s);
+
+/*
+ * Allocate a KFENCE object. Allocators must not call this function directly,
+ * use kfence_alloc() instead.
+ */
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags);
+
+/**
+ * kfence_alloc() - allocate a KFENCE object with a low probability
+ * @s: struct kmem_cache with object requirements
+ * @size: exact size of the object to allocate (can be less than @s->size
+ * e.g. for kmalloc caches)
+ * @flags: GFP flags
+ *
+ * Return:
+ * * NULL - must proceed with allocating as usual,
+ * * non-NULL - pointer to a KFENCE object.
+ *
+ * kfence_alloc() should be inserted into the heap allocation fast path,
+ * allowing it to transparently return KFENCE-allocated objects with a low
+ * probability using a static branch (the probability is controlled by the
+ * kfence.sample_interval boot parameter).
+ */
+static __always_inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ if (static_branch_unlikely(&kfence_allocation_key))
+#else
+ if (unlikely(!atomic_read(&kfence_allocation_gate)))
+#endif
+ return __kfence_alloc(s, size, flags);
+ return NULL;
+}
+
+/**
+ * kfence_ksize() - get actual amount of memory allocated for a KFENCE object
+ * @addr: pointer to a heap object
+ *
+ * Return:
+ * * 0 - not a KFENCE object, must call __ksize() instead,
+ * * non-0 - this many bytes can be accessed without causing a memory error.
+ *
+ * kfence_ksize() returns the number of bytes requested for a KFENCE object at
+ * allocation time. This number may be less than the object size of the
+ * corresponding struct kmem_cache.
+ */
+size_t kfence_ksize(const void *addr);
+
+/**
+ * kfence_object_start() - find the beginning of a KFENCE object
+ * @addr: address within a KFENCE-allocated object
+ *
+ * Return: address of the beginning of the object.
+ *
+ * SL[AU]B-allocated objects are laid out within a page one by one, so it is
+ * easy to calculate the beginning of an object given a pointer inside it and
+ * the object size. The same is not true for KFENCE, which places a single
+ * object at either end of the page. This helper function is used to find the
+ * beginning of a KFENCE-allocated object.
+ */
+void *kfence_object_start(const void *addr);
+
+/**
+ * __kfence_free() - release a KFENCE heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Requires: is_kfence_address(addr)
+ *
+ * Release a KFENCE object and mark it as freed.
+ */
+void __kfence_free(void *addr);
+
+/**
+ * kfence_free() - try to release an arbitrary heap object to KFENCE pool
+ * @addr: object to be freed
+ *
+ * Return:
+ * * false - object doesn't belong to KFENCE pool and was ignored,
+ * * true - object was released to KFENCE pool.
+ *
+ * Release a KFENCE object and mark it as freed. May be called on any object,
+ * even non-KFENCE objects, to simplify integration of the hooks into the
+ * allocator's free codepath. The allocator must check the return value to
+ * determine if it was a KFENCE object or not.
+ */
+static __always_inline __must_check bool kfence_free(void *addr)
+{
+ if (!is_kfence_address(addr))
+ return false;
+ __kfence_free(addr);
+ return true;
+}
+
+/**
+ * kfence_handle_page_fault() - perform page fault handling for KFENCE pages
+ * @addr: faulting address
+ *
+ * Return:
+ * * false - address outside KFENCE pool,
+ * * true - page fault handled by KFENCE, no additional handling required.
+ *
+ * A page fault inside KFENCE pool indicates a memory error, such as an
+ * out-of-bounds access, a use-after-free or an invalid memory access. In these
+ * cases KFENCE prints an error message and marks the offending page as
+ * present, so that the kernel can proceed.
+ */
+bool __must_check kfence_handle_page_fault(unsigned long addr);
+
+#else /* CONFIG_KFENCE */
+
+static inline bool is_kfence_address(const void *addr) { return false; }
+static inline void kfence_alloc_pool(void) { }
+static inline void kfence_init(void) { }
+static inline void kfence_shutdown_cache(struct kmem_cache *s) { }
+static inline void *kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags) { return NULL; }
+static inline size_t kfence_ksize(const void *addr) { return 0; }
+static inline void *kfence_object_start(const void *addr) { return NULL; }
+static inline void __kfence_free(void *addr) { }
+static inline bool __must_check kfence_free(void *addr) { return false; }
+static inline bool __must_check kfence_handle_page_fault(unsigned long addr) { return false; }
+
+#endif
+
+#endif /* _LINUX_KFENCE_H */
diff --git a/init/main.c b/init/main.c
index fc0277a9e7a3..2d027dc11b10 100644
--- a/init/main.c
+++ b/init/main.c
@@ -40,6 +40,7 @@
#include <linux/security.h>
#include <linux/smp.h>
#include <linux/profile.h>
+#include <linux/kfence.h>
#include <linux/rcupdate.h>
#include <linux/moduleparam.h>
#include <linux/kallsyms.h>
@@ -828,6 +829,7 @@ static void __init mm_init(void)
*/
page_ext_init_flatmem();
init_debug_pagealloc();
+ kfence_alloc_pool();
report_meminit();
mem_init();
kmem_cache_init();
@@ -956,6 +958,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
hrtimers_init();
softirq_init();
timekeeping_init();
+ kfence_init();
/*
* For best initial stack canary entropy, prepare it after:
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 799e49a29797..29e64b4cd412 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -878,6 +878,7 @@ config DEBUG_STACKOVERFLOW
If in doubt, say "N".
source "lib/Kconfig.kasan"
+source "lib/Kconfig.kfence"
endmenu # "Memory Debugging"
diff --git a/lib/Kconfig.kfence b/lib/Kconfig.kfence
new file mode 100644
index 000000000000..b88ac9d6b2e6
--- /dev/null
+++ b/lib/Kconfig.kfence
@@ -0,0 +1,67 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config HAVE_ARCH_KFENCE
+ bool
+
+menuconfig KFENCE
+ bool "KFENCE: low-overhead sampling-based memory safety error detector"
+ depends on HAVE_ARCH_KFENCE && !KASAN && (SLAB || SLUB)
+ select STACKTRACE
+ help
+ KFENCE is a low-overhead sampling-based detector of heap out-of-bounds
+ access, use-after-free, and invalid-free errors. KFENCE is designed
+ to have negligible cost to permit enabling it in production
+ environments.
+
+ Note that, KFENCE is not a substitute for explicit testing with tools
+ such as KASAN. KFENCE can detect a subset of bugs that KASAN can
+ detect, albeit at very different performance profiles. If you can
+ afford to use KASAN, continue using KASAN, for example in test
+ environments. If your kernel targets production use, and cannot
+ enable KASAN due to its cost, consider using KFENCE.
+
+if KFENCE
+
+config KFENCE_STATIC_KEYS
+ bool "Use static keys to set up allocations"
+ default y
+ depends on JUMP_LABEL # To ensure performance, require jump labels
+ help
+ Use static keys (static branches) to set up KFENCE allocations. Using
+ static keys is normally recommended, because it avoids a dynamic
+ branch in the allocator's fast path. However, with very low sample
+ intervals, or on systems that do not support jump labels, a dynamic
+ branch may still be an acceptable performance trade-off.
+
+config KFENCE_SAMPLE_INTERVAL
+ int "Default sample interval in milliseconds"
+ default 100
+ help
+ The KFENCE sample interval determines the frequency with which heap
+ allocations will be guarded by KFENCE. May be overridden via boot
+ parameter "kfence.sample_interval".
+
+ Set this to 0 to disable KFENCE by default, in which case only
+ setting "kfence.sample_interval" to a non-zero value enables KFENCE.
+
+config KFENCE_NUM_OBJECTS
+ int "Number of guarded objects available"
+ range 1 65535
+ default 255
+ help
+ The number of guarded objects available. For each KFENCE object, 2
+ pages are required; with one containing the object and two adjacent
+ ones used as guard pages.
+
+config KFENCE_STRESS_TEST_FAULTS
+ int "Stress testing of fault handling and error reporting" if EXPERT
+ default 0
+ help
+ The inverse probability with which to randomly protect KFENCE object
+ pages, resulting in spurious use-after-frees. The main purpose of
+ this option is to stress test KFENCE with concurrent error reports
+ and allocations/frees. A value of 0 disables stress testing logic.
+
+ Only for KFENCE testing; set to 0 if you are not a KFENCE developer.
+
+endif # KFENCE
diff --git a/mm/Makefile b/mm/Makefile
index 2b1991759835..3d81a67d66a2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_PAGE_POISONING) += page_poison.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KASAN) += kasan/
+obj-$(CONFIG_KFENCE) += kfence/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/mm/kfence/Makefile b/mm/kfence/Makefile
new file mode 100644
index 000000000000..d991e9a349f0
--- /dev/null
+++ b/mm/kfence/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_KFENCE) := core.o report.o
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
new file mode 100644
index 000000000000..d6a32c13336b
--- /dev/null
+++ b/mm/kfence/core.c
@@ -0,0 +1,840 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE guarded object allocator and fault handling.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#define pr_fmt(fmt) "kfence: " fmt
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/debugfs.h>
+#include <linux/kcsan-checks.h>
+#include <linux/kfence.h>
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/memblock.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Disables KFENCE on the first warning assuming an irrecoverable error. */
+#define KFENCE_WARN_ON(cond) \
+ ({ \
+ const bool __cond = WARN_ON(cond); \
+ if (unlikely(__cond)) \
+ WRITE_ONCE(kfence_enabled, false); \
+ __cond; \
+ })
+
+/* === Data ================================================================= */
+
+static bool kfence_enabled __read_mostly;
+
+static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
+
+#ifdef MODULE_PARAM_PREFIX
+#undef MODULE_PARAM_PREFIX
+#endif
+#define MODULE_PARAM_PREFIX "kfence."
+
+static int param_set_sample_interval(const char *val, const struct kernel_param *kp)
+{
+ unsigned long num;
+ int ret = kstrtoul(val, 0, &num);
+
+ if (ret < 0)
+ return ret;
+
+ if (!num) /* Using 0 to indicate KFENCE is disabled. */
+ WRITE_ONCE(kfence_enabled, false);
+ else if (!READ_ONCE(kfence_enabled) && system_state != SYSTEM_BOOTING)
+ return -EINVAL; /* Cannot (re-)enable KFENCE on-the-fly. */
+
+ *((unsigned long *)kp->arg) = num;
+ return 0;
+}
+
+static int param_get_sample_interval(char *buffer, const struct kernel_param *kp)
+{
+ if (!READ_ONCE(kfence_enabled))
+ return sprintf(buffer, "0\n");
+
+ return param_get_ulong(buffer, kp);
+}
+
+static const struct kernel_param_ops sample_interval_param_ops = {
+ .set = param_set_sample_interval,
+ .get = param_get_sample_interval,
+};
+module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
+
+/* The pool of pages used for guard pages and objects. */
+char *__kfence_pool __ro_after_init;
+EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
+
+/*
+ * Per-object metadata, with one-to-one mapping of object metadata to
+ * backing pages (in __kfence_pool).
+ */
+static_assert(CONFIG_KFENCE_NUM_OBJECTS > 0);
+struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* Freelist with available objects. */
+static struct list_head kfence_freelist = LIST_HEAD_INIT(kfence_freelist);
+static DEFINE_RAW_SPINLOCK(kfence_freelist_lock); /* Lock protecting freelist. */
+
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+/* The static key to set up a KFENCE allocation. */
+DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
+#endif
+
+/* Gates the allocation, ensuring only one succeeds in a given period. */
+atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
+
+/* Statistics counters for debugfs. */
+enum kfence_counter_id {
+ KFENCE_COUNTER_ALLOCATED,
+ KFENCE_COUNTER_ALLOCS,
+ KFENCE_COUNTER_FREES,
+ KFENCE_COUNTER_ZOMBIES,
+ KFENCE_COUNTER_BUGS,
+ KFENCE_COUNTER_COUNT,
+};
+static atomic_long_t counters[KFENCE_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KFENCE_COUNTER_ALLOCATED] = "currently allocated",
+ [KFENCE_COUNTER_ALLOCS] = "total allocations",
+ [KFENCE_COUNTER_FREES] = "total frees",
+ [KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
+ [KFENCE_COUNTER_BUGS] = "total bugs",
+};
+static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
+
+/* === Internals ============================================================ */
+
+static bool kfence_protect(unsigned long addr)
+{
+ return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
+}
+
+static bool kfence_unprotect(unsigned long addr)
+{
+ return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), false));
+}
+
+static inline struct kfence_metadata *addr_to_metadata(unsigned long addr)
+{
+ long index;
+
+ /* The checks do not affect performance; only called from slow-paths. */
+
+ if (!is_kfence_address((void *)addr))
+ return NULL;
+
+ /*
+ * May be an invalid index if called with an address at the edge of
+ * __kfence_pool, in which case we would report an "invalid access"
+ * error.
+ */
+ index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1;
+ if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS)
+ return NULL;
+
+ return &kfence_metadata[index];
+}
+
+static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *meta)
+{
+ unsigned long offset = (meta - kfence_metadata + 1) * PAGE_SIZE * 2;
+ unsigned long pageaddr = (unsigned long)&__kfence_pool[offset];
+
+ /* The checks do not affect performance; only called from slow-paths. */
+
+ /* Only call with a pointer into kfence_metadata. */
+ if (KFENCE_WARN_ON(meta < kfence_metadata ||
+ meta >= kfence_metadata + CONFIG_KFENCE_NUM_OBJECTS))
+ return 0;
+
+ /*
+ * This metadata object only ever maps to 1 page; verify that the stored
+ * address is in the expected range.
+ */
+ if (KFENCE_WARN_ON(ALIGN_DOWN(meta->addr, PAGE_SIZE) != pageaddr))
+ return 0;
+
+ return pageaddr;
+}
+
+/*
+ * Update the object's metadata state, including updating the alloc/free stacks
+ * depending on the state transition.
+ */
+static noinline void metadata_update_state(struct kfence_metadata *meta,
+ enum kfence_object_state next)
+{
+ struct kfence_track *track =
+ next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+
+ lockdep_assert_held(&meta->lock);
+
+ /*
+ * Skip over 1 (this) functions; noinline ensures we do not accidentally
+ * skip over the caller by never inlining.
+ */
+ track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+ track->pid = task_pid_nr(current);
+
+ /*
+ * Pairs with READ_ONCE() in
+ * kfence_shutdown_cache(),
+ * kfence_handle_page_fault().
+ */
+ WRITE_ONCE(meta->state, next);
+}
+
+/* Write canary byte to @addr. */
+static inline bool set_canary_byte(u8 *addr)
+{
+ *addr = KFENCE_CANARY_PATTERN(addr);
+ return true;
+}
+
+/* Check canary byte at @addr. */
+static inline bool check_canary_byte(u8 *addr)
+{
+ if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
+ return true;
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+ kfence_report_error((unsigned long)addr, addr_to_metadata((unsigned long)addr),
+ KFENCE_ERROR_CORRUPTION);
+ return false;
+}
+
+/* __always_inline this to ensure we won't do an indirect call to fn. */
+static __always_inline void for_each_canary(const struct kfence_metadata *meta, bool (*fn)(u8 *))
+{
+ const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
+ unsigned long addr;
+
+ lockdep_assert_held(&meta->lock);
+
+ /*
+ * We'll iterate over each canary byte per-side until fn() returns
+ * false. However, we'll still iterate over the canary bytes to the
+ * right of the object even if there was an error in the canary bytes to
+ * the left of the object. Specifically, if check_canary_byte()
+ * generates an error, showing both sides might give more clues as to
+ * what the error is about when displaying which bytes were corrupted.
+ */
+
+ /* Apply to left of object. */
+ for (addr = pageaddr; addr < meta->addr; addr++) {
+ if (!fn((u8 *)addr))
+ break;
+ }
+
+ /* Apply to right of object. */
+ for (addr = meta->addr + meta->size; addr < pageaddr + PAGE_SIZE; addr++) {
+ if (!fn((u8 *)addr))
+ break;
+ }
+}
+
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+{
+ struct kfence_metadata *meta = NULL;
+ unsigned long flags;
+ struct page *page;
+ void *addr;
+
+ /* Try to obtain a free object. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ if (!list_empty(&kfence_freelist)) {
+ meta = list_entry(kfence_freelist.next, struct kfence_metadata, list);
+ list_del_init(&meta->list);
+ }
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+ if (!meta)
+ return NULL;
+
+ if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
+ /*
+ * This is extremely unlikely -- we are reporting on a
+ * use-after-free, which locked meta->lock, and the reporting
+ * code via printk calls kmalloc() which ends up in
+ * kfence_alloc() and tries to grab the same object that we're
+ * reporting on. While it has never been observed, lockdep does
+ * report that there is a possibility of deadlock. Fix it by
+ * using trylock and bailing out gracefully.
+ */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ /* Put the object back on the freelist. */
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ return NULL;
+ }
+
+ meta->addr = metadata_to_pageaddr(meta);
+ /* Unprotect if we're reusing this page. */
+ if (meta->state == KFENCE_OBJECT_FREED)
+ kfence_unprotect(meta->addr);
+
+ /*
+ * Note: for allocations made before RNG initialization, will always
+ * return zero. We still benefit from enabling KFENCE as early as
+ * possible, even when the RNG is not yet available, as this will allow
+ * KFENCE to detect bugs due to earlier allocations. The only downside
+ * is that the out-of-bounds accesses detected are deterministic for
+ * such allocations.
+ */
+ if (prandom_u32_max(2)) {
+ /* Allocate on the "right" side, re-calculate address. */
+ meta->addr += PAGE_SIZE - size;
+ meta->addr = ALIGN_DOWN(meta->addr, cache->align);
+ }
+
+ addr = (void *)meta->addr;
+
+ /* Update remaining metadata. */
+ metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+ /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
+ WRITE_ONCE(meta->cache, cache);
+ meta->size = size;
+ for_each_canary(meta, set_canary_byte);
+
+ /* Set required struct page fields. */
+ page = virt_to_page(meta->addr);
+ page->slab_cache = cache;
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ /* Memory initialization. */
+
+ /*
+ * We check slab_want_init_on_alloc() ourselves, rather than letting
+ * SL*B do the initialization, as otherwise we might overwrite KFENCE's
+ * redzone.
+ */
+ if (unlikely(slab_want_init_on_alloc(gfp, cache)))
+ memzero_explicit(addr, size);
+ if (cache->ctor)
+ cache->ctor(addr);
+
+ if (CONFIG_KFENCE_STRESS_TEST_FAULTS && !prandom_u32_max(CONFIG_KFENCE_STRESS_TEST_FAULTS))
+ kfence_protect(meta->addr); /* Random "faults" by protecting the object. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
+
+ return addr;
+}
+
+static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
+{
+ struct kcsan_scoped_access assert_page_exclusive;
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+
+ if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+ /* Invalid or double-free, bail out. */
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+ kfence_report_error((unsigned long)addr, meta, KFENCE_ERROR_INVALID_FREE);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ return;
+ }
+
+ /* Detect racy use-after-free, or incorrect reallocation of this page by KFENCE. */
+ kcsan_begin_scoped_access((void *)ALIGN_DOWN((unsigned long)addr, PAGE_SIZE), PAGE_SIZE,
+ KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT,
+ &assert_page_exclusive);
+
+ if (CONFIG_KFENCE_STRESS_TEST_FAULTS)
+ kfence_unprotect((unsigned long)addr); /* To check canary bytes. */
+
+ /* Restore page protection if there was an OOB access. */
+ if (meta->unprotected_page) {
+ kfence_protect(meta->unprotected_page);
+ meta->unprotected_page = 0;
+ }
+
+ /* Check canary bytes for memory corruption. */
+ for_each_canary(meta, check_canary_byte);
+
+ /*
+ * Clear memory if init-on-free is set. While we protect the page, the
+ * data is still there, and after a use-after-free is detected, we
+ * unprotect the page, so the data is still accessible.
+ */
+ if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
+ memzero_explicit(addr, meta->size);
+
+ /* Mark the object as freed. */
+ metadata_update_state(meta, KFENCE_OBJECT_FREED);
+
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ /* Protect to detect use-after-frees. */
+ kfence_protect((unsigned long)addr);
+
+ kcsan_end_scoped_access(&assert_page_exclusive);
+ if (!zombie) {
+ /* Add it to the tail of the freelist for reuse. */
+ raw_spin_lock_irqsave(&kfence_freelist_lock, flags);
+ KFENCE_WARN_ON(!list_empty(&meta->list));
+ list_add_tail(&meta->list, &kfence_freelist);
+ raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
+
+ atomic_long_dec(&counters[KFENCE_COUNTER_ALLOCATED]);
+ atomic_long_inc(&counters[KFENCE_COUNTER_FREES]);
+ } else {
+ /* See kfence_shutdown_cache(). */
+ atomic_long_inc(&counters[KFENCE_COUNTER_ZOMBIES]);
+ }
+}
+
+static void rcu_guarded_free(struct rcu_head *h)
+{
+ struct kfence_metadata *meta = container_of(h, struct kfence_metadata, rcu_head);
+
+ kfence_guarded_free((void *)meta->addr, meta, false);
+}
+
+static bool __init kfence_init_pool(void)
+{
+ unsigned long addr = (unsigned long)__kfence_pool;
+ struct page *pages;
+ int i;
+
+ if (!__kfence_pool)
+ return false;
+
+ if (!arch_kfence_init_pool())
+ goto err;
+
+ pages = virt_to_page(addr);
+
+ /*
+ * Set up object pages: they must have PG_slab set, to avoid freeing
+ * these as real pages.
+ *
+ * We also want to avoid inserting kfence_free() in the kfree()
+ * fast-path in SLUB, and therefore need to ensure kfree() correctly
+ * enters __slab_free() slow-path.
+ */
+ for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
+ if (!i || (i % 2))
+ continue;
+
+ /* Verify we do not have a compound head page. */
+ if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
+ goto err;
+
+ __SetPageSlab(&pages[i]);
+ }
+
+ /*
+ * Protect the first 2 pages. The first page is mostly unnecessary, and
+ * merely serves as an extended guard page. However, adding one
+ * additional page in the beginning gives us an even number of pages,
+ * which simplifies the mapping of address to metadata index.
+ */
+ for (i = 0; i < 2; i++) {
+ if (unlikely(!kfence_protect(addr)))
+ goto err;
+
+ addr += PAGE_SIZE;
+ }
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ struct kfence_metadata *meta = &kfence_metadata[i];
+
+ /* Initialize metadata. */
+ INIT_LIST_HEAD(&meta->list);
+ raw_spin_lock_init(&meta->lock);
+ meta->state = KFENCE_OBJECT_UNUSED;
+ meta->addr = addr; /* Initialize for validation in metadata_to_pageaddr(). */
+ list_add_tail(&meta->list, &kfence_freelist);
+
+ /* Protect the right redzone. */
+ if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
+ goto err;
+
+ addr += 2 * PAGE_SIZE;
+ }
+
+ return true;
+
+err:
+ /*
+ * Only release unprotected pages, and do not try to go back and change
+ * page attributes due to risk of failing to do so as well. If changing
+ * page attributes for some pages fails, it is very likely that it also
+ * fails for the first page, and therefore expect addr==__kfence_pool in
+ * most failure cases.
+ */
+ memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
+ __kfence_pool = NULL;
+ return false;
+}
+
+/* === DebugFS Interface ==================================================== */
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+ int i;
+
+ seq_printf(seq, "enabled: %i\n", READ_ONCE(kfence_enabled));
+ for (i = 0; i < KFENCE_COUNTER_COUNT; i++)
+ seq_printf(seq, "%s: %ld\n", counter_names[i], atomic_long_read(&counters[i]));
+
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(stats);
+
+/*
+ * debugfs seq_file operations for /sys/kernel/debug/kfence/objects.
+ * start_object() and next_object() return the object index + 1, because NULL is used
+ * to stop iteration.
+ */
+static void *start_object(struct seq_file *seq, loff_t *pos)
+{
+ if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+ return (void *)((long)*pos + 1);
+ return NULL;
+}
+
+static void stop_object(struct seq_file *seq, void *v)
+{
+}
+
+static void *next_object(struct seq_file *seq, void *v, loff_t *pos)
+{
+ ++*pos;
+ if (*pos < CONFIG_KFENCE_NUM_OBJECTS)
+ return (void *)((long)*pos + 1);
+ return NULL;
+}
+
+static int show_object(struct seq_file *seq, void *v)
+{
+ struct kfence_metadata *meta = &kfence_metadata[(long)v - 1];
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ kfence_print_object(seq, meta);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ seq_puts(seq, "---------------------------------\n");
+
+ return 0;
+}
+
+static const struct seq_operations object_seqops = {
+ .start = start_object,
+ .next = next_object,
+ .stop = stop_object,
+ .show = show_object,
+};
+
+static int open_objects(struct inode *inode, struct file *file)
+{
+ return seq_open(file, &object_seqops);
+}
+
+static const struct file_operations objects_fops = {
+ .open = open_objects,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+static int __init kfence_debugfs_init(void)
+{
+ struct dentry *kfence_dir = debugfs_create_dir("kfence", NULL);
+
+ debugfs_create_file("stats", 0444, kfence_dir, NULL, &stats_fops);
+ debugfs_create_file("objects", 0400, kfence_dir, NULL, &objects_fops);
+ return 0;
+}
+
+late_initcall(kfence_debugfs_init);
+
+/* === Allocation Gate Timer ================================================ */
+
+/*
+ * Set up delayed work, which will enable and disable the static key. We need to
+ * use a work queue (rather than a simple timer), since enabling and disabling a
+ * static key cannot be done from an interrupt.
+ *
+ * Note: Toggling a static branch currently causes IPIs, and here we'll end up
+ * with a total of 2 IPIs to all CPUs. If this ends up a problem in future (with
+ * more aggressive sampling intervals), we could get away with a variant that
+ * avoids IPIs, at the cost of not immediately capturing allocations if the
+ * instructions remain cached.
+ */
+static struct delayed_work kfence_timer;
+static void toggle_allocation_gate(struct work_struct *work)
+{
+ if (!READ_ONCE(kfence_enabled))
+ return;
+
+ /* Enable static key, and await allocation to happen. */
+ atomic_set(&kfence_allocation_gate, 0);
+#ifdef CONFIG_KFENCE_STATIC_KEYS
+ static_branch_enable(&kfence_allocation_key);
+ /*
+ * Await an allocation. Timeout after 1 second, in case the kernel stops
+ * doing allocations, to avoid stalling this worker task for too long.
+ */
+ {
+ unsigned long end_wait = jiffies + HZ;
+
+ do {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (atomic_read(&kfence_allocation_gate) != 0)
+ break;
+ schedule_timeout(1);
+ } while (time_before(jiffies, end_wait));
+ __set_current_state(TASK_RUNNING);
+ }
+ /* Disable static key and reset timer. */
+ static_branch_disable(&kfence_allocation_key);
+#endif
+ schedule_delayed_work(&kfence_timer, msecs_to_jiffies(kfence_sample_interval));
+}
+static DECLARE_DELAYED_WORK(kfence_timer, toggle_allocation_gate);
+
+/* === Public interface ===================================================== */
+
+void __init kfence_alloc_pool(void)
+{
+ if (!kfence_sample_interval)
+ return;
+
+ __kfence_pool = memblock_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
+
+ if (!__kfence_pool)
+ pr_err("failed to allocate pool\n");
+}
+
+void __init kfence_init(void)
+{
+ /* Setting kfence_sample_interval to 0 on boot disables KFENCE. */
+ if (!kfence_sample_interval)
+ return;
+
+ if (!kfence_init_pool()) {
+ pr_err("%s failed\n", __func__);
+ return;
+ }
+
+ WRITE_ONCE(kfence_enabled, true);
+ schedule_delayed_work(&kfence_timer, 0);
+ pr_info("initialized - using %lu bytes for %d objects", KFENCE_POOL_SIZE,
+ CONFIG_KFENCE_NUM_OBJECTS);
+ if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+ pr_cont(" at 0x%px-0x%px\n", (void *)__kfence_pool,
+ (void *)(__kfence_pool + KFENCE_POOL_SIZE));
+ else
+ pr_cont("\n");
+}
+
+void kfence_shutdown_cache(struct kmem_cache *s)
+{
+ unsigned long flags;
+ struct kfence_metadata *meta;
+ int i;
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ bool in_use;
+
+ meta = &kfence_metadata[i];
+
+ /*
+ * If we observe some inconsistent cache and state pair where we
+ * should have returned false here, cache destruction is racing
+ * with either kmem_cache_alloc() or kmem_cache_free(). Taking
+ * the lock will not help, as different critical section
+ * serialization will have the same outcome.
+ */
+ if (READ_ONCE(meta->cache) != s ||
+ READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+ continue;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+
+ if (in_use) {
+ /*
+ * This cache still has allocations, and we should not
+ * release them back into the freelist so they can still
+ * safely be used and retain the kernel's default
+ * behaviour of keeping the allocations alive (leak the
+ * cache); however, they effectively become "zombie
+ * allocations" as the KFENCE objects are the only ones
+ * still in use and the owning cache is being destroyed.
+ *
+ * We mark them freed, so that any subsequent use shows
+ * more useful error messages that will include stack
+ * traces of the user of the object, the original
+ * allocation, and caller to shutdown_cache().
+ */
+ kfence_guarded_free((void *)meta->addr, meta, /*zombie=*/true);
+ }
+ }
+
+ for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
+ meta = &kfence_metadata[i];
+
+ /* See above. */
+ if (READ_ONCE(meta->cache) != s || READ_ONCE(meta->state) != KFENCE_OBJECT_FREED)
+ continue;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ if (meta->cache == s && meta->state == KFENCE_OBJECT_FREED)
+ meta->cache = NULL;
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
+ }
+}
+
+void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
+{
+ /*
+ * allocation_gate only needs to become non-zero, so it doesn't make
+ * sense to continue writing to it and pay the associated contention
+ * cost, in case we have a large number of concurrent allocations.
+ */
+ if (atomic_read(&kfence_allocation_gate) || atomic_inc_return(&kfence_allocation_gate) > 1)
+ return NULL;
+
+ if (!READ_ONCE(kfence_enabled))
+ return NULL;
+
+ if (size > PAGE_SIZE)
+ return NULL;
+
+ return kfence_guarded_alloc(s, size, flags);
+}
+
+size_t kfence_ksize(const void *addr)
+{
+ const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * Read locklessly -- if there is a race with __kfence_alloc(), this is
+ * either a use-after-free or invalid access.
+ */
+ return meta ? meta->size : 0;
+}
+
+void *kfence_object_start(const void *addr)
+{
+ const struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * Read locklessly -- if there is a race with __kfence_alloc(), this is
+ * either a use-after-free or invalid access.
+ */
+ return meta ? (void *)meta->addr : NULL;
+}
+
+void __kfence_free(void *addr)
+{
+ struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr);
+
+ /*
+ * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing
+ * the object, as the object page may be recycled for other-typed
+ * objects once it has been freed. meta->cache may be NULL if the cache
+ * was destroyed.
+ */
+ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+ call_rcu(&meta->rcu_head, rcu_guarded_free);
+ else
+ kfence_guarded_free(addr, meta, false);
+}
+
+bool kfence_handle_page_fault(unsigned long addr)
+{
+ const int page_index = (addr - (unsigned long)__kfence_pool) / PAGE_SIZE;
+ struct kfence_metadata *to_report = NULL;
+ enum kfence_error_type error_type;
+ unsigned long flags;
+
+ if (!is_kfence_address((void *)addr))
+ return false;
+
+ if (!READ_ONCE(kfence_enabled)) /* If disabled at runtime ... */
+ return kfence_unprotect(addr); /* ... unprotect and proceed. */
+
+ atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
+
+ if (page_index % 2) {
+ /* This is a redzone, report a buffer overflow. */
+ struct kfence_metadata *meta;
+ int distance = 0;
+
+ meta = addr_to_metadata(addr - PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ to_report = meta;
+ /* Data race ok; distance calculation approximate. */
+ distance = addr - data_race(meta->addr + meta->size);
+ }
+
+ meta = addr_to_metadata(addr + PAGE_SIZE);
+ if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ /* Data race ok; distance calculation approximate. */
+ if (!to_report || distance > data_race(meta->addr) - addr)
+ to_report = meta;
+ }
+
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ to_report->unprotected_page = addr;
+ error_type = KFENCE_ERROR_OOB;
+
+ /*
+ * If the object was freed before we took the look we can still
+ * report this as an OOB -- the report will simply show the
+ * stacktrace of the free as well.
+ */
+ } else {
+ to_report = addr_to_metadata(addr);
+ if (!to_report)
+ goto out;
+
+ raw_spin_lock_irqsave(&to_report->lock, flags);
+ error_type = KFENCE_ERROR_UAF;
+ /*
+ * We may race with __kfence_alloc(), and it is possible that a
+ * freed object may be reallocated. We simply report this as a
+ * use-after-free, with the stack trace showing the place where
+ * the object was re-allocated.
+ */
+ }
+
+out:
+ if (to_report) {
+ kfence_report_error(addr, to_report, error_type);
+ raw_spin_unlock_irqrestore(&to_report->lock, flags);
+ } else {
+ /* This may be a UAF or OOB access, but we can't be sure. */
+ kfence_report_error(addr, NULL, KFENCE_ERROR_INVALID);
+ }
+
+ return kfence_unprotect(addr); /* Unprotect and let access proceed. */
+}
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
new file mode 100644
index 000000000000..1014060f9707
--- /dev/null
+++ b/mm/kfence/kfence.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Kernel Electric-Fence (KFENCE). For more info please see
+ * Documentation/dev-tools/kfence.rst.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#ifndef MM_KFENCE_KFENCE_H
+#define MM_KFENCE_KFENCE_H
+
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include "../slab.h" /* for struct kmem_cache */
+
+/* For non-debug builds, avoid leaking kernel pointers into dmesg. */
+#ifdef CONFIG_DEBUG_KERNEL
+#define PTR_FMT "%px"
+#else
+#define PTR_FMT "%p"
+#endif
+
+/*
+ * Get the canary byte pattern for @addr. Use a pattern that varies based on the
+ * lower 3 bits of the address, to detect memory corruptions with higher
+ * probability, where similar constants are used.
+ */
+#define KFENCE_CANARY_PATTERN(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7))
+
+/* Maximum stack depth for reports. */
+#define KFENCE_STACK_DEPTH 64
+
+/* KFENCE object states. */
+enum kfence_object_state {
+ KFENCE_OBJECT_UNUSED, /* Object is unused. */
+ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */
+ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */
+};
+
+/* Alloc/free tracking information. */
+struct kfence_track {
+ pid_t pid;
+ int num_stack_entries;
+ unsigned long stack_entries[KFENCE_STACK_DEPTH];
+};
+
+/* KFENCE metadata per guarded allocation. */
+struct kfence_metadata {
+ struct list_head list; /* Freelist node; access under kfence_freelist_lock. */
+ struct rcu_head rcu_head; /* For delayed freeing. */
+
+ /*
+ * Lock protecting below data; to ensure consistency of the below data,
+ * since the following may execute concurrently: __kfence_alloc(),
+ * __kfence_free(), kfence_handle_page_fault(). However, note that we
+ * cannot grab the same metadata off the freelist twice, and multiple
+ * __kfence_alloc() cannot run concurrently on the same metadata.
+ */
+ raw_spinlock_t lock;
+
+ /* The current state of the object; see above. */
+ enum kfence_object_state state;
+
+ /*
+ * Allocated object address; cannot be calculated from size, because of
+ * alignment requirements.
+ *
+ * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant.
+ */
+ unsigned long addr;
+
+ /*
+ * The size of the original allocation.
+ */
+ size_t size;
+
+ /*
+ * The kmem_cache cache of the last allocation; NULL if never allocated
+ * or the cache has already been destroyed.
+ */
+ struct kmem_cache *cache;
+
+ /*
+ * In case of an invalid access, the page that was unprotected; we
+ * optimistically only store one address.
+ */
+ unsigned long unprotected_page;
+
+ /* Allocation and free stack information. */
+ struct kfence_track alloc_track;
+ struct kfence_track free_track;
+};
+
+extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
+
+/* KFENCE error types for report generation. */
+enum kfence_error_type {
+ KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */
+ KFENCE_ERROR_UAF, /* Detected a use-after-free access. */
+ KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */
+ KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */
+ KFENCE_ERROR_INVALID_FREE, /* Invalid free. */
+};
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+ enum kfence_error_type type);
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta);
+
+#endif /* MM_KFENCE_KFENCE_H */
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
new file mode 100644
index 000000000000..64f27c8d46a3
--- /dev/null
+++ b/mm/kfence/report.c
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * KFENCE reporting.
+ *
+ * Copyright (C) 2020, Google LLC.
+ */
+
+#include <stdarg.h>
+
+#include <linux/kernel.h>
+#include <linux/lockdep.h>
+#include <linux/printk.h>
+#include <linux/seq_file.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+
+#include <asm/kfence.h>
+
+#include "kfence.h"
+
+/* Helper function to either print to a seq_file or to console. */
+__printf(2, 3)
+static void seq_con_printf(struct seq_file *seq, const char *fmt, ...)
+{
+ va_list args;
+
+ va_start(args, fmt);
+ if (seq)
+ seq_vprintf(seq, fmt, args);
+ else
+ vprintk(fmt, args);
+ va_end(args);
+}
+
+/*
+ * Get the number of stack entries to skip to get out of MM internals. @type is
+ * optional, and if set to NULL, assumes an allocation or free stack.
+ */
+static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries,
+ const enum kfence_error_type *type)
+{
+ char buf[64];
+ int skipnr, fallback = 0;
+ bool is_access_fault = false;
+
+ if (type) {
+ /* Depending on error type, find different stack entries. */
+ switch (*type) {
+ case KFENCE_ERROR_UAF:
+ case KFENCE_ERROR_OOB:
+ case KFENCE_ERROR_INVALID:
+ is_access_fault = true;
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ case KFENCE_ERROR_INVALID_FREE:
+ break;
+ }
+ }
+
+ for (skipnr = 0; skipnr < num_entries; skipnr++) {
+ int len = scnprintf(buf, sizeof(buf), "%ps", (void *)stack_entries[skipnr]);
+
+ if (is_access_fault) {
+ if (!strncmp(buf, KFENCE_SKIP_ARCH_FAULT_HANDLER, len))
+ goto found;
+ } else {
+ if (str_has_prefix(buf, "kfence_") || str_has_prefix(buf, "__kfence_") ||
+ !strncmp(buf, "__slab_free", len)) {
+ /*
+ * In case of tail calls from any of the below
+ * to any of the above.
+ */
+ fallback = skipnr + 1;
+ }
+
+ /* Also the *_bulk() variants by only checking prefixes. */
+ if (str_has_prefix(buf, "kfree") ||
+ str_has_prefix(buf, "kmem_cache_free") ||
+ str_has_prefix(buf, "__kmalloc") ||
+ str_has_prefix(buf, "kmem_cache_alloc"))
+ goto found;
+ }
+ }
+ if (fallback < num_entries)
+ return fallback;
+found:
+ skipnr++;
+ return skipnr < num_entries ? skipnr : 0;
+}
+
+static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadata *meta,
+ bool show_alloc)
+{
+ const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
+
+ if (track->num_stack_entries) {
+ /* Skip allocation/free internals stack. */
+ int i = get_stack_skipnr(track->stack_entries, track->num_stack_entries, NULL);
+
+ /* stack_trace_seq_print() does not exist; open code our own. */
+ for (; i < track->num_stack_entries; i++)
+ seq_con_printf(seq, " %pS\n", (void *)track->stack_entries[i]);
+ } else {
+ seq_con_printf(seq, " no %s stack\n", show_alloc ? "allocation" : "deallocation");
+ }
+}
+
+void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta)
+{
+ const int size = abs(meta->size);
+ const unsigned long start = meta->addr;
+ const struct kmem_cache *const cache = meta->cache;
+
+ lockdep_assert_held(&meta->lock);
+
+ if (meta->state == KFENCE_OBJECT_UNUSED) {
+ seq_con_printf(seq, "kfence-#%zd unused\n", meta - kfence_metadata);
+ return;
+ }
+
+ seq_con_printf(seq,
+ "kfence-#%zd [0x" PTR_FMT "-0x" PTR_FMT
+ ", size=%d, cache=%s] allocated by task %d:\n",
+ meta - kfence_metadata, (void *)start, (void *)(start + size - 1), size,
+ (cache && cache->name) ? cache->name : "<destroyed>", meta->alloc_track.pid);
+ kfence_print_stack(seq, meta, true);
+
+ if (meta->state == KFENCE_OBJECT_FREED) {
+ seq_con_printf(seq, "\nfreed by task %d:\n", meta->free_track.pid);
+ kfence_print_stack(seq, meta, false);
+ }
+}
+
+/*
+ * Show bytes at @addr that are different from the expected canary values, up to
+ * @max_bytes.
+ */
+static void print_diff_canary(unsigned long address, size_t bytes_to_show,
+ const struct kfence_metadata *meta)
+{
+ const unsigned long show_until_addr = address + bytes_to_show;
+ const u8 *cur, *end;
+
+ /* Do not show contents of object nor read into following guard page. */
+ end = (const u8 *)(address < meta->addr ? min(show_until_addr, meta->addr)
+ : min(show_until_addr, PAGE_ALIGN(address)));
+
+ pr_cont("[");
+ for (cur = (const u8 *)address; cur < end; cur++) {
+ if (*cur == KFENCE_CANARY_PATTERN(cur))
+ pr_cont(" .");
+ else if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+ pr_cont(" 0x%02x", *cur);
+ else /* Do not leak kernel memory in non-debug builds. */
+ pr_cont(" !");
+ }
+ pr_cont(" ]");
+}
+
+void kfence_report_error(unsigned long address, const struct kfence_metadata *meta,
+ enum kfence_error_type type)
+{
+ unsigned long stack_entries[KFENCE_STACK_DEPTH] = { 0 };
+ int num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 1);
+ int skipnr = get_stack_skipnr(stack_entries, num_stack_entries, &type);
+ const ptrdiff_t object_index = meta ? meta - kfence_metadata : -1;
+
+ /* Require non-NULL meta, except if KFENCE_ERROR_INVALID. */
+ if (WARN_ON(type != KFENCE_ERROR_INVALID && !meta))
+ return;
+
+ if (meta)
+ lockdep_assert_held(&meta->lock);
+ /*
+ * Because we may generate reports in printk-unfriendly parts of the
+ * kernel, such as scheduler code, the use of printk() could deadlock.
+ * Until such time that all printing code here is safe in all parts of
+ * the kernel, accept the risk, and just get our message out (given the
+ * system might already behave unpredictably due to the memory error).
+ * As such, also disable lockdep to hide warnings, and avoid disabling
+ * lockdep for the rest of the kernel.
+ */
+ lockdep_off();
+
+ pr_err("==================================================================\n");
+ /* Print report header. */
+ switch (type) {
+ case KFENCE_ERROR_OOB: {
+ const bool left_of_object = address < meta->addr;
+
+ pr_err("BUG: KFENCE: out-of-bounds in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Out-of-bounds access at 0x" PTR_FMT " (%luB %s of kfence-#%zd):\n",
+ (void *)address,
+ left_of_object ? meta->addr - address : address - meta->addr,
+ left_of_object ? "left" : "right", object_index);
+ break;
+ }
+ case KFENCE_ERROR_UAF:
+ pr_err("BUG: KFENCE: use-after-free in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Use-after-free access at 0x" PTR_FMT " (in kfence-#%zd):\n",
+ (void *)address, object_index);
+ break;
+ case KFENCE_ERROR_CORRUPTION:
+ pr_err("BUG: KFENCE: memory corruption in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Corrupted memory at 0x" PTR_FMT " ", (void *)address);
+ print_diff_canary(address, 16, meta);
+ pr_cont(" (in kfence-#%zd):\n", object_index);
+ break;
+ case KFENCE_ERROR_INVALID:
+ pr_err("BUG: KFENCE: invalid access in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Invalid access at 0x" PTR_FMT ":\n", (void *)address);
+ break;
+ case KFENCE_ERROR_INVALID_FREE:
+ pr_err("BUG: KFENCE: invalid free in %pS\n\n", (void *)stack_entries[skipnr]);
+ pr_err("Invalid free of 0x" PTR_FMT " (in kfence-#%zd):\n", (void *)address,
+ object_index);
+ break;
+ }
+
+ /* Print stack trace and object info. */
+ stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, 0);
+
+ if (meta) {
+ pr_err("\n");
+ kfence_print_object(NULL, meta);
+ }
+
+ /* Print report footer. */
+ pr_err("\n");
+ dump_stack_print_info(KERN_ERR);
+ pr_err("==================================================================\n");
+
+ lockdep_on();
+
+ if (panic_on_warn)
+ panic("panic_on_warn set ...\n");
+
+ /* We encountered a memory unsafety error, taint the kernel! */
+ add_taint(TAINT_BAD_PAGE, LOCKDEP_STILL_OK);
+}
--
2.25.1
2
1

[PATCH kernel-4.19 1/2] ACPI / APEI: Add a notifier chain for unknown (vendor) CPER records
by Yang Yingliang 30 Sep '21
by Yang Yingliang 30 Sep '21
30 Sep '21
From: Shiju Jose <shiju.jose(a)huawei.com>
mainline inclusion
from mainline-v5.10-rc1
commit 9aa9cf3ee9451d08adafc03cef8e44c7ea3898e7
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CMAR
CVE: NA
--------------------------------
CPER records describing a firmware-first error are identified by GUID.
The ghes driver currently logs, but ignores any unknown CPER records.
This prevents describing errors that can't be represented by a standard
entry, that would otherwise allow a driver to recover from an error.
The UEFI spec calls these 'Non-standard Section Body' (N.2.3 of
version 2.8).
Add a notifier chain for these non-standard/vendor-records. Callers
must identify their type of records by GUID.
Record data is copied to memory from the ghes_estatus_pool to allow
us to keep it until after the notifier has run.
Co-developed-by: James Morse <james.morse(a)arm.com>
Link: https://lore.kernel.org/r/20200903123456.1823-2-shiju.jose@huawei.com
Signed-off-by: James Morse <james.morse(a)arm.com>
Signed-off-by: Shiju Jose <shiju.jose(a)huawei.com>
Signed-off-by: Lorenzo Pieralisi <lorenzo.pieralisi(a)arm.com>
Acked-by: "Rafael J. Wysocki" <rjw(a)rjwysocki.net>
Signed-off-by: Weilong Chen <chenweilong(a)huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
drivers/acpi/apei/ghes.c | 64 ++++++++++++++++++++++++++++++++++++++++
include/acpi/ghes.h | 18 +++++++++++
2 files changed, 82 insertions(+)
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e807e8f74d1e0..08658c433b1fc 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -84,6 +84,12 @@
((struct acpi_hest_generic_status *) \
((struct ghes_estatus_node *)(estatus_node) + 1))
+#define GHES_VENDOR_ENTRY_LEN(gdata_len) \
+ (sizeof(struct ghes_vendor_record_entry) + (gdata_len))
+#define GHES_GDATA_FROM_VENDOR_ENTRY(vendor_entry) \
+ ((struct acpi_hest_generic_data *) \
+ ((struct ghes_vendor_record_entry *)(vendor_entry) + 1))
+
static inline bool is_hest_type_generic_v2(struct ghes *ghes)
{
return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
@@ -126,6 +132,12 @@ EXPORT_SYMBOL(ghes_ts_err_chain);
static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+struct ghes_vendor_record_entry {
+ struct work_struct work;
+ int error_severity;
+ char vendor_record[];
+};
+
static struct gen_pool *ghes_estatus_pool;
static unsigned long ghes_estatus_pool_size_request;
@@ -464,6 +476,57 @@ static void ghes_handle_aer(struct acpi_hest_generic_data *gdata)
#endif
}
+static BLOCKING_NOTIFIER_HEAD(vendor_record_notify_list);
+
+int ghes_register_vendor_record_notifier(struct notifier_block *nb)
+{
+ return blocking_notifier_chain_register(&vendor_record_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(ghes_register_vendor_record_notifier);
+
+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb)
+{
+ blocking_notifier_chain_unregister(&vendor_record_notify_list, nb);
+}
+EXPORT_SYMBOL_GPL(ghes_unregister_vendor_record_notifier);
+
+static void ghes_vendor_record_work_func(struct work_struct *work)
+{
+ struct ghes_vendor_record_entry *entry;
+ struct acpi_hest_generic_data *gdata;
+ u32 len;
+
+ entry = container_of(work, struct ghes_vendor_record_entry, work);
+ gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);
+
+ blocking_notifier_call_chain(&vendor_record_notify_list,
+ entry->error_severity, gdata);
+
+ len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
+ gen_pool_free(ghes_estatus_pool, (unsigned long)entry, len);
+}
+
+static void ghes_defer_non_standard_event(struct acpi_hest_generic_data *gdata,
+ int sev)
+{
+ struct acpi_hest_generic_data *copied_gdata;
+ struct ghes_vendor_record_entry *entry;
+ u32 len;
+
+ len = GHES_VENDOR_ENTRY_LEN(acpi_hest_get_record_size(gdata));
+ entry = (void *)gen_pool_alloc(ghes_estatus_pool, len);
+ if (!entry)
+ return;
+
+ copied_gdata = GHES_GDATA_FROM_VENDOR_ENTRY(entry);
+ memcpy(copied_gdata, gdata, acpi_hest_get_record_size(gdata));
+ entry->error_severity = sev;
+
+ INIT_WORK(&entry->work, ghes_vendor_record_work_func);
+ schedule_work(&entry->work);
+}
+
+
void __weak ghes_arm_process_error(struct ghes *ghes,
struct cper_sec_proc_arm *err, int sec_sev)
{
@@ -517,6 +580,7 @@ static void ghes_do_proc(struct ghes *ghes,
} else {
void *err = acpi_hest_get_payload(gdata);
+ ghes_defer_non_standard_event(gdata, sev);
log_non_standard_event(sec_type, fru_id, fru_text,
sec_sev, err,
gdata->error_data_length);
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 9aaeaaa3d1a7f..0e96c78f9f180 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -52,6 +52,24 @@ enum {
GHES_SEV_PANIC = 0x3,
};
+#ifdef CONFIG_ACPI_APEI_GHES
+/**
+ * ghes_register_vendor_record_notifier - register a notifier for vendor
+ * records that the kernel would otherwise ignore.
+ * @nb: pointer to the notifier_block structure of the event handler.
+ *
+ * return 0 : SUCCESS, non-zero : FAIL
+ */
+int ghes_register_vendor_record_notifier(struct notifier_block *nb);
+
+/**
+ * ghes_unregister_vendor_record_notifier - unregister the previously
+ * registered vendor record notifier.
+ * @nb: pointer to the notifier_block structure of the vendor record handler.
+ */
+void ghes_unregister_vendor_record_notifier(struct notifier_block *nb);
+#endif
+
/* From drivers/edac/ghes_edac.c */
#ifdef CONFIG_EDAC_GHES
--
2.25.1
1
1

[PATCH kernel-4.19] blk-mq-sched: Fix blk_mq_sched_alloc_tags() error handling
by Yang Yingliang 30 Sep '21
by Yang Yingliang 30 Sep '21
30 Sep '21
From: John Garry <john.garry(a)huawei.com>
mainline inclusion
from mainline-v5.14-rc1
commit b93af3055d6f32d3b0361cfdb110c9399c1241ba
category: bugfix
bugzilla: 177012
CVE: NA
---------------------------
If the blk_mq_sched_alloc_tags() -> blk_mq_alloc_rqs() call fails, then we
call blk_mq_sched_free_tags() -> blk_mq_free_rqs().
It is incorrect to do so, as any rqs would have already been freed in the
blk_mq_alloc_rqs() call.
Fix by calling blk_mq_free_rq_map() only directly.
Fixes: 6917ff0b5bd41 ("blk-mq-sched: refactor scheduler initialization")
Signed-off-by: John Garry <john.garry(a)huawei.com>
Reviewed-by: Ming Lei <ming.lei(a)redhat.com>
Link: https://lore.kernel.org/r/1627378373-148090-1-git-send-email-john.garry@hua…
Signed-off-by: Jens Axboe <axboe(a)kernel.dk>
conflicts:
block/blk-mq-sched.c
Signed-off-by: Laibin Qiu <qiulaibin(a)huawei.com>
Reviewed-by: Jason Yan <yanaijie(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
block/blk-mq-sched.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index ce4b2ac6d6977..3521eca1b2984 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -513,8 +513,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
- if (ret)
- blk_mq_sched_free_tags(set, hctx, hctx_idx);
+ if (ret) {
+ blk_mq_free_rq_map(hctx->sched_tags);
+ hctx->sched_tags = NULL;
+ }
return ret;
}
--
2.25.1
1
0

[PATCH kernel-4.19 1/3] jbd2: Drop unnecessary branch from jbd2_journal_forget()
by Yang Yingliang 30 Sep '21
by Yang Yingliang 30 Sep '21
30 Sep '21
From: Jan Kara <jack(a)suse.cz>
mainline inclusion
from mainline-5.5-rc1
commit 6d69843e5d3f0c394e1e3004cc2b36efbe402b71
category: bugfix
bugzilla: 176007
CVE: NA
---------------------------
We have cleared both dirty & jbddirty bits from the bh. So there's no
difference between bforget() and brelse(). Thus there's no point jumping
to no_jbd branch.
Signed-off-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20190809124233.13277-5-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Conflicts:
fs/jbd2/transaction.c
Signed-off-by: yangerkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
fs/jbd2/transaction.c | 4 ----
1 file changed, 4 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 4055929a043cf..c48658a4d53c0 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1639,10 +1639,6 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh)
__jbd2_journal_file_buffer(jh, transaction, BJ_Forget);
} else {
__jbd2_journal_unfile_buffer(jh);
- if (!buffer_jbd(bh)) {
- spin_unlock(&journal->j_list_lock);
- goto not_jbd;
- }
}
spin_unlock(&journal->j_list_lock);
} else if (jh->b_transaction) {
--
2.25.1
1
2

30 Sep '21
From: Longfang Liu <liulongfang(a)huawei.com>
mainline inclusion
from mainline-v5.11-rc5
commit 643a4df7fe3f6831d14536fd692be85f92670a52
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CDK3?from=project-issue
CVE: NA
----------------------------------------
The system that use Synopsys USB host controllers goes to suspend
when using USB audio player. This causes the USB host controller
continuous send interrupt signal to system, When the number of
interrupts exceeds 100000, the system will forcibly close the
interrupts and output a calltrace error.
When the system goes to suspend, the last interrupt is reported to
the driver. At this time, the system has set the state to suspend.
This causes the last interrupt to not be processed by the system and
not clear the interrupt flag. This uncleared interrupt flag constantly
triggers new interrupt event. This causing the driver to receive more
than 100,000 interrupts, which causes the system to forcibly close the
interrupt report and report the calltrace error.
so, when the driver goes to sleep and changes the system state to
suspend, the interrupt flag needs to be cleared.
Signed-off-by: Longfang Liu <liulongfang(a)huawei.com>
Acked-by: Alan Stern <stern(a)rowland.harvard.edu>
Link: https://lore.kernel.org/r/1610416647-45774-1-git-send-email-liulongfang@hua…
Cc: stable <stable(a)vger.kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Signed-off-by: Longfang Liu <liulongfang(a)huawei.com>
Acked-by: Xie XiuQi <xiexiuqi(a)huawei.com>
Reviewed-by: Cheng Jian <cj.chengjian(a)huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang(a)huawei.com>
---
drivers/usb/host/ehci-hub.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
index ce0eaf7d7c12a..a99c1ac5d8c8b 100644
--- a/drivers/usb/host/ehci-hub.c
+++ b/drivers/usb/host/ehci-hub.c
@@ -346,8 +346,12 @@ static int ehci_bus_suspend (struct usb_hcd *hcd)
unlink_empty_async_suspended(ehci);
+ /* Some Synopsys controllers mistakenly leave IAA turned on */
+ ehci_writel(ehci, STS_IAA, &ehci->regs->status);
+
/* Any IAA cycle that started before the suspend is now invalid */
end_iaa_cycle(ehci);
+
ehci_handle_start_intr_unlinks(ehci);
ehci_handle_intr_unlinks(ehci);
end_free_itds(ehci);
--
2.25.1
1
0