Kernel

[RFC PATCH openEuler-1.0-LTS] sched: memqos: add memqos for dynamic affinity
by Wang ShaoBo 22 Mar '23
Add a debug memband interface to dynamic affinity; this is useful for
threads that are sensitive to memory bandwidth.
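For reference, below is a minimal userspace sketch (not part of the patch) of how the
debug interface could be exercised. It assumes the sysctl table ends up under
/proc/sys/kernel/phase/, since the kernel/sysctl.c hunk registers a "phase" directory
with "enabled" and "trace_enabled" entries in kern_table, and that the resulting
trace_printk output is read from the usual tracefs trace file.

/*
 * Sketch only: toggle phase monitoring and per-task tracing.
 * Paths are assumptions derived from the sysctl hunk below.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* 1. Enable phase monitoring (creates the pinned perf events per CPU). */
	int fd = open("/proc/sys/kernel/phase/enabled", O_WRONLY);

	if (fd < 0) {
		perror("open phase/enabled");
		return 1;
	}
	if (write(fd, "1", 1) != 1)
		perror("write enabled");
	close(fd);

	/* 2. Optionally enable the per-task trace_printk output. */
	fd = open("/proc/sys/kernel/phase/trace_enabled", O_WRONLY);
	if (fd >= 0) {
		if (write(fd, "1", 1) != 1)
			perror("write trace_enabled");
		close(fd);
	}

	/* The IPC/memband samples then appear in the ftrace ring buffer,
	 * e.g. /sys/kernel/debug/tracing/trace. */
	return 0;
}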
Signed-off-by: Wang ShaoBo <bobo.shaobowang(a)huawei.com>
---
arch/arm64/include/asm/mpam.h | 2 +
arch/arm64/include/asm/mpam_sched.h | 2 +
arch/arm64/kernel/mpam/mpam_device.c | 58 ++-
arch/arm64/kernel/mpam/mpam_resctrl.c | 65 ++++
include/linux/memqos.h | 142 +++++++
include/linux/sched.h | 14 +-
include/linux/sysctl.h | 2 +
kernel/cgroup/cpuset.c | 1 +
kernel/exit.c | 3 +
kernel/fork.c | 4 +
kernel/sched/Makefile | 1 +
kernel/sched/core.c | 29 +-
kernel/sched/fair.c | 14 +-
kernel/sched/memqos/Makefile | 6 +
kernel/sched/memqos/memqos.c | 297 +++++++++++++++
kernel/sched/memqos/phase_feature_sysctl.c | 126 +++++++
kernel/sched/memqos/phase_memband.c | 145 ++++++++
kernel/sched/memqos/phase_perf.c | 409 +++++++++++++++++++++
kernel/sched/memqos/phase_sim_knn.c | 92 +++++
kernel/sysctl.c | 7 +
mm/mempolicy.c | 10 +-
21 files changed, 1409 insertions(+), 20 deletions(-)
create mode 100644 include/linux/memqos.h
create mode 100644 kernel/sched/memqos/Makefile
create mode 100644 kernel/sched/memqos/memqos.c
create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
create mode 100644 kernel/sched/memqos/phase_memband.c
create mode 100644 kernel/sched/memqos/phase_perf.c
create mode 100644 kernel/sched/memqos/phase_sim_knn.c
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
index 6338eab817e75..269a91d8ca907 100644
--- a/arch/arm64/include/asm/mpam.h
+++ b/arch/arm64/include/asm/mpam.h
@@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM
extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr);
#endif
#endif /* _ASM_ARM64_MPAM_H */
diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h
index 08ed349b6efa1..32d08cf654b31 100644
--- a/arch/arm64/include/asm/mpam_sched.h
+++ b/arch/arm64/include/asm/mpam_sched.h
@@ -40,6 +40,8 @@ static inline void mpam_sched_in(void)
__mpam_sched_in();
}
+void __mpam_sched_in_v2(struct task_struct *tsk);
+
#else
static inline void mpam_sched_in(void) {}
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c
index 6455c69f132fd..48de3982a0b9a 100644
--- a/arch/arm64/kernel/mpam/mpam_device.c
+++ b/arch/arm64/kernel/mpam/mpam_device.c
@@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void)
static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg);
}
@@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort. If we're lucky, we corrupt another mpam:component.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg);
}
@@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
{
u16 mon;
u32 clt, flt, cur_clt, cur_flt;
+ u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
wmb();
}
- return mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ return total / 5;
}
static int mpam_device_frob_mon(struct mpam_device *dev,
@@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx)
cpumask_set_cpu(smp_processor_id(), &ctx->updated_on);
}
+static DEFINE_SPINLOCK(mpam_tmp_lock);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr)
+{
+ struct mpam_class *class;
+ struct mpam_component *comp;
+ struct mpam_device *dev;
+ struct sync_args args;
+ int i = 0;
+
+ args.pmg = pmg;
+ args.mon = monitor;
+ args.closid.reqpartid = partid;
+ args.match_pmg = 1;
+
+ spin_lock(&mpam_tmp_lock);
+ list_for_each_entry(class, &mpam_classes, classes_list) {
+ if (class->type != MPAM_CLASS_MEMORY)
+ continue;
+
+ list_for_each_entry(comp, &class->components, class_list) {
+ if (i >= nr) {
+ pr_err_once("error, i > result nr");
+ break;
+ }
+ result[i] = 0;
+ list_for_each_entry(dev, &comp->devices, comp_list) {
+ result[i] += mpam_device_read_mbwu_mon(dev, &args);
+ }
+ i++;
+ }
+ break;
+ }
+ spin_unlock(&mpam_tmp_lock);
+
+}
+EXPORT_SYMBOL(mpam_component_config_mbwu_mon);
+
/**
* in some cases/platforms the MSC register access is only possible with
* the associated CPUs. And need to check if those CPUS are online before
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index 60d3d8706a38b..f4d87964616f2 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -2226,6 +2226,71 @@ int mpam_resctrl_init(void)
return resctrl_group_init();
}
+
+void __mpam_sched_in_v2(struct task_struct *tsk)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u64 partid_d, partid_i;
+ u64 rmid = state->default_rmid;
+ u64 closid = state->default_closid;
+ u64 reqpartid = 0;
+ u64 pmg = 0;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (static_branch_likely(&resctrl_alloc_enable_key)) {
+ if (tsk->closid)
+ closid = tsk->closid;
+ }
+
+ if (static_branch_likely(&resctrl_mon_enable_key)) {
+ if (tsk->rmid)
+ rmid = tsk->rmid;
+ }
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ u64 reg;
+
+ resctrl_navie_rmid_partid_pmg(rmid, (int *)&reqpartid, (int *)&pmg);
+
+ if (resctrl_cdp_enabled) {
+ resctrl_cdp_mpamid_map_val(reqpartid, CDP_DATA, partid_d);
+ resctrl_cdp_mpamid_map_val(reqpartid, CDP_CODE, partid_i);
+
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_D_SET(reg, partid_d);
+ reg = PARTID_I_SET(reg, partid_i);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_D_SET(reg, partid_d);
+ reg = PARTID_I_SET(reg, partid_i);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ } else {
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_SET(reg, reqpartid);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_SET(reg, reqpartid);
+ reg = PMG_SET(reg, pmg);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ }
+
+ state->cur_rmid = rmid;
+ state->cur_closid = closid;
+ }
+}
+
/*
* __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
diff --git a/include/linux/memqos.h b/include/linux/memqos.h
new file mode 100644
index 0000000000000..814e9935590d3
--- /dev/null
+++ b/include/linux/memqos.h
@@ -0,0 +1,142 @@
+#ifndef _MEMQOS_H
+#define _MEMQOS_H
+
+#include <linux/vmstat.h>
+#include <linux/rbtree.h>
+//#include <linux/sched.h>
+
+struct task_struct;
+
+struct memqos_domain {
+ int dom_id;
+ int total_memband_div_10;
+ int total_out_memband_div_10;
+
+ //record 10 timers
+ int memband_ringpos;
+ int memband_div_10_history[4][10];
+};
+
+struct memqos_mpam_profile {
+ int partid;
+ int pmg;
+ int monitor;
+
+ struct task_struct *tsk;
+ int used;
+};
+
+struct memqos_wait_profile {
+ struct memqos_mpam_profile *profile;
+ struct list_head wait_list;
+};
+
+struct memqos_class {
+ struct list_head turbo_list;
+ struct list_head tasks_list;
+};
+
+#include <linux/topology.h>
+//embed in task_struct
+
+struct task_memqos {
+ int ipc_ringpos;
+ int ipcx10;
+ int ipcx10_total[4];
+ int ipcx10_history[10];
+
+ int memband_div_10;
+ int memband_ringpos;
+ int memband_div_10_total[4];
+ int memband_div_10_history[4][10];
+
+ u32 sample_times;
+ int account_ready;
+ int numa_score[4];
+ int turbo;
+
+ struct memqos_wait_profile mpam_profile;
+
+ struct list_head turbo_list;
+ struct list_head task_list;
+
+ struct cpumask *advise_mem_node_mask;
+ int preferred_nid;
+
+ int class_id;
+
+ int corrupt;
+};
+
+#define PHASE_PEVENT_NUM 10
+
+struct phase_event_pcount {
+ u64 data[PHASE_PEVENT_NUM];
+};
+
+struct phase_event_count {
+ struct phase_event_pcount pcount;
+};
+
+void phase_update_mpam_label(struct task_struct *tsk);
+
+void phase_release_mpam_label(struct task_struct *tsk);
+
+static inline void memqos_update_mpam_label(struct task_struct *tsk)
+{
+ phase_update_mpam_label(tsk);
+}
+
+static inline void memqos_release_mpam_label(struct task_struct *tsk)
+{
+ phase_release_mpam_label(tsk);
+}
+
+void phase_destroy_waitqueue(struct task_struct *tsk);
+
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr);
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+int phase_perf_create(void);
+
+void phase_perf_release(void);
+
+void memqos_account_task(struct task_struct *p, int cpu);
+
+void memqos_drop_class(struct task_struct *p);
+
+void phase_account_task(struct task_struct *p, int cpu);
+
+static inline void memqos_task_collect_data(struct task_struct *p, int cpu)
+{
+ phase_account_task(p, cpu);
+}
+
+static inline void memqos_task_account(struct task_struct *p, int cpu)
+{
+ memqos_account_task(p, cpu);
+}
+
+static inline void memqos_task_exit(struct task_struct *p)
+{
+
+ memqos_drop_class(p);
+ phase_destroy_waitqueue(p);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p);
+
+void memqos_exclude_low_level_task_single(struct task_struct *p);
+
+int knn_get_tag(int ipcx10, int memband_div_10);
+
+void memqos_init_class(struct task_struct *p);
+
+void phase_trace_printk(struct task_struct *p);
+static inline void memqos_trace_printk(struct task_struct *p)
+{
+ phase_trace_printk(p);
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 928186f161000..5f710dc5bc03b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,7 @@
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
#include <linux/thread_bits.h>
+#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1268,7 +1269,7 @@ struct task_struct {
#if !defined(__GENKSYMS__)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
cpumask_t *prefer_cpus;
- const cpumask_t *select_cpus;
+ cpumask_t *select_cpus;
#else
KABI_RESERVE(6)
KABI_RESERVE(7)
@@ -1279,6 +1280,10 @@ struct task_struct {
#endif
KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ struct task_memqos sched_memqos;
+#endif
+
/* CPU-specific state of this task: */
struct thread_struct thread;
@@ -1998,6 +2003,13 @@ int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+ return p->prefer_cpus &&
+ !cpumask_empty(p->prefer_cpus) &&
+ !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
+ cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
+}
#endif
#endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd4..73bce39107cb3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[];
+
int sysctl_max_threads(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 55bfbc4cdb16c..d94a9065a5605 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -106,6 +106,7 @@ struct cpuset {
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
+ int mem_turbo;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a32d32bdc03d..b731c19618176 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -699,6 +699,8 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+#include <linux/memqos.h>
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
@@ -806,6 +808,7 @@ void __noreturn do_exit(long code)
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
+ memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b5453a26655e2..0a762b92dc814 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+
+#include <linux/memqos.h>
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk);
+
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c383..471380d6686e3 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 970616070da86..1171025aaa440 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
calculate_sigpending();
}
+#include <linux/memqos.h>
+
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
@@ -2794,6 +2796,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
+ struct rq *ret;
+
prepare_task_switch(rq, prev, next);
/*
@@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ //account and release
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+
+ //label new task's mpamid
+ if (prefer_cpus_valid(next))
+ memqos_update_mpam_label(next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_to(prev, next, prev);
barrier();
- return finish_task_switch(prev);
+ ret = finish_task_switch(prev);
+
+ return ret;
}
/*
@@ -3058,8 +3076,12 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void scheduler_tick(void)
{
int cpu = smp_processor_id();
+ //memqos collects the next cpu's memband and perf
+ int cpu_memqos = (cpu + 1) % nr_cpu_ids;
struct rq *rq = cpu_rq(cpu);
+ struct rq *rq_next = cpu_rq(cpu_memqos);
struct task_struct *curr = rq->curr;
+ struct task_struct *curr_memqos = rq_next->curr;
struct rq_flags rf;
sched_clock_tick();
@@ -3075,6 +3097,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor task enabled dynamic affinity
+ if (curr_memqos && prefer_cpus_valid(curr_memqos))
+ memqos_task_collect_data(curr_memqos, cpu_memqos);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -3524,6 +3550,7 @@ static void __sched notrace __schedule(bool preempt)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ memqos_task_account(prev, smp_processor_id());
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af55a26d11fcb..12e9675495d2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#include <linux/memqos.h>
/*
* Low utilization threshold for CPU
*
@@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu)
return cputime->vutil;
}
-static inline bool prefer_cpus_valid(struct task_struct *p)
-{
- return p->prefer_cpus &&
- !cpumask_empty(p->prefer_cpus) &&
- !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
- cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
-}
-
/*
* set_task_select_cpus: select the cpu range for task
* @p: the task whose available cpu range will to set
@@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
if (util_avg_sum < sysctl_sched_util_low_pct *
cpumask_weight(p->prefer_cpus)) {
p->select_cpus = p->prefer_cpus;
+ memqos_select_nicest_cpus(p);
if (sd_flag & SD_BALANCE_WAKE)
schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
+ } else {
+ //select turbo task
+ //select low class task
+ memqos_exclude_low_level_task_single(p);
}
}
#endif
diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile
new file mode 100644
index 0000000000000..ed8f42649a8a7
--- /dev/null
+++ b/kernel/sched/memqos/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o
diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c
new file mode 100644
index 0000000000000..ddf8785439aa6
--- /dev/null
+++ b/kernel/sched/memqos/memqos.c
@@ -0,0 +1,297 @@
+#include <linux/memqos.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+
+static void memqos_set_task_classid(struct task_struct *p)
+{
+ int class_id;
+ int memband_div_10 = p->sched_memqos.memband_div_10;
+ int ipcx10 = p->sched_memqos.ipcx10;
+
+ class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10);
+ p->sched_memqos.class_id = class_id;
+}
+
+//static memqos_domain mq_domains[] = {
+// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,},
+//};
+
+static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]);
+//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock);
+static DEFINE_SPINLOCK(memqos_class_lock);
+
+static int memqos_class_online(unsigned int cpu)
+{
+ int class_id = 0;
+ struct memqos_class *class;
+
+ for (class_id = 0; class_id < 8; class_id++) {
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+ INIT_LIST_HEAD(&class->tasks_list);
+ INIT_LIST_HEAD(&class->turbo_list);
+ }
+ return 0;
+}
+
+static int memqos_class_offline(unsigned int cpu)
+{
+ return 0;
+}
+
+#include <linux/cpu.h>
+#include <linux/cacheinfo.h>
+
+static void memqos_init(void)
+{
+ int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "memqos:online", memqos_class_online,
+ memqos_class_offline);
+ if (cpuhp_state <= 0) {
+ pr_err("Failed to register 'dyn' cpuhp callbacks");
+ return;
+ }
+}
+late_initcall(memqos_init);
+
+static void memqos_insert_to_class(struct task_struct *p, int cpu)
+{
+ unsigned long flag;
+ int class_id = p->sched_memqos.class_id;
+ struct memqos_class *class;
+ struct task_memqos *memqos;
+
+ if (class_id >= 8)
+ return;
+
+ memqos = &p->sched_memqos;
+
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+
+ spin_lock_irqsave(&memqos_class_lock, flag);
+ if (p->sched_memqos.corrupt) {
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+ return;
+ }
+
+ //pr_info("count:%d %d add (%llx) %llx %llx to list %llx!!!!!!!!!!!!!\n", count, p->pid, &p->sched_memqos.task_list, p->sched_memqos.task_list.next, p->sched_memqos.task_list.prev, &class->tasks_list);
+ list_move_tail(&p->sched_memqos.task_list, &class->tasks_list);
+ if (memqos->turbo)
+ list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list);
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+}
+
+static void memqos_drop_class_without_lock(struct task_struct *p)
+{
+ //pr_info("%d drop (%llx) %llx %llx to list %llx!!!!!!!!!!!!!\n", p->pid, &p->sched_memqos.task_list, p->sched_memqos.task_list.next, p->sched_memqos.task_list.prev);
+ list_del_init(&p->sched_memqos.task_list);
+ list_del_init(&p->sched_memqos.turbo_list);
+}
+
+static void memqos_score(struct task_struct *p)
+{
+ int total_n1 = p->sched_memqos.memband_div_10_total[0];
+ int total_n2 = p->sched_memqos.memband_div_10_total[1];
+ int total_n3 = p->sched_memqos.memband_div_10_total[2];
+ int total_n4 = p->sched_memqos.memband_div_10_total[3];
+
+ p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1;
+ p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2;
+ p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3;
+ p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4;
+
+ //over x% percent
+ if (p->sched_memqos.numa_score[0] > 0)
+ p->sched_memqos.turbo = 1;
+ else if (p->sched_memqos.numa_score[1] > 0)
+ p->sched_memqos.turbo = 2;
+ else if (p->sched_memqos.numa_score[2] > 0)
+ p->sched_memqos.turbo = 3;
+ else if (p->sched_memqos.numa_score[3] > 0)
+ p->sched_memqos.turbo = 4;
+ else
+ p->sched_memqos.turbo = 0;
+}
+
+void memqos_account_task(struct task_struct *p, int cpu)
+{
+ if (!p->sched_memqos.account_ready ||
+ p->sched_memqos.corrupt)
+ return;
+ memqos_set_task_classid(p);
+ memqos_insert_to_class(p, cpu);
+ memqos_score(p);
+ p->sched_memqos.account_ready = 0;
+}
+
+void memqos_init_class(struct task_struct *p)
+{
+ memset(&p->sched_memqos, 0, sizeof(struct task_memqos));
+ spin_lock(&memqos_class_lock);
+ INIT_LIST_HEAD(&p->sched_memqos.task_list);
+ INIT_LIST_HEAD(&p->sched_memqos.turbo_list);
+ INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list);
+ spin_unlock(&memqos_class_lock);
+
+ p->closid = 0;
+ p->rmid = 0;
+}
+
+//destroy ?
+void memqos_drop_class(struct task_struct *p)
+{
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ p->sched_memqos.corrupt = 1;
+ spin_unlock(&memqos_class_lock);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p)
+{
+ int i = 0;
+ int max_score = -10000;
+ int select_node = 0;
+ struct task_memqos *memqos = &p->sched_memqos;
+
+ if (!memqos->turbo) {
+ for (i = 0; i < 4; i++) {
+ if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus))
+ continue;
+
+ if (memqos->numa_score[i] > max_score) {
+ select_node = i;
+ max_score = memqos->numa_score[i];
+ }
+ }
+
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ }
+
+ select_node = memqos->turbo - 1;
+ if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) {
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+
+ return;
+
+ //if turbo another cpus, wait...
+}
+
+void memqos_exclude_low_level_task_single(struct task_struct *p)
+{
+ int i, j, cpu;
+ int find = 0;
+ int select_node = 0;
+ const struct cpumask *cpumask;
+ struct cpumask *cpumask_med;
+ struct memqos_class *class;
+ struct task_memqos *memqos = &p->sched_memqos;
+ struct task_struct *tsk = NULL;
+ int max_score = -100000;
+
+ if (memqos->turbo) {
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus) &&
+ (cpumask_intersects(&p->cpus_allowed, cpumask))) {
+ cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask);
+ memqos_drop_class(p);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ } else if (cpumask_intersects(p->prefer_cpus, cpumask)) {
+ cpumask_and(p->select_cpus, p->prefer_cpus, cpumask);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+ }
+
+ //select turbo one
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ for (i = 7; i >= 0; i--) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->turbo_list, turbo_list) {
+ if (!memqos->turbo)
+ continue;
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus)) {
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ if (!cpumask_intersects(cpumask, &tsk->cpus_allowed))
+ continue;
+ cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ find = 1;
+ break;
+ }
+ }
+ if (find) {
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ find = 0;
+
+ //if not, select lower class's tsk
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ //only find below class tsk
+ for (i = 0; i < memqos->class_id; i++) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->tasks_list, task_list) {
+ if (memqos->turbo)
+ continue;
+
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ for (j = 0; j < 4; j++) {
+ if (!cpumask_intersects(cpumask_of_node(i), &tsk->cpus_allowed))
+ continue;
+ if (memqos->numa_score[j] > max_score) {
+ select_node = j;
+ max_score = memqos->numa_score[j];
+ }
+ find = 1;
+ }
+ if (!find)
+ continue;
+
+ cpumask_and(cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed);
+ cpumask_andnot(cpumask_med, cpumask_med, p->prefer_cpus);
+ if (cpumask_empty(cpumask_med))
+ continue;
+ cpumask_copy(tsk->select_cpus, cpumask_med);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ //do not care, this task may out
+ return;
+}
+
diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c
new file mode 100644
index 0000000000000..443ae03275605
--- /dev/null
+++ b/kernel/sched/memqos/phase_feature_sysctl.c
@@ -0,0 +1,126 @@
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/capability.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/sched/task.h>
+
+#include <linux/memqos.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+
+DEFINE_STATIC_KEY_FALSE(sched_phase);
+DEFINE_STATIC_KEY_FALSE(sched_phase_printk);
+
+static int set_phase_state(bool enabled)
+{
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (enabled == state) {
+ pr_warn("phase is already %s\n", state ? "enabled" : "disabled");
+ return 0;
+ }
+
+ if (enabled) {
+ err = phase_perf_create();
+ if (err) {
+ pr_err("phase enable failed\n");
+ return err;
+ }
+ static_branch_enable(&sched_phase);
+ pr_info("phase enabled\n");
+ } else {
+ static_branch_disable(&sched_phase);
+ phase_perf_release();
+ pr_info("phase disabled\n");
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state(state);
+
+ return err;
+}
+
+static int set_phase_state_printk(bool enabled)
+{
+ if (enabled) {
+ static_branch_enable(&sched_phase_printk);
+ } else {
+ static_branch_disable(&sched_phase_printk);
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state_printk(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase_printk);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state_printk(state);
+
+ return err;
+}
+
+
+static int __maybe_unused zero;
+static int __maybe_unused one = 1;
+
+struct ctl_table phase_table[] = {
+ {
+ .procname = "enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "trace_enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state_printk,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+#endif /* CONFIG_PROC_SYSCTL */
diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c
new file mode 100644
index 0000000000000..d83909c8eca45
--- /dev/null
+++ b/kernel/sched/memqos/phase_memband.c
@@ -0,0 +1,145 @@
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/memqos.h>
+
+#include <asm/cpu.h>
+#include <asm/cputype.h>
+#include <asm/cpufeature.h>
+#include <asm/mpam_sched.h>
+
+static const int nr_partid = 15;
+static const int nr_monitor = 4;
+
+static LIST_HEAD(phase_mpam_waitqueue);
+
+//mpam_profile_res[0] not used
+struct memqos_mpam_profile mpam_profile_res[16] = {
+ { .partid = 0, .monitor = 0, .used = 1},
+ { .partid = 1, .monitor = 0,},
+ { .partid = 2, .monitor = 1,},
+ { .partid = 3, .monitor = 2,},
+ { .partid = 4, .monitor = 3,},
+ { .partid = 5, .monitor = 0,},
+ { .partid = 6, .monitor = 1,},
+ { .partid = 7, .monitor = 2,},
+ { .partid = 8, .monitor = 3,},
+ { .partid = 9, .monitor = 0,},
+ { .partid = 10, .monitor = 1,},
+ { .partid = 11, .monitor = 2,},
+ { .partid = 12, .monitor = 3,},
+ { .partid = 13, .monitor = 0,},
+ { .partid = 14, .monitor = 1,},
+ { .partid = 15, .monitor = 2,},
+};
+
+static DEFINE_SPINLOCK(phase_partid_lock);
+
+void phase_update_mpam_label(struct task_struct *tsk)
+{
+ int i = 0;
+ //unsigned long flag;
+
+ WARN_ON_ONCE(tsk->closid);
+
+ if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] &&
+ tsk->sched_memqos.mpam_profile.profile != NULL) {
+ tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ tsk->rmid = 0;
+ mpam_profile_res[tsk->closid].tsk = tsk;
+ __mpam_sched_in_v2(tsk);
+ return;
+ }
+
+ spin_lock(&phase_partid_lock);
+ //is in profile queue, wait...
+ if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ spin_unlock(&phase_partid_lock);
+ return;
+ }
+
+ for (i = 1; i < 16; i++) {
+ if (mpam_profile_res[i].used) {
+ continue;
+ }
+
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ break;
+ }
+
+ if (i == 16) {
+ list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue);
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0];
+ spin_unlock(&phase_partid_lock);
+ //wait...
+ return;
+ }
+
+ mpam_profile_res[i].used = 1;
+ spin_unlock(&phase_partid_lock);
+
+ tsk->closid = mpam_profile_res[i].partid;
+ mpam_profile_res[i].tsk = tsk;
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i];
+ tsk->rmid = 0;
+ __mpam_sched_in_v2(tsk);
+}
+
+static void phase_release_mpam_label_without_lock(struct task_struct *tsk)
+{
+ int closid;
+ struct memqos_wait_profile *next;
+
+ //assert locked
+
+ if (tsk->closid == 0)
+ return;
+
+ closid = tsk->closid;
+ tsk->closid = 0;
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ mpam_profile_res[closid].used = 0;
+ mpam_profile_res[closid].tsk = NULL;
+
+ next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list);
+ if (next) {
+ list_del_init(&next->wait_list);
+ next->profile = &mpam_profile_res[closid];
+ mpam_profile_res[closid].used = 1;
+ }
+
+ return;
+}
+
+//task shutdown
+void phase_destroy_waitqueue(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+
+ //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ list_del_init(&tsk->sched_memqos.mpam_profile.wait_list);
+ //} else {
+ phase_release_mpam_label_without_lock(tsk);
+ //}
+ spin_unlock(&phase_partid_lock);
+}
+
+void phase_release_mpam_label(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+ phase_release_mpam_label_without_lock(tsk);
+ spin_unlock(&phase_partid_lock);
+}
+
+#include <asm/mpam.h>
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr)
+{
+ if (pm == &mpam_profile_res[0] || pm == NULL) {
+ result[0] = 0;
+ result[1] = 0;
+ result[2] = 0;
+ result[3] = 0;
+ return;
+ }
+
+ mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr);
+}
diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c
new file mode 100644
index 0000000000000..9b450a20e808f
--- /dev/null
+++ b/kernel/sched/memqos/phase_perf.c
@@ -0,0 +1,409 @@
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/memqos.h>
+#include <linux/sched.h>
+
+#define PHASE_FEVENT_NUM 3
+
+int *phase_perf_pevents = NULL;
+
+static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events);
+
+/******************************************
+ * Helpers for phase perf event
+ *****************************************/
+static inline struct perf_event *perf_event_of_cpu(int cpu, int index)
+{
+ return per_cpu(cpu_phase_perf_events, cpu)[index];
+}
+
+static inline struct perf_event **perf_events_of_cpu(int cpu)
+{
+ return per_cpu(cpu_phase_perf_events, cpu);
+}
+
+static inline u64 perf_event_local_pmu_read(struct perf_event *event)
+{
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ return local64_read(&event->count);
+}
+
+/******************************************
+ * Helpers for cpu counters
+ *****************************************/
+static inline u64 read_cpu_counter(int cpu, int index)
+{
+ struct perf_event *event = perf_event_of_cpu(cpu, index);
+
+ if (!event || !event->pmu)
+ return 0;
+
+ return perf_event_local_pmu_read(event);
+}
+
+static struct perf_event_attr *alloc_attr(int event_id)
+{
+ struct perf_event_attr *attr;
+
+ attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL);
+ if (!attr)
+ return ERR_PTR(-ENOMEM);
+
+ attr->type = PERF_TYPE_RAW;
+ attr->config = event_id;
+ attr->size = sizeof(struct perf_event_attr);
+ attr->pinned = 1;
+ attr->disabled = 1;
+ //attr->exclude_hv;
+ //attr->exclude_idle;
+ //attr->exclude_kernel;
+
+ return attr;
+}
+
+static int create_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event_attr *attr = NULL;
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ attr = alloc_attr(event_id);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld",
+ cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event));
+ kfree(attr);
+ return PTR_ERR(event);
+ } else {
+ events[index] = event;
+ perf_event_enable(events[index]);
+ if (event->hw.idx == -1) {
+ pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)",
+ cpu, attr->type, attr->config);
+ kfree(attr);
+ return -EINVAL;
+ }
+ pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config, event);
+ }
+
+ kfree(attr);
+ return 0;
+}
+
+static int release_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ event = events[index];
+
+ if (!event)
+ return 0;
+
+ pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config);
+
+ perf_event_release_kernel(event);
+ events[index] = NULL;
+
+ return 0;
+}
+
+enum {
+ CYCLES_INDEX = 0,
+ INST_RETIRED_INDEX,
+ PHASE_EVENT_FINAL_TERMINATOR
+};
+
+#define CYCLES 0x0011
+#define INST_RETIRED 0x0008
+
+static int pevents[PHASE_PEVENT_NUM] = {
+ CYCLES,
+ INST_RETIRED,
+ PHASE_EVENT_FINAL_TERMINATOR,
+};
+
+#define for_each_phase_pevents(index, events) \
+ for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \
+ events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++)
+
+
+/******************************************
+ * Helpers for phase perf
+ *****************************************/
+static int do_pevents(int (*fn)(int, int, int), int cpu)
+{
+ int index;
+ int err;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ err = fn(cpu, phase_perf_pevents[index], index);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __phase_perf_create(void *args)
+{
+ int err;
+ int cpu = raw_smp_processor_id();
+
+ /* create pinned events */
+ pr_info("create pinned events\n");
+ err = do_pevents(create_cpu_counter, cpu);
+ if (err) {
+ pr_err("create pinned events failed\n");
+ do_pevents(release_cpu_counter, cpu);
+ return err;
+ }
+
+ pr_info("[%d] phase class event create success\n", cpu);
+ return 0;
+}
+
+static int do_phase_perf_create(int *pevents, const struct cpumask *cpus)
+{
+ phase_perf_pevents = pevents;
+ return stop_machine(__phase_perf_create, NULL, cpus);
+}
+
+static int __do_phase_perf_release(void *args)
+{
+ int cpu = raw_smp_processor_id();
+
+ /* release pinned events */
+ pr_info("release pinned events\n");
+ do_pevents(release_cpu_counter, cpu);
+
+ pr_info("[%d] phase class event release success\n", cpu);
+ return 0;
+}
+
+static void do_phase_perf_release(const struct cpumask *cpus)
+{
+ stop_machine(__do_phase_perf_release, NULL, cpus);
+ phase_perf_pevents = NULL;
+}
+
+int phase_perf_create(void)
+{
+ return do_phase_perf_create(pevents, cpu_possible_mask);
+}
+
+void phase_perf_release(void)
+{
+ do_phase_perf_release(cpu_possible_mask);
+}
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+#define PHASE_EVENT_OVERFLOW (~0ULL)
+
+static inline u64 phase_event_count_sub(u64 curr, u64 prev)
+{
+ if (curr < prev) { /* overflow */
+ u64 tmp = PHASE_EVENT_OVERFLOW - prev;
+ return curr + tmp;
+ } else {
+ return curr - prev;
+ }
+}
+
+static inline void phase_calc_delta(struct task_struct *p,
+ struct phase_event_count *prev,
+ struct phase_event_count *curr,
+ struct phase_event_count *delta)
+{
+ int *pevents = phase_perf_pevents;
+ int index;
+
+ for_each_phase_pevents(index, pevents) {
+ delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]);
+ }
+}
+
+static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id)
+{
+ int index;
+ int *events = phase_perf_pevents;
+
+ for_each_phase_pevents(index, events) {
+ if (event_id == events[index])
+ return counter->data[index];
+ }
+
+ return 0;
+}
+
+static int cal_ring_history_average(int *history, int nr, int s_pos, int c_nr)
+{
+ int average = 0;
+ int start = ((s_pos - c_nr) + nr) % nr;
+
+ if (start < 0)
+ return 0;
+
+ for (; start != s_pos; start = (start + 1) % nr) {
+ if (history[start] == 0) {
+ c_nr--;
+ if (c_nr == 0)
+ return 0;
+ continue;
+ }
+ average += history[start];
+ }
+
+ return average / c_nr;
+}
+
+static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta)
+{
+ u64 ins;
+ u64 cycles;
+ //invalid zero
+ int ipcx10 = 0;
+
+ ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED);
+ cycles = phase_data_of_pevent(&delta->pcount, CYCLES);
+
+ if (cycles)
+ ipcx10 = (ins * 10) / cycles;
+
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ trace_printk("ins:%lld cycles:%lld\n", ins, cycles);
+ }
+
+ p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10;
+ p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10;
+ cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5);
+}
+
+static void __phase_cal_memband_div_10(struct task_struct *p)
+{
+ int pos;
+ int result[4];
+
+ pos = p->sched_memqos.memband_ringpos;
+
+ phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4);
+
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile);
+ }
+
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos];
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10;
+ p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10;
+
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos];
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10;
+ p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10;
+
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos];
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10;
+ p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10;
+
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos];
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10;
+ p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10;
+
+ p->sched_memqos.memband_ringpos = (pos + 1) % 10;
+
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5);
+}
+
+static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count);
+static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count);
+
+static void phase_perf_read_events(int cpu, u64 *pdata)
+{
+ int index;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ pdata[index] = read_cpu_counter(cpu, index);
+ }
+}
+
+static inline struct phase_event_count *phase_read_prev(unsigned int cpu)
+{
+ return &per_cpu(prev_phase_event_count, cpu);
+}
+
+static inline struct phase_event_count *phase_read_curr(unsigned int cpu)
+{
+ struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu);
+
+ phase_perf_read_events(cpu, curr->pcount.data);
+
+ return curr;
+}
+
+void phase_account_task(struct task_struct *p, int cpu)
+{
+ struct phase_event_count delta;
+ struct phase_event_count *prev, *curr;
+
+ if (!static_branch_likely(&sched_phase))
+ return;
+
+ //if (!sched_core_enabled(cpu_rq(cpu)))
+ // return;
+
+ /* update phase_event_count */
+ prev = phase_read_prev(cpu);
+ curr = phase_read_curr(cpu);
+ phase_calc_delta(p, prev, curr, &delta);
+ *prev = *curr;
+
+ /* calculate phase */
+ __phase_cal_ipcx10(p, &delta);
+ __phase_cal_memband_div_10(p);
+ p->sched_memqos.sample_times++;
+ if ((p->sched_memqos.sample_times % 3) == 0)
+ p->sched_memqos.account_ready = 1;
+}
+
+
+void phase_trace_printk(struct task_struct *p)
+{
+ if (!static_branch_unlikely(&sched_phase_printk))
+ return;
+
+ trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n",
+ p->comm, p->pid, p->sched_memqos.ipc_ringpos,\
+ p->sched_memqos.ipcx10, \
+ p->sched_memqos.memband_ringpos,\
+ p->sched_memqos.memband_div_10, \
+ p->sched_memqos.numa_score[0], \
+ p->sched_memqos.numa_score[1], \
+ p->sched_memqos.numa_score[2], \
+ p->sched_memqos.numa_score[3], \
+ p->sched_memqos.turbo, \
+ p->sched_memqos.preferred_nid, \
+ p->sched_memqos.class_id, \
+ p->closid);
+}
diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c
new file mode 100644
index 0000000000000..b80bb6b9ae0a3
--- /dev/null
+++ b/kernel/sched/memqos/phase_sim_knn.c
@@ -0,0 +1,92 @@
+#include <linux/types.h>
+
+#define DATA_ROW 20
+void QuickSort(u64 arr[DATA_ROW][2], int L, int R) {
+ int i = L;
+ int j = R;
+ int kk = (L + R) / 2;
+ u64 pivot = arr[kk][0];
+
+ while (i <= j) {
+ while (pivot > arr[i][0]) {
+ i++;
+ }
+ while (pivot < arr[j][0]) {
+ j--;
+ }
+ if (i <= j) {
+ u64 temp;
+
+ /* swap both the distance and its tag so they stay paired */
+ temp = arr[i][0];
+ arr[i][0] = arr[j][0];
+ arr[j][0] = temp;
+ temp = arr[i][1];
+ arr[i][1] = arr[j][1];
+ arr[j][1] = temp;
+ i++; j--;
+ }
+ }
+ if (L < j) {
+ QuickSort(arr, L, j);
+ }
+ if (i < R) {
+ QuickSort(arr, i, R);
+ }
+}
+
+u64 euclidean_distance(u64 *row1, u64 *row2, int col) {
+ u64 distance = 0;
+ int i;
+
+ for (i = 0; i < col - 1; i++) {
+ distance += ((row1[i] - row2[i]) * (row1[i] - row2[i]));
+ }
+ return distance;
+}
+
+#define num_neighbors 6
+#define MAX_TAG 8
+
+int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) {
+ int i;
+ u64 neighbors[MAX_TAG] = {0};
+ int max_tag = 0;
+ u64 distances[DATA_ROW][2];
+
+ for (i = 0; i < train_row; i++) {
+ distances[i][0] = euclidean_distance(train_data[i], test_row, col);
+ distances[i][1] = train_data[i][col - 1];
+ }
+ QuickSort(distances, 0, train_row - 1);
+ for (i = 0; i < num_neighbors; i++) {
+ neighbors[distances[i][1]]++;
+ if (neighbors[distances[i][1]] > neighbors[max_tag])
+ max_tag = distances[i][1];
+ }
+ return max_tag;
+}
+
+static u64 train_data[DATA_ROW][3] = {
+ {0, 1, 0},
+ {0, 9, 0},
+ {0, 20, 1},
+ {0, 30, 1},
+ {0, 40, 2},
+ {0, 50, 3},
+ {0, 60, 3},
+ {0, 70, 3},
+ {0, 80, 4},
+ {0, 90, 4},
+ {0, 100, 4},
+ {0, 110, 5},
+ {0, 120, 5},
+ {0, 130, 6},
+ {0, 140, 6},
+ {0, 150, 7},
+};
+
+int knn_get_tag(int ipcx10, int memband_div_10)
+{
+ u64 test_data[2];
+
+ test_data[0] = ipcx10;
+ test_data[1] = memband_div_10;
+
+ return get_neighbors_tag(train_data, DATA_ROW, 3, test_data);
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 685f9881b8e23..0d2764c4449ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ {
+ .procname = "phase",
+ .mode = 0555,
+ .child = phase_table,
+ },
+#endif
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cac46d56f387..d748c291e7047 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- int preferred_nid;
+ int preferred_nid = -1;
nodemask_t *nmask;
+ if (current->sched_memqos.preferred_nid)
+ preferred_nid = current->sched_memqos.preferred_nid - 1;
+
pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
@@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
nmask = policy_nodemask(gfp, pol);
- preferred_nid = policy_node(gfp, pol, node);
+ if (preferred_nid == -1)
+ preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mark_vma_cdm(nmask, page, vma);
mpol_cond_put(pol);
--
2.25.1

Send 16 patches to test patchwork->PR function
22 Mar '23
Baisong Zhong (1):
media: dvb-usb: az6027: fix null-ptr-deref in az6027_i2c_xfer()
Chen Zhongjin (1):
ftrace: Fix invalid address access in lookup_rec() when index is 0
Darrick J. Wong (1):
ext4: fix another off-by-one fsmap error on 1k block filesystems
David Hildenbrand (2):
mm: optimize do_wp_page() for exclusive pages in the swapcache
mm: optimize do_wp_page() for fresh pages in local LRU pagevecs
Kuniyuki Iwashima (1):
seccomp: Move copy_seccomp() to no failure path.
Li Huafei (2):
livepatch: Cleanup klp_mem_prepare()
livepatch: Narrow the scope of the 'text_mutex' lock
Luke D. Jones (1):
HID: asus: Remove check for same LED brightness on set
Nicholas Piggin (1):
mm/vmalloc: huge vmalloc backing pages should be split rather than
compound
Pietro Borrello (2):
HID: asus: use spinlock to protect concurrent accesses
HID: asus: use spinlock to safely schedule workers
Xin Long (2):
tipc: set con sock in tipc_conn_alloc
tipc: add an extra conn_get in tipc_conn_alloc
Zheng Yejian (1):
livepatch/core: Fix hungtask against cpu hotplug on x86
Zhihao Cheng (1):
jbd2: fix data missing when reusing bh which is ready to be
checkpointed
arch/x86/kernel/livepatch.c | 11 +++++--
drivers/hid/hid-asus.c | 38 ++++++++++++++++++-----
drivers/media/usb/dvb-usb/az6027.c | 4 +++
fs/ext4/fsmap.c | 2 ++
fs/jbd2/transaction.c | 50 +++++++++++++++++-------------
kernel/fork.c | 17 ++++++----
kernel/livepatch/core.c | 49 ++++++++++++++++++++---------
kernel/trace/ftrace.c | 3 +-
mm/memory.c | 28 +++++++++++++----
mm/vmalloc.c | 22 ++++++++++---
net/tipc/topsrv.c | 20 ++++++------
11 files changed, 172 insertions(+), 72 deletions(-)
--
2.25.1

[PATCH openEuler-5.10-LTS-SP1 01/16] jbd2: fix data missing when reusing bh which is ready to be checkpointed
by Jialin Zhang 22 Mar '23
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
mainline inclusion
from mainline-v6.3-rc1
commit e6b9bd7290d334451ce054e98e752abc055e0034
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
The following sequence of events can lose data and lead to a corrupted
filesystem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh and prepare it for write while checkpointing is running:
PA                                        PB
do_get_write_access                       jbd2_log_do_checkpoint
spin_lock(&jh->b_state_lock)
if (buffer_dirty(bh))
  clear_buffer_dirty(bh) // clear buffer dirty
  set_buffer_jbddirty(bh)
                                          transaction =
                                              journal->j_checkpoint_transactions
                                          jh = transaction->t_checkpoint_list
                                          if (!buffer_dirty(bh))
                                            __jbd2_journal_remove_checkpoint(jh)
                                            // bh won't be flushed
                                          jbd2_cleanup_journal_tail
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. The journal is aborted or power is cut before the latest bh is written to the journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of the buffer_dirty bit to just before the call
to __jbd2_journal_file_buffer(); both the bit clearing and the jh->b_transaction
assignment are then done with journal->j_list_lock held, so that
jbd2_log_do_checkpoint() will wait until jh's new transaction has finished
even though bh is not currently dirty. journal_shrink_one_cp_list() also won't
remove jh from the checkpoint list if the buffer head is reused in
do_get_write_access().
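As a rough illustration of why doing both updates under journal->j_list_lock
closes the window, here is a toy userspace model (a sketch only, with made-up
names; it is not jbd2 code): a writer clears a dirty flag and attaches the
buffer to a transaction inside one critical section, and a checkpointer only
drops buffers it observes as clean and unattached, so the intermediate
"clean but unattached" state can never be seen.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static bool dirty = true;
static bool on_transaction;
static bool dropped;

/* Models do_get_write_access(): clear the dirty bit and file the buffer
 * on the transaction as one critical section under list_lock. */
static void *writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&list_lock);
	dirty = false;
	on_transaction = true;
	pthread_mutex_unlock(&list_lock);
	return NULL;
}

/* Models jbd2_log_do_checkpoint(): only drops a buffer it observes as
 * clean and not attached to any transaction. */
static void *checkpointer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&list_lock);
	if (!dirty && !on_transaction)
		dropped = true; /* the lost-data window the fix removes */
	pthread_mutex_unlock(&list_lock);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, writer, NULL);
	pthread_create(&b, NULL, checkpointer, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	printf("buffer dropped while still needed: %s\n",
	       dropped ? "yes" : "no");
	return 0;
}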
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230110015327.1181863-1-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cefee2dead54..8fa88c42fcb4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -984,36 +984,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -1023,8 +1015,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -1048,10 +1042,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
[PATCH openEuler-5.10-LTS-SP1 01/14] jbd2: fix data missing when reusing bh which is ready to be checkpointed
by Jialin Zhang 22 Mar '23
22 Mar '23
From: Zhihao Cheng <chengzhihao1(a)huawei.com>
mainline inclusion
from mainline-v6.3-rc1
commit e6b9bd7290d334451ce054e98e752abc055e0034
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6C5HV
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
The following process will lose data and can lead to a corrupted
filesystem:
1. jh(bh) is inserted into T1->t_checkpoint_list, bh is dirty, and
jh->b_transaction = NULL
2. T1 is added into journal->j_checkpoint_transactions.
3. Get bh prepared for writing while doing checkpointing:
     PA                                    PB
   do_get_write_access                   jbd2_log_do_checkpoint
    spin_lock(&jh->b_state_lock)
    if (buffer_dirty(bh))
     clear_buffer_dirty(bh)  // clear buffer dirty
     set_buffer_jbddirty(bh)
                                         transaction =
                                           journal->j_checkpoint_transactions
                                         jh = transaction->t_checkpoint_list
                                         if (!buffer_dirty(bh))
                                          __jbd2_journal_remove_checkpoint(jh)
                                          // bh won't be flushed
                                         jbd2_cleanup_journal_tail
    __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved)
4. Aborting journal/Power-cut before writing latest bh on journal area.
In this way we get a corrupted filesystem with bh's data lost.
Fix it by moving the clearing of the buffer_dirty bit to just before the call
to __jbd2_journal_file_buffer(), so that both the bit clearing and the
jh->b_transaction assignment happen under journal->j_list_lock. This way
jbd2_log_do_checkpoint() will wait until jh's new transaction is finished
even if bh is currently not dirty, and journal_shrink_one_cp_list() won't
remove jh from the checkpoint list if the buffer head is reused in
do_get_write_access().
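Restated as a minimal sketch (simplified from the hunk below; the
jh->b_state_lock handling and error paths are omitted), the ordering the fix
establishes in do_get_write_access() looks like this:
  spin_lock(&journal->j_list_lock);
  if (test_clear_buffer_dirty(bh))
          set_buffer_jbddirty(bh);        /* hand the dirty state to the journal */
  __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);  /* sets jh->b_transaction */
  spin_unlock(&journal->j_list_lock);
  unlock_buffer(bh);
Because jbd2_log_do_checkpoint() and journal_shrink_one_cp_list() also scan
the checkpoint list under journal->j_list_lock, they now either still see the
buffer dirty or already see jh->b_transaction set; the intermediate "clean but
not yet filed" state described above can no longer be observed.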
Fetch a reproducer in [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216898
Cc: <stable(a)kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Signed-off-by: zhanchengbin <zhanchengbin1(a)huawei.com>
Suggested-by: Jan Kara <jack(a)suse.cz>
Reviewed-by: Jan Kara <jack(a)suse.cz>
Link: https://lore.kernel.org/r/20230110015327.1181863-1-chengzhihao1@huawei.com
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Reviewed-by: Yang Erkun <yangerkun(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
fs/jbd2/transaction.c | 50 +++++++++++++++++++++++++------------------
1 file changed, 29 insertions(+), 21 deletions(-)
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cefee2dead54..8fa88c42fcb4 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -984,36 +984,28 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* ie. locked but not dirty) or tune2fs (which may actually have
* the buffer dirtied, ugh.) */
- if (buffer_dirty(bh)) {
+ if (buffer_dirty(bh) && jh->b_transaction) {
+ warn_dirty_buffer(bh);
/*
- * First question: is this buffer already part of the current
- * transaction or the existing committing transaction?
- */
- if (jh->b_transaction) {
- J_ASSERT_JH(jh,
- jh->b_transaction == transaction ||
- jh->b_transaction ==
- journal->j_committing_transaction);
- if (jh->b_next_transaction)
- J_ASSERT_JH(jh, jh->b_next_transaction ==
- transaction);
- warn_dirty_buffer(bh);
- }
- /*
- * In any case we need to clean the dirty flag and we must
- * do it under the buffer lock to be sure we don't race
- * with running write-out.
+ * We need to clean the dirty flag and we must do it under the
+ * buffer lock to be sure we don't race with running write-out.
*/
JBUFFER_TRACE(jh, "Journalling dirty buffer");
clear_buffer_dirty(bh);
+ /*
+ * The buffer is going to be added to BJ_Reserved list now and
+ * nothing guarantees jbd2_journal_dirty_metadata() will be
+ * ever called for it. So we need to set jbddirty bit here to
+ * make sure the buffer is dirtied and written out when the
+ * journaling machinery is done with it.
+ */
set_buffer_jbddirty(bh);
}
- unlock_buffer(bh);
-
error = -EROFS;
if (is_handle_aborted(handle)) {
spin_unlock(&jh->b_state_lock);
+ unlock_buffer(bh);
goto out;
}
error = 0;
@@ -1023,8 +1015,10 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
* b_next_transaction points to it
*/
if (jh->b_transaction == transaction ||
- jh->b_next_transaction == transaction)
+ jh->b_next_transaction == transaction) {
+ unlock_buffer(bh);
goto done;
+ }
/*
* this is the first time this transaction is touching this buffer,
@@ -1048,10 +1042,24 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
*/
smp_wmb();
spin_lock(&journal->j_list_lock);
+ if (test_clear_buffer_dirty(bh)) {
+ /*
+ * Execute buffer dirty clearing and jh->b_transaction
+ * assignment under journal->j_list_lock locked to
+ * prevent bh being removed from checkpoint list if
+ * the buffer is in an intermediate state (not dirty
+ * and jh->b_transaction is NULL).
+ */
+ JBUFFER_TRACE(jh, "Journalling dirty buffer");
+ set_buffer_jbddirty(bh);
+ }
__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
spin_unlock(&journal->j_list_lock);
+ unlock_buffer(bh);
goto done;
}
+ unlock_buffer(bh);
+
/*
* If there is already a copy-out version of this buffer, then we don't
* need to make another one
--
2.25.1
[PATCH openEuler-1.0-LTS] ext4: fix another off-by-one fsmap error on 1k block filesystems
by Yongqiang Liu 22 Mar '23
by Yongqiang Liu 22 Mar '23
22 Mar '23
From: "Darrick J. Wong" <djwong(a)kernel.org>
mainline inclusion
from mainline-v6.3-rc2
commit c993799baf9c5861f8df91beb80e1611b12efcbd
category: bugfix
bugzilla: 188522,https://gitee.com/openeuler/kernel/issues/I6N7ZP
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Apparently syzbot figured out that issuing this FSMAP call:
struct fsmap_head cmd = {
        .fmh_count = ...;
        .fmh_keys = {
                { .fmr_device = /* ext4 dev */, .fmr_physical = 0, },
                { .fmr_device = /* ext4 dev */, .fmr_physical = 0, },
        },
        ...
};
ret = ioctl(fd, FS_IOC_GETFSMAP, &cmd);
Produces this crash if the underlying filesystem is a 1k-block ext4
filesystem:
kernel BUG at fs/ext4/ext4.h:3331!
invalid opcode: 0000 [#1] PREEMPT SMP
CPU: 3 PID: 3227965 Comm: xfs_io Tainted: G W O 6.2.0-rc8-achx
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1 04/01/2014
RIP: 0010:ext4_mb_load_buddy_gfp+0x47c/0x570 [ext4]
RSP: 0018:ffffc90007c03998 EFLAGS: 00010246
RAX: ffff888004978000 RBX: ffffc90007c03a20 RCX: ffff888041618000
RDX: 0000000000000000 RSI: 00000000000005a4 RDI: ffffffffa0c99b11
RBP: ffff888012330000 R08: ffffffffa0c2b7d0 R09: 0000000000000400
R10: ffffc90007c03950 R11: 0000000000000000 R12: 0000000000000001
R13: 00000000ffffffff R14: 0000000000000c40 R15: ffff88802678c398
FS: 00007fdf2020c880(0000) GS:ffff88807e100000(0000) knlGS:0000000000000000
CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007ffd318a5fe8 CR3: 000000007f80f001 CR4: 00000000001706e0
Call Trace:
<TASK>
ext4_mballoc_query_range+0x4b/0x210 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_getfsmap_datadev+0x713/0x890 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_getfsmap+0x2b7/0x330 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
ext4_ioc_getfsmap+0x153/0x2b0 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
__ext4_ioctl+0x2a7/0x17e0 [ext4 dfa189daddffe8fecd3cdfd00564e0f265a8ab80]
__x64_sys_ioctl+0x82/0xa0
do_syscall_64+0x2b/0x80
entry_SYSCALL_64_after_hwframe+0x46/0xb0
RIP: 0033:0x7fdf20558aff
RSP: 002b:00007ffd318a9e30 EFLAGS: 00000246 ORIG_RAX: 0000000000000010
RAX: ffffffffffffffda RBX: 00000000000200c0 RCX: 00007fdf20558aff
RDX: 00007fdf1feb2010 RSI: 00000000c0c0583b RDI: 0000000000000003
RBP: 00005625c0634be0 R08: 00005625c0634c40 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000246 R12: 00007fdf1feb2010
R13: 00005625be70d994 R14: 0000000000000800 R15: 0000000000000000
For GETFSMAP calls, the caller selects a physical block device by
writing its block number into fsmap_head.fmh_keys[01].fmr_device.
To query mappings for a subrange of the device, the starting byte of the
range is written to fsmap_head.fmh_keys[0].fmr_physical and the last
byte of the range goes in fsmap_head.fmh_keys[1].fmr_physical.
IOWs, to query what mappings overlap with bytes 3-14 of /dev/sda, you'd
set the inputs as follows:
fmh_keys[0] = { .fmr_device = major(8, 0), .fmr_physical = 3},
fmh_keys[1] = { .fmr_device = major(8, 0), .fmr_physical = 14},
Which would return you whatever is mapped in the 12 bytes starting at
physical offset 3.
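For illustration only, a minimal userspace sketch of that bytes 3-14 query
(this is not part of the patch; it assumes <linux/fsmap.h> is available and
that the device's dev_t uses the simple small major/minor encoding, so st_dev
can be stored directly in the 32-bit fmr_device field):
  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/ioctl.h>
  #include <sys/stat.h>
  #include <linux/fsmap.h>

  int main(int argc, char **argv)
  {
          unsigned int nrecs = 64;
          struct fsmap_head *head;
          struct stat st;
          int fd;

          fd = open(argc > 1 ? argv[1] : "/", O_RDONLY);   /* any path on the fs */
          if (fd < 0 || fstat(fd, &st))
                  return 1;

          head = calloc(1, sizeof(*head) + nrecs * sizeof(struct fsmap));
          if (!head)
                  return 1;

          head->fmh_count = nrecs;                         /* room for returned records */
          head->fmh_keys[0].fmr_device = (__u32)st.st_dev; /* low key: device, byte 3 */
          head->fmh_keys[0].fmr_physical = 3;
          head->fmh_keys[1].fmr_device = (__u32)st.st_dev; /* high key: device, byte 14 */
          head->fmh_keys[1].fmr_physical = 14;
          /* remaining high-key fields left at zero for brevity */

          if (ioctl(fd, FS_IOC_GETFSMAP, head) == 0)
                  printf("%u mapping records returned\n", head->fmh_entries);
          else
                  perror("FS_IOC_GETFSMAP");

          free(head);
          return 0;
  }
With both fmr_physical keys set to 0, as in the reproducer at the top of this
message, the same call trips the BUG_ON on an unpatched 1k-block filesystem;
with the fix below it simply returns no records for the degenerate range.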
The crash is due to insufficient range validation of keys[1] in
ext4_getfsmap_datadev. On 1k-block filesystems, block 0 is not part of
the filesystem, which means that s_first_data_block is nonzero.
ext4_get_group_no_and_offset subtracts this quantity from the blocknr
argument before cracking it into a group number and a block number
within a group. IOWs, block group 0 spans blocks 1-8192 (1-based)
instead of 0-8191 (0-based) like what happens with larger blocksizes.
The net result of this encoding is that blocknr < s_first_data_block is
not a valid input to this function. The end_fsb variable is set from
the keys that are copied from userspace, which means that in the above
example, its value is zero. That leads to an underflow here:
blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
The division then operates on -1:
	offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >>
			EXT4_SB(sb)->s_cluster_bits;
Leaving an impossibly large group number (2^32-1) in blocknr.
ext4_getfsmap_check_keys checked that keys[0].fmr_physical and
keys[1].fmr_physical are in increasing order, but
ext4_getfsmap_datadev adjusts keys[0].fmr_physical to be at least
s_first_data_block. This implies that we have to check it again after
the adjustment, which is the piece that I forgot.
Reported-by: syzbot+6be2b977c89f79b6b153(a)syzkaller.appspotmail.com
Fixes: 4a4956249dac ("ext4: fix off-by-one fsmap error on 1k block filesystems")
Link: https://syzkaller.appspot.com/bug?id=79d5768e9bfe362911ac1a5057a36fc6b5c300…
Cc: stable(a)vger.kernel.org
Signed-off-by: Darrick J. Wong <djwong(a)kernel.org>
Link: https://lore.kernel.org/r/Y+58NPTH7VNGgzdd@magnolia
Signed-off-by: Theodore Ts'o <tytso(a)mit.edu>
Signed-off-by: Baokun Li <libaokun1(a)huawei.com>
Reviewed-by: Zhihao Cheng <chengzhihao1(a)huawei.com>
Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
fs/ext4/fsmap.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 6f3f245f3a80..6b52ace1463c 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -486,6 +486,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
+ if (keys[1].fmr_physical < keys[0].fmr_physical)
+ return 0;
start_fsb = keys[0].fmr_physical;
end_fsb = keys[1].fmr_physical;
--
2.25.1
Hello!
The Kernel SIG invites you to join a Zoom meeting (auto-recorded) to be held at 2023-03-24 14:00.
Meeting subject: openEuler Kernel SIG biweekly meeting
Agenda:
1. Progress update
2. Call for topics
Everyone is welcome to propose topics (new topics can be submitted by replying to this email or added to the meeting board).
Meeting link: https://us06web.zoom.us/j/85131900036?pwd=RFZVYlZFK3B1RVhpTHROOW82OTdLQT09
Meeting minutes: https://etherpad.openeuler.org/p/Kernel-meetings
Friendly reminder: please update your participant name after joining the meeting; you may also use your gitee.com ID.
More information: https://openeuler.org/zh/
Hello!
The openEuler Kernel SIG invites you to attend the Zoom conference (auto recording) to be held at 2023-03-24 14:00.
The subject of the conference is the openEuler Kernel SIG biweekly meeting.
Summary:
1. Progress update
2. Call for topics
Everyone is welcome to propose topics (new topics can be submitted by replying to this email or added to the meeting board).
You can join the meeting at https://us06web.zoom.us/j/85131900036?pwd=RFZVYlZFK3B1RVhpTHROOW82OTdLQT09.
Add topics at https://etherpad.openeuler.org/p/Kernel-meetings.
Note: You are advised to change the participant name after joining the conference or use your ID at gitee.com.
More information: https://openeuler.org/en/
21 Mar '23
From: Xin Long <lucien.xin(a)gmail.com>
stable inclusion
from stable-v4.19.268
commit 2c9c64a95d97727c9ada0d35abc90ee5fdbaeff7
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6NCRH
CVE: CVE-2023-1382
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id…
--------------------------------
[ Upstream commit 0e5d56c64afcd6fd2d132ea972605b66f8a7d3c4 ]
A crash was reported by Wei Chen:
BUG: kernel NULL pointer dereference, address: 0000000000000018
RIP: 0010:tipc_conn_close+0x12/0x100
Call Trace:
tipc_topsrv_exit_net+0x139/0x320
ops_exit_list.isra.9+0x49/0x80
cleanup_net+0x31a/0x540
process_one_work+0x3fa/0x9f0
worker_thread+0x42/0x5c0
It was caused by !con->sock in tipc_conn_close(). In tipc_topsrv_accept(),
con is allocated in conn_idr then its sock is set:
con = tipc_conn_alloc();
... <----[1]
con->sock = newsock;
If tipc_conn_close() is called at any point during [1], the null-pointer
dereference is triggered through con->sock->sk because con->sock is not yet
set. This patch fixes it by moving the con->sock assignment into
tipc_conn_alloc(), under s->idr_lock, so that con->sock can never be NULL
when the con is obtained from s->conn_idr. It is also safer to move the
setting of con->server and the CF_CONNECTED flag under s->idr_lock, as they
should all be set before tipc_conn_alloc() returns.
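As a rough sketch (an assumed simplification, not the exact openEuler code),
the allocation path after this change initializes everything a concurrent
lookup may dereference before s->idr_lock is dropped, i.e. before the
connection becomes reachable through s->conn_idr:
  spin_lock_bh(&s->idr_lock);
  ret = idr_alloc(&s->conn_idr, con, 0, 0, GFP_ATOMIC);
  /* ... error handling omitted ... */
  con->conid = ret;
  s->idr_in_use++;
  set_bit(CF_CONNECTED, &con->flags);
  con->server = s;
  con->sock = sock;               /* previously assigned later, in tipc_topsrv_accept() */
  spin_unlock_bh(&s->idr_lock);
Since every lookup of a connection id also takes s->idr_lock,
tipc_conn_close() can no longer observe a connection whose con->sock is
still NULL.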
Fixes: c5fa7b3cf3cb ("tipc: introduce new TIPC server infrastructure")
Reported-by: Wei Chen <harperchen1110(a)gmail.com>
Signed-off-by: Xin Long <lucien.xin(a)gmail.com>
Acked-by: Jon Maloy <jmaloy(a)redhat.com>
Signed-off-by: Jakub Kicinski <kuba(a)kernel.org>
Signed-off-by: Sasha Levin <sashal(a)kernel.org>
conflict:
net/tipc/topsrv.c
Signed-off-by: Lu Wei <luwei32(a)huawei.com>
Reviewed-by: Liu Jian <liujian56(a)huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com>
---
net/tipc/topsrv.c | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c
index 1c4733153d74..89a1f127dfaf 100644
--- a/net/tipc/topsrv.c
+++ b/net/tipc/topsrv.c
@@ -184,7 +184,7 @@ static void tipc_conn_close(struct tipc_conn *con)
conn_put(con);
}
-static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
+static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s, struct socket *sock)
{
struct tipc_conn *con;
int ret;
@@ -210,10 +210,11 @@ static struct tipc_conn *tipc_conn_alloc(struct tipc_topsrv *s)
}
con->conid = ret;
s->idr_in_use++;
- spin_unlock_bh(&s->idr_lock);
set_bit(CF_CONNECTED, &con->flags);
con->server = s;
+ con->sock = sock;
+ spin_unlock_bh(&s->idr_lock);
return con;
}
@@ -467,7 +468,7 @@ static void tipc_topsrv_accept(struct work_struct *work)
ret = kernel_accept(lsock, &newsock, O_NONBLOCK);
if (ret < 0)
return;
- con = tipc_conn_alloc(srv);
+ con = tipc_conn_alloc(srv, newsock);
if (IS_ERR(con)) {
ret = PTR_ERR(con);
sock_release(newsock);
@@ -479,7 +480,6 @@ static void tipc_topsrv_accept(struct work_struct *work)
newsk->sk_data_ready = tipc_conn_data_ready;
newsk->sk_write_space = tipc_conn_write_space;
newsk->sk_user_data = con;
- con->sock = newsock;
write_unlock_bh(&newsk->sk_callback_lock);
/* Wake up receive process in case of 'SYN+' message */
@@ -577,12 +577,11 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower,
sub.filter = filter;
*(u32 *)&sub.usr_handle = port;
- con = tipc_conn_alloc(tipc_topsrv(net));
+ con = tipc_conn_alloc(tipc_topsrv(net), NULL);
if (IS_ERR(con))
return false;
*conid = con->conid;
- con->sock = NULL;
rc = tipc_conn_rcv_sub(tipc_topsrv(net), con, &sub);
if (rc >= 0)
return true;
--
2.25.1
[PATCH PR openEuler-22.03-LTS-SP1] mm/vmalloc: huge vmalloc backing pages should be split rather than compound
by Jialin Zhang 21 Mar '23
21 Mar '23
From: Nicholas Piggin <npiggin(a)gmail.com>
mainline inclusion
from mainline-v5.18-rc4
commit 3b8000ae185cb068adbda5f966a3835053c85fd4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6LD0S
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Huge vmalloc higher-order backing pages were allocated with __GFP_COMP
in order to allow the sub-pages to be refcounted by callers such as
"remap_vmalloc_page [sic]" (remap_vmalloc_range).
However a similar problem exists for other struct page fields callers
use, for example fb_deferred_io_fault() takes a vmalloc'ed page and
not only refcounts it but uses ->lru, ->mapping, ->index.
This is not compatible with compound sub-pages, and can cause bad page
state issues like
BUG: Bad page state in process swapper/0 pfn:00743
page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x743
flags: 0x7ffff000000000(node=0|zone=0|lastcpupid=0x7ffff)
raw: 007ffff000000000 c00c00000001d0c8 c00c00000001d0c8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: corrupted mapping in tail page
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc3-00082-gfc6fff4a7ce1-dirty #2810
Call Trace:
dump_stack_lvl+0x74/0xa8 (unreliable)
bad_page+0x12c/0x170
free_tail_pages_check+0xe8/0x190
free_pcp_prepare+0x31c/0x4e0
free_unref_page+0x40/0x1b0
__vunmap+0x1d8/0x420
...
The correct approach is to use split high-order pages for the huge
vmalloc backing. These allow callers to treat them in exactly the same
way as individually-allocated order-0 pages.
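A condensed sketch of what "split high-order pages" means here (simplified
from the hunks below; error handling omitted): the backing pages are
allocated without __GFP_COMP, split immediately, and later freed as plain
order-0 pages.
  /* allocation side: one high-order block, split into independent pages */
  page = alloc_pages_node(node, gfp_mask, page_order);
  if (page_order)
          split_page(page, page_order);   /* now 1 << page_order order-0 pages */
  for (p = 0; p < (1U << page_order); p++)
          area->pages[i + p] = page + p;

  /* free side: every entry is an ordinary order-0 page */
  for (i = 0; i < area->nr_pages; i++)
          __free_pages(area->pages[i], 0);
Each sub-page then carries its own refcount and may use ->lru, ->mapping and
->index, which is exactly what callers like fb_deferred_io_fault() expect.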
Link: https://lore.kernel.org/all/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg…
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Paul Menzel <pmenzel(a)molgen.mpg.de>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Rick Edgecombe <rick.p.edgecombe(a)intel.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
conflicts:
mm/vmalloc.c
Signed-off-by: ZhangPeng <zhangpeng362(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
mm/vmalloc.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e27cd716ca95..2ca2c1bc0db9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2641,14 +2641,17 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
- unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, page_order);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_pages(page, 0);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2930,8 +2933,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
int p;
- /* Compound pages required for remap_vmalloc_page */
- page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
+ page = alloc_pages_node(node, gfp_mask, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
@@ -2943,6 +2945,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
+ /*
+ * Higher order allocations must be able to be treated as
+ * indepdenent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (page_order)
+ split_page(page, page_order);
+
for (p = 0; p < (1U << page_order); p++)
area->pages[i + p] = page + p;
--
2.25.1
[PATCH openEuler-23.03] Fix CVE-2023-23005, CVE-2023-0597 and CVE-2022-4269
by Jialin Zhang 18 Mar '23
18 Mar '23
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
...NULL-vs-IS_ERR-checking-in-memory_ti.patch | 50 +++++
...-x86-mm-Randomize-per-cpu-entry-area.patch | 172 ++++++++++++++++++
...ap-shadow-for-percpu-pages-on-demand.patch | 126 +++++++++++++
...-physical-address-for-every-page-of-.patch | 50 +++++
...KASAN-shadow-for-entire-per-CPU-rang.patch | 121 ++++++++++++
...-local-CPU_ENTRY_AREA-variables-to-s.patch | 88 +++++++++
...lpers-to-align-shadow-addresses-up-a.patch | 113 ++++++++++++
...te-shadow-for-shared-chunk-of-the-CP.patch | 99 ++++++++++
...rred-better-wording-on-protection-ag.patch | 97 ++++++++++
...he-backlog-for-nested-calls-to-mirre.patch | 149 +++++++++++++++
kernel.spec | 27 ++-
11 files changed, 1090 insertions(+), 2 deletions(-)
create mode 100644 0015-mm-demotion-fix-NULL-vs-IS_ERR-checking-in-memory_ti.patch
create mode 100644 0016-x86-mm-Randomize-per-cpu-entry-area.patch
create mode 100644 0017-x86-kasan-Map-shadow-for-percpu-pages-on-demand.patch
create mode 100644 0018-x86-mm-Recompute-physical-address-for-every-page-of-.patch
create mode 100644 0019-x86-mm-Populate-KASAN-shadow-for-entire-per-CPU-rang.patch
create mode 100644 0020-x86-kasan-Rename-local-CPU_ENTRY_AREA-variables-to-s.patch
create mode 100644 0021-x86-kasan-Add-helpers-to-align-shadow-addresses-up-a.patch
create mode 100644 0022-x86-kasan-Populate-shadow-for-shared-chunk-of-the-CP.patch
create mode 100644 0023-net-sched-act_mirred-better-wording-on-protection-ag.patch
create mode 100644 0024-act_mirred-use-the-backlog-for-nested-calls-to-mirre.patch
diff --git a/0015-mm-demotion-fix-NULL-vs-IS_ERR-checking-in-memory_ti.patch b/0015-mm-demotion-fix-NULL-vs-IS_ERR-checking-in-memory_ti.patch
new file mode 100644
index 0000000..f598fc5
--- /dev/null
+++ b/0015-mm-demotion-fix-NULL-vs-IS_ERR-checking-in-memory_ti.patch
@@ -0,0 +1,50 @@
+From e11c121e73d4e98ed13259d6b19830f33ca60d76 Mon Sep 17 00:00:00 2001
+From: Miaoqian Lin <linmq006(a)gmail.com>
+Date: Fri, 17 Mar 2023 10:05:08 +0800
+Subject: [PATCH 15/24] mm/demotion: fix NULL vs IS_ERR checking in
+ memory_tier_init
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 4a625ceee8a0ab0273534cb6b432ce6b331db5ee
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6IXO8
+CVE: CVE-2023-23005
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+alloc_memory_type() returns error pointers on error instead of NULL. Use
+IS_ERR() to check the return value to fix this.
+
+Link: https://lkml.kernel.org/r/20221110030751.1627266-1-linmq006@gmail.com
+Fixes: 7b88bda3761b ("mm/demotion/dax/kmem: set node's abstract distance to MEMTIER_DEFAULT_DAX_ADISTANCE")
+Signed-off-by: Miaoqian Lin <linmq006(a)gmail.com>
+Reviewed-by: "Huang, Ying" <ying.huang(a)intel.com>
+Cc: Aneesh Kumar K.V <aneesh.kumar(a)linux.ibm.com>
+Cc: Wei Xu <weixugc(a)google.com>
+Signed-off-by: Andrew Morton <akpm(a)linux-foundation.org>
+Signed-off-by: Ma Wupeng <mawupeng1(a)huawei.com>
+Reviewed-by: tong tiangen <tongtiangen(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ mm/memory-tiers.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
+index ba863f46759d..96022973c9ba 100644
+--- a/mm/memory-tiers.c
++++ b/mm/memory-tiers.c
+@@ -645,7 +645,7 @@ static int __init memory_tier_init(void)
+ * than default DRAM tier.
+ */
+ default_dram_type = alloc_memory_type(MEMTIER_ADISTANCE_DRAM);
+- if (!default_dram_type)
++ if (IS_ERR(default_dram_type))
+ panic("%s() failed to allocate default DRAM tier\n", __func__);
+
+ /*
+--
+2.25.1
+
diff --git a/0016-x86-mm-Randomize-per-cpu-entry-area.patch b/0016-x86-mm-Randomize-per-cpu-entry-area.patch
new file mode 100644
index 0000000..6f5dd2d
--- /dev/null
+++ b/0016-x86-mm-Randomize-per-cpu-entry-area.patch
@@ -0,0 +1,172 @@
+From 0324d3cd1b57c06b0cf31b6db643ced5b29b0947 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz(a)infradead.org>
+Date: Fri, 17 Mar 2023 03:07:41 +0000
+Subject: [PATCH 16/24] x86/mm: Randomize per-cpu entry area
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 97e3d26b5e5f371b3ee223d94dd123e6c442ba80
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Seth found that the CPU-entry-area; the piece of per-cpu data that is
+mapped into the userspace page-tables for kPTI is not subject to any
+randomization -- irrespective of kASLR settings.
+
+On x86_64 a whole P4D (512 GB) of virtual address space is reserved for
+this structure, which is plenty large enough to randomize things a
+little.
+
+As such, use a straight forward randomization scheme that avoids
+duplicates to spread the existing CPUs over the available space.
+
+ [ bp: Fix le build. ]
+
+Reported-by: Seth Jenkins <sethjenkins(a)google.com>
+Reviewed-by: Kees Cook <keescook(a)chromium.org>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
+Signed-off-by: Borislav Petkov <bp(a)suse.de>
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/include/asm/cpu_entry_area.h | 4 ---
+ arch/x86/include/asm/pgtable_areas.h | 8 ++++-
+ arch/x86/kernel/hw_breakpoint.c | 2 +-
+ arch/x86/mm/cpu_entry_area.c | 46 ++++++++++++++++++++++++---
+ 4 files changed, 50 insertions(+), 10 deletions(-)
+
+diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h
+index 75efc4c6f076..462fc34f1317 100644
+--- a/arch/x86/include/asm/cpu_entry_area.h
++++ b/arch/x86/include/asm/cpu_entry_area.h
+@@ -130,10 +130,6 @@ struct cpu_entry_area {
+ };
+
+ #define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area))
+-#define CPU_ENTRY_AREA_ARRAY_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS)
+-
+-/* Total size includes the readonly IDT mapping page as well: */
+-#define CPU_ENTRY_AREA_TOTAL_SIZE (CPU_ENTRY_AREA_ARRAY_SIZE + PAGE_SIZE)
+
+ DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+ DECLARE_PER_CPU(struct cea_exception_stacks *, cea_exception_stacks);
+diff --git a/arch/x86/include/asm/pgtable_areas.h b/arch/x86/include/asm/pgtable_areas.h
+index d34cce1b995c..4f056fb88174 100644
+--- a/arch/x86/include/asm/pgtable_areas.h
++++ b/arch/x86/include/asm/pgtable_areas.h
+@@ -11,6 +11,12 @@
+
+ #define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT)
+
+-#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_ARRAY_SIZE - CPU_ENTRY_AREA_BASE)
++#ifdef CONFIG_X86_32
++#define CPU_ENTRY_AREA_MAP_SIZE (CPU_ENTRY_AREA_PER_CPU + \
++ (CPU_ENTRY_AREA_SIZE * NR_CPUS) - \
++ CPU_ENTRY_AREA_BASE)
++#else
++#define CPU_ENTRY_AREA_MAP_SIZE P4D_SIZE
++#endif
+
+ #endif /* _ASM_X86_PGTABLE_AREAS_H */
+diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
+index 668a4a6533d9..bbb0f737aab1 100644
+--- a/arch/x86/kernel/hw_breakpoint.c
++++ b/arch/x86/kernel/hw_breakpoint.c
+@@ -266,7 +266,7 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
+
+ /* CPU entry erea is always used for CPU entry */
+ if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
+- CPU_ENTRY_AREA_TOTAL_SIZE))
++ CPU_ENTRY_AREA_MAP_SIZE))
+ return true;
+
+ /*
+diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
+index 6c2f1b76a0b6..20844cf141fb 100644
+--- a/arch/x86/mm/cpu_entry_area.c
++++ b/arch/x86/mm/cpu_entry_area.c
+@@ -15,16 +15,53 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage)
+ #ifdef CONFIG_X86_64
+ static DEFINE_PER_CPU_PAGE_ALIGNED(struct exception_stacks, exception_stacks);
+ DEFINE_PER_CPU(struct cea_exception_stacks*, cea_exception_stacks);
+-#endif
+
+-#ifdef CONFIG_X86_32
++static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, _cea_offset);
++
++static __always_inline unsigned int cea_offset(unsigned int cpu)
++{
++ return per_cpu(_cea_offset, cpu);
++}
++
++static __init void init_cea_offsets(void)
++{
++ unsigned int max_cea;
++ unsigned int i, j;
++
++ max_cea = (CPU_ENTRY_AREA_MAP_SIZE - PAGE_SIZE) / CPU_ENTRY_AREA_SIZE;
++
++ /* O(sodding terrible) */
++ for_each_possible_cpu(i) {
++ unsigned int cea;
++
++again:
++ cea = prandom_u32_max(max_cea);
++
++ for_each_possible_cpu(j) {
++ if (cea_offset(j) == cea)
++ goto again;
++
++ if (i == j)
++ break;
++ }
++
++ per_cpu(_cea_offset, i) = cea;
++ }
++}
++#else /* !X86_64 */
+ DECLARE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack);
++
++static __always_inline unsigned int cea_offset(unsigned int cpu)
++{
++ return cpu;
++}
++static inline void init_cea_offsets(void) { }
+ #endif
+
+ /* Is called from entry code, so must be noinstr */
+ noinstr struct cpu_entry_area *get_cpu_entry_area(int cpu)
+ {
+- unsigned long va = CPU_ENTRY_AREA_PER_CPU + cpu * CPU_ENTRY_AREA_SIZE;
++ unsigned long va = CPU_ENTRY_AREA_PER_CPU + cea_offset(cpu) * CPU_ENTRY_AREA_SIZE;
+ BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
+
+ return (struct cpu_entry_area *) va;
+@@ -205,7 +242,6 @@ static __init void setup_cpu_entry_area_ptes(void)
+
+ /* The +1 is for the readonly IDT: */
+ BUILD_BUG_ON((CPU_ENTRY_AREA_PAGES+1)*PAGE_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
+- BUILD_BUG_ON(CPU_ENTRY_AREA_TOTAL_SIZE != CPU_ENTRY_AREA_MAP_SIZE);
+ BUG_ON(CPU_ENTRY_AREA_BASE & ~PMD_MASK);
+
+ start = CPU_ENTRY_AREA_BASE;
+@@ -221,6 +257,8 @@ void __init setup_cpu_entry_areas(void)
+ {
+ unsigned int cpu;
+
++ init_cea_offsets();
++
+ setup_cpu_entry_area_ptes();
+
+ for_each_possible_cpu(cpu)
+--
+2.25.1
+
diff --git a/0017-x86-kasan-Map-shadow-for-percpu-pages-on-demand.patch b/0017-x86-kasan-Map-shadow-for-percpu-pages-on-demand.patch
new file mode 100644
index 0000000..b9129b6
--- /dev/null
+++ b/0017-x86-kasan-Map-shadow-for-percpu-pages-on-demand.patch
@@ -0,0 +1,126 @@
+From 68992563c4b6b1776bd90dafe76caa88ff6dbfe8 Mon Sep 17 00:00:00 2001
+From: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Date: Fri, 17 Mar 2023 03:07:42 +0000
+Subject: [PATCH 17/24] x86/kasan: Map shadow for percpu pages on demand
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 3f148f3318140035e87decc1214795ff0755757b
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+KASAN maps shadow for the entire CPU-entry-area:
+ [CPU_ENTRY_AREA_BASE, CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE]
+
+This will explode once the per-cpu entry areas are randomized since it
+will increase CPU_ENTRY_AREA_MAP_SIZE to 512 GB and KASAN fails to
+allocate shadow for such big area.
+
+Fix this by allocating KASAN shadow only for really used cpu entry area
+addresses mapped by cea_map_percpu_pages()
+
+Thanks to the 0day folks for finding and reporting this to be an issue.
+
+[ dhansen: tweak changelog since this will get committed before peterz's
+ actual cpu-entry-area randomization ]
+
+Signed-off-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Signed-off-by: Dave Hansen <dave.hansen(a)linux.intel.com>
+Tested-by: Yujie Liu <yujie.liu(a)intel.com>
+Cc: kernel test robot <yujie.liu(a)intel.com>
+Link: https://lore.kernel.org/r/202210241508.2e203c3d-yujie.liu@intel.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/include/asm/kasan.h | 3 +++
+ arch/x86/mm/cpu_entry_area.c | 8 +++++++-
+ arch/x86/mm/kasan_init_64.c | 15 ++++++++++++---
+ 3 files changed, 22 insertions(+), 4 deletions(-)
+
+diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h
+index 13e70da38bed..de75306b932e 100644
+--- a/arch/x86/include/asm/kasan.h
++++ b/arch/x86/include/asm/kasan.h
+@@ -28,9 +28,12 @@
+ #ifdef CONFIG_KASAN
+ void __init kasan_early_init(void);
+ void __init kasan_init(void);
++void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid);
+ #else
+ static inline void kasan_early_init(void) { }
+ static inline void kasan_init(void) { }
++static inline void kasan_populate_shadow_for_vaddr(void *va, size_t size,
++ int nid) { }
+ #endif
+
+ #endif
+diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
+index 20844cf141fb..dff9001e5e12 100644
+--- a/arch/x86/mm/cpu_entry_area.c
++++ b/arch/x86/mm/cpu_entry_area.c
+@@ -9,6 +9,7 @@
+ #include <asm/cpu_entry_area.h>
+ #include <asm/fixmap.h>
+ #include <asm/desc.h>
++#include <asm/kasan.h>
+
+ static DEFINE_PER_CPU_PAGE_ALIGNED(struct entry_stack_page, entry_stack_storage);
+
+@@ -90,8 +91,13 @@ void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+ static void __init
+ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+ {
++ phys_addr_t pa = per_cpu_ptr_to_phys(ptr);
++
++ kasan_populate_shadow_for_vaddr(cea_vaddr, pages * PAGE_SIZE,
++ early_pfn_to_nid(PFN_DOWN(pa)));
++
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+- cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
++ cea_set_pte(cea_vaddr, pa, prot);
+ }
+
+ static void __init percpu_setup_debug_store(unsigned int cpu)
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
+index e7b9b464a82f..d1416926ad52 100644
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -316,6 +316,18 @@ void __init kasan_early_init(void)
+ kasan_map_early_shadow(init_top_pgt);
+ }
+
++void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
++{
++ unsigned long shadow_start, shadow_end;
++
++ shadow_start = (unsigned long)kasan_mem_to_shadow(va);
++ shadow_start = round_down(shadow_start, PAGE_SIZE);
++ shadow_end = (unsigned long)kasan_mem_to_shadow(va + size);
++ shadow_end = round_up(shadow_end, PAGE_SIZE);
++
++ kasan_populate_shadow(shadow_start, shadow_end, nid);
++}
++
+ void __init kasan_init(void)
+ {
+ int i;
+@@ -393,9 +405,6 @@ void __init kasan_init(void)
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
+ shadow_cpu_entry_begin);
+
+- kasan_populate_shadow((unsigned long)shadow_cpu_entry_begin,
+- (unsigned long)shadow_cpu_entry_end, 0);
+-
+ kasan_populate_early_shadow(shadow_cpu_entry_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+--
+2.25.1
+
diff --git a/0018-x86-mm-Recompute-physical-address-for-every-page-of-.patch b/0018-x86-mm-Recompute-physical-address-for-every-page-of-.patch
new file mode 100644
index 0000000..2696052
--- /dev/null
+++ b/0018-x86-mm-Recompute-physical-address-for-every-page-of-.patch
@@ -0,0 +1,50 @@
+From 12867b242d6e431f6f947e53abd1094cd0075b55 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc(a)google.com>
+Date: Fri, 17 Mar 2023 03:07:43 +0000
+Subject: [PATCH 18/24] x86/mm: Recompute physical address for every page of
+ per-CPU CEA mapping
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 80d72a8f76e8f3f0b5a70b8c7022578e17bde8e7
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Recompute the physical address for each per-CPU page in the CPU entry
+area, a recent commit inadvertantly modified cea_map_percpu_pages() such
+that every PTE is mapped to the physical address of the first page.
+
+Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand")
+Signed-off-by: Sean Christopherson <seanjc(a)google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Reviewed-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Link: https://lkml.kernel.org/r/20221110203504.1985010-2-seanjc@google.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/mm/cpu_entry_area.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
+index dff9001e5e12..d831aae94b41 100644
+--- a/arch/x86/mm/cpu_entry_area.c
++++ b/arch/x86/mm/cpu_entry_area.c
+@@ -97,7 +97,7 @@ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+ early_pfn_to_nid(PFN_DOWN(pa)));
+
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+- cea_set_pte(cea_vaddr, pa, prot);
++ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+ }
+
+ static void __init percpu_setup_debug_store(unsigned int cpu)
+--
+2.25.1
+
diff --git a/0019-x86-mm-Populate-KASAN-shadow-for-entire-per-CPU-rang.patch b/0019-x86-mm-Populate-KASAN-shadow-for-entire-per-CPU-rang.patch
new file mode 100644
index 0000000..8fa96dd
--- /dev/null
+++ b/0019-x86-mm-Populate-KASAN-shadow-for-entire-per-CPU-rang.patch
@@ -0,0 +1,121 @@
+From 91bb861cfc95653af4223af2e00b9e637c501d5a Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc(a)google.com>
+Date: Fri, 17 Mar 2023 03:07:44 +0000
+Subject: [PATCH 19/24] x86/mm: Populate KASAN shadow for entire per-CPU range
+ of CPU entry area
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 97650148a15e0b30099d6175ffe278b9f55ec66a
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Populate a KASAN shadow for the entire possible per-CPU range of the CPU
+entry area instead of requiring that each individual chunk map a shadow.
+Mapping shadows individually is error prone, e.g. the per-CPU GDT mapping
+was left behind, which can lead to not-present page faults during KASAN
+validation if the kernel performs a software lookup into the GDT. The DS
+buffer is also likely affected.
+
+The motivation for mapping the per-CPU areas on-demand was to avoid
+mapping the entire 512GiB range that's reserved for the CPU entry area,
+shaving a few bytes by not creating shadows for potentially unused memory
+was not a goal.
+
+The bug is most easily reproduced by doing a sigreturn with a garbage
+CS in the sigcontext, e.g.
+
+ int main(void)
+ {
+ struct sigcontext regs;
+
+ syscall(__NR_mmap, 0x1ffff000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
+ syscall(__NR_mmap, 0x20000000ul, 0x1000000ul, 7ul, 0x32ul, -1, 0ul);
+ syscall(__NR_mmap, 0x21000000ul, 0x1000ul, 0ul, 0x32ul, -1, 0ul);
+
+ memset(&regs, 0, sizeof(regs));
+ regs.cs = 0x1d0;
+ syscall(__NR_rt_sigreturn);
+ return 0;
+ }
+
+to coerce the kernel into doing a GDT lookup to compute CS.base when
+reading the instruction bytes on the subsequent #GP to determine whether
+or not the #GP is something the kernel should handle, e.g. to fixup UMIP
+violations or to emulate CLI/STI for IOPL=3 applications.
+
+ BUG: unable to handle page fault for address: fffffbc8379ace00
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 16c03a067 P4D 16c03a067 PUD 15b990067 PMD 15b98f067 PTE 0
+ Oops: 0000 [#1] PREEMPT SMP KASAN
+ CPU: 3 PID: 851 Comm: r2 Not tainted 6.1.0-rc3-next-20221103+ #432
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+ RIP: 0010:kasan_check_range+0xdf/0x190
+ Call Trace:
+ <TASK>
+ get_desc+0xb0/0x1d0
+ insn_get_seg_base+0x104/0x270
+ insn_fetch_from_user+0x66/0x80
+ fixup_umip_exception+0xb1/0x530
+ exc_general_protection+0x181/0x210
+ asm_exc_general_protection+0x22/0x30
+ RIP: 0003:0x0
+ Code: Unable to access opcode bytes at 0xffffffffffffffd6.
+ RSP: 0003:0000000000000000 EFLAGS: 00000202
+ RAX: 0000000000000000 RBX: 0000000000000000 RCX: 00000000000001d0
+ RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
+ RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+ R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+ </TASK>
+
+Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand")
+Reported-by: syzbot+ffb4f000dc2872c93f62(a)syzkaller.appspotmail.com
+Suggested-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Signed-off-by: Sean Christopherson <seanjc(a)google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Reviewed-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Link: https://lkml.kernel.org/r/20221110203504.1985010-3-seanjc@google.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/mm/cpu_entry_area.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/arch/x86/mm/cpu_entry_area.c b/arch/x86/mm/cpu_entry_area.c
+index d831aae94b41..7c855dffcdc2 100644
+--- a/arch/x86/mm/cpu_entry_area.c
++++ b/arch/x86/mm/cpu_entry_area.c
+@@ -91,11 +91,6 @@ void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags)
+ static void __init
+ cea_map_percpu_pages(void *cea_vaddr, void *ptr, int pages, pgprot_t prot)
+ {
+- phys_addr_t pa = per_cpu_ptr_to_phys(ptr);
+-
+- kasan_populate_shadow_for_vaddr(cea_vaddr, pages * PAGE_SIZE,
+- early_pfn_to_nid(PFN_DOWN(pa)));
+-
+ for ( ; pages; pages--, cea_vaddr+= PAGE_SIZE, ptr += PAGE_SIZE)
+ cea_set_pte(cea_vaddr, per_cpu_ptr_to_phys(ptr), prot);
+ }
+@@ -195,6 +190,9 @@ static void __init setup_cpu_entry_area(unsigned int cpu)
+ pgprot_t tss_prot = PAGE_KERNEL;
+ #endif
+
++ kasan_populate_shadow_for_vaddr(cea, CPU_ENTRY_AREA_SIZE,
++ early_cpu_to_node(cpu));
++
+ cea_set_pte(&cea->gdt, get_cpu_gdt_paddr(cpu), gdt_prot);
+
+ cea_map_percpu_pages(&cea->entry_stack_page,
+--
+2.25.1
+
diff --git a/0020-x86-kasan-Rename-local-CPU_ENTRY_AREA-variables-to-s.patch b/0020-x86-kasan-Rename-local-CPU_ENTRY_AREA-variables-to-s.patch
new file mode 100644
index 0000000..e4f9f63
--- /dev/null
+++ b/0020-x86-kasan-Rename-local-CPU_ENTRY_AREA-variables-to-s.patch
@@ -0,0 +1,88 @@
+From 0560fceb4d3c76133f1a89decbf1c3334afdbd00 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc(a)google.com>
+Date: Fri, 17 Mar 2023 03:07:45 +0000
+Subject: [PATCH 20/24] x86/kasan: Rename local CPU_ENTRY_AREA variables to
+ shorten names
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 7077d2ccb94dafd00b29cc2d601c9f6891648f5b
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Rename the CPU entry area variables in kasan_init() to shorten their
+names, a future fix will reference the beginning of the per-CPU portion
+of the CPU entry area, and shadow_cpu_entry_per_cpu_begin is a bit much.
+
+No functional change intended.
+
+Signed-off-by: Sean Christopherson <seanjc(a)google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Reviewed-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Link: https://lkml.kernel.org/r/20221110203504.1985010-4-seanjc@google.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/mm/kasan_init_64.c | 22 +++++++++++-----------
+ 1 file changed, 11 insertions(+), 11 deletions(-)
+
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
+index d1416926ad52..ad7872ae10ed 100644
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -331,7 +331,7 @@ void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
+ void __init kasan_init(void)
+ {
+ int i;
+- void *shadow_cpu_entry_begin, *shadow_cpu_entry_end;
++ void *shadow_cea_begin, *shadow_cea_end;
+
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+@@ -372,16 +372,16 @@ void __init kasan_init(void)
+ map_range(&pfn_mapped[i]);
+ }
+
+- shadow_cpu_entry_begin = (void *)CPU_ENTRY_AREA_BASE;
+- shadow_cpu_entry_begin = kasan_mem_to_shadow(shadow_cpu_entry_begin);
+- shadow_cpu_entry_begin = (void *)round_down(
+- (unsigned long)shadow_cpu_entry_begin, PAGE_SIZE);
++ shadow_cea_begin = (void *)CPU_ENTRY_AREA_BASE;
++ shadow_cea_begin = kasan_mem_to_shadow(shadow_cea_begin);
++ shadow_cea_begin = (void *)round_down(
++ (unsigned long)shadow_cea_begin, PAGE_SIZE);
+
+- shadow_cpu_entry_end = (void *)(CPU_ENTRY_AREA_BASE +
++ shadow_cea_end = (void *)(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+- shadow_cpu_entry_end = kasan_mem_to_shadow(shadow_cpu_entry_end);
+- shadow_cpu_entry_end = (void *)round_up(
+- (unsigned long)shadow_cpu_entry_end, PAGE_SIZE);
++ shadow_cea_end = kasan_mem_to_shadow(shadow_cea_end);
++ shadow_cea_end = (void *)round_up(
++ (unsigned long)shadow_cea_end, PAGE_SIZE);
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+@@ -403,9 +403,9 @@ void __init kasan_init(void)
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
+- shadow_cpu_entry_begin);
++ shadow_cea_begin);
+
+- kasan_populate_early_shadow(shadow_cpu_entry_end,
++ kasan_populate_early_shadow(shadow_cea_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+--
+2.25.1
+
diff --git a/0021-x86-kasan-Add-helpers-to-align-shadow-addresses-up-a.patch b/0021-x86-kasan-Add-helpers-to-align-shadow-addresses-up-a.patch
new file mode 100644
index 0000000..5f3f362
--- /dev/null
+++ b/0021-x86-kasan-Add-helpers-to-align-shadow-addresses-up-a.patch
@@ -0,0 +1,113 @@
+From ec4ebad1a3ed5a1ff3301de4df9a12ebf81b09c1 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc(a)google.com>
+Date: Fri, 17 Mar 2023 03:07:46 +0000
+Subject: [PATCH 21/24] x86/kasan: Add helpers to align shadow addresses up and
+ down
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit bde258d97409f2a45243cb393a55ea9ecfc7aba5
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Add helpers to dedup code for aligning shadow address up/down to page
+boundaries when translating an address to its shadow.
+
+No functional change intended.
+
+Signed-off-by: Sean Christopherson <seanjc(a)google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Reviewed-by: Andrey Ryabinin <ryabinin.a.a(a)gmail.com>
+Link: https://lkml.kernel.org/r/20221110203504.1985010-5-seanjc@google.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/mm/kasan_init_64.c | 40 ++++++++++++++++++++-----------------
+ 1 file changed, 22 insertions(+), 18 deletions(-)
+
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
+index ad7872ae10ed..afc5e129ca7b 100644
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -316,22 +316,33 @@ void __init kasan_early_init(void)
+ kasan_map_early_shadow(init_top_pgt);
+ }
+
++static unsigned long kasan_mem_to_shadow_align_down(unsigned long va)
++{
++ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
++
++ return round_down(shadow, PAGE_SIZE);
++}
++
++static unsigned long kasan_mem_to_shadow_align_up(unsigned long va)
++{
++ unsigned long shadow = (unsigned long)kasan_mem_to_shadow((void *)va);
++
++ return round_up(shadow, PAGE_SIZE);
++}
++
+ void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
+ {
+ unsigned long shadow_start, shadow_end;
+
+- shadow_start = (unsigned long)kasan_mem_to_shadow(va);
+- shadow_start = round_down(shadow_start, PAGE_SIZE);
+- shadow_end = (unsigned long)kasan_mem_to_shadow(va + size);
+- shadow_end = round_up(shadow_end, PAGE_SIZE);
+-
++ shadow_start = kasan_mem_to_shadow_align_down((unsigned long)va);
++ shadow_end = kasan_mem_to_shadow_align_up((unsigned long)va + size);
+ kasan_populate_shadow(shadow_start, shadow_end, nid);
+ }
+
+ void __init kasan_init(void)
+ {
++ unsigned long shadow_cea_begin, shadow_cea_end;
+ int i;
+- void *shadow_cea_begin, *shadow_cea_end;
+
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+@@ -372,16 +383,9 @@ void __init kasan_init(void)
+ map_range(&pfn_mapped[i]);
+ }
+
+- shadow_cea_begin = (void *)CPU_ENTRY_AREA_BASE;
+- shadow_cea_begin = kasan_mem_to_shadow(shadow_cea_begin);
+- shadow_cea_begin = (void *)round_down(
+- (unsigned long)shadow_cea_begin, PAGE_SIZE);
+-
+- shadow_cea_end = (void *)(CPU_ENTRY_AREA_BASE +
+- CPU_ENTRY_AREA_MAP_SIZE);
+- shadow_cea_end = kasan_mem_to_shadow(shadow_cea_end);
+- shadow_cea_end = (void *)round_up(
+- (unsigned long)shadow_cea_end, PAGE_SIZE);
++ shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
++ shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
++ CPU_ENTRY_AREA_MAP_SIZE);
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+@@ -403,9 +407,9 @@ void __init kasan_init(void)
+
+ kasan_populate_early_shadow(
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
+- shadow_cea_begin);
++ (void *)shadow_cea_begin);
+
+- kasan_populate_early_shadow(shadow_cea_end,
++ kasan_populate_early_shadow((void *)shadow_cea_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+ kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext),
+--
+2.25.1
+
diff --git a/0022-x86-kasan-Populate-shadow-for-shared-chunk-of-the-CP.patch b/0022-x86-kasan-Populate-shadow-for-shared-chunk-of-the-CP.patch
new file mode 100644
index 0000000..d6f9b77
--- /dev/null
+++ b/0022-x86-kasan-Populate-shadow-for-shared-chunk-of-the-CP.patch
@@ -0,0 +1,99 @@
+From 885cbab14224aca9bcf6df23a432a84e55b55dd5 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc(a)google.com>
+Date: Fri, 17 Mar 2023 03:07:47 +0000
+Subject: [PATCH 22/24] x86/kasan: Populate shadow for shared chunk of the CPU
+ entry area
+
+mainline inclusion
+from mainline-v6.2-rc1
+commit 1cfaac2400c73378e78182a706be0f3ac8b93cd7
+category: bugfix
+bugzilla: 188336
+CVE: CVE-2023-0597
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+Popuplate the shadow for the shared portion of the CPU entry area, i.e.
+the read-only IDT mapping, during KASAN initialization. A recent change
+modified KASAN to map the per-CPU areas on-demand, but forgot to keep a
+shadow for the common area that is shared amongst all CPUs.
+
+Map the common area in KASAN init instead of letting idt_map_in_cea() do
+the dirty work so that it Just Works in the unlikely event more shared
+data is shoved into the CPU entry area.
+
+The bug manifests as a not-present #PF when software attempts to lookup
+an IDT entry, e.g. when KVM is handling IRQs on Intel CPUs (KVM performs
+direct CALL to the IRQ handler to avoid the overhead of INTn):
+
+ BUG: unable to handle page fault for address: fffffbc0000001d8
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 16c03a067 P4D 16c03a067 PUD 0
+ Oops: 0000 [#1] PREEMPT SMP KASAN
+ CPU: 5 PID: 901 Comm: repro Tainted: G W 6.1.0-rc3+ #410
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015
+ RIP: 0010:kasan_check_range+0xdf/0x190
+ vmx_handle_exit_irqoff+0x152/0x290 [kvm_intel]
+ vcpu_run+0x1d89/0x2bd0 [kvm]
+ kvm_arch_vcpu_ioctl_run+0x3ce/0xa70 [kvm]
+ kvm_vcpu_ioctl+0x349/0x900 [kvm]
+ __x64_sys_ioctl+0xb8/0xf0
+ do_syscall_64+0x2b/0x50
+ entry_SYSCALL_64_after_hwframe+0x46/0xb0
+
+Fixes: 9fd429c28073 ("x86/kasan: Map shadow for percpu pages on demand")
+Reported-by: syzbot+8cdd16fd5a6c0565e227(a)syzkaller.appspotmail.com
+Signed-off-by: Sean Christopherson <seanjc(a)google.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz(a)infradead.org>
+Link: https://lkml.kernel.org/r/20221110203504.1985010-6-seanjc@google.com
+Signed-off-by: Tong Tiangen <tongtiangen(a)huawei.com>
+Reviewed-by: Nanyong Sun <sunnanyong(a)huawei.com>
+Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ arch/x86/mm/kasan_init_64.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
+index afc5e129ca7b..0302491d799d 100644
+--- a/arch/x86/mm/kasan_init_64.c
++++ b/arch/x86/mm/kasan_init_64.c
+@@ -341,7 +341,7 @@ void __init kasan_populate_shadow_for_vaddr(void *va, size_t size, int nid)
+
+ void __init kasan_init(void)
+ {
+- unsigned long shadow_cea_begin, shadow_cea_end;
++ unsigned long shadow_cea_begin, shadow_cea_per_cpu_begin, shadow_cea_end;
+ int i;
+
+ memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+@@ -384,6 +384,7 @@ void __init kasan_init(void)
+ }
+
+ shadow_cea_begin = kasan_mem_to_shadow_align_down(CPU_ENTRY_AREA_BASE);
++ shadow_cea_per_cpu_begin = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_PER_CPU);
+ shadow_cea_end = kasan_mem_to_shadow_align_up(CPU_ENTRY_AREA_BASE +
+ CPU_ENTRY_AREA_MAP_SIZE);
+
+@@ -409,6 +410,15 @@ void __init kasan_init(void)
+ kasan_mem_to_shadow((void *)VMALLOC_END + 1),
+ (void *)shadow_cea_begin);
+
++ /*
++ * Populate the shadow for the shared portion of the CPU entry area.
++ * Shadows for the per-CPU areas are mapped on-demand, as each CPU's
++ * area is randomly placed somewhere in the 512GiB range and mapping
++ * the entire 512GiB range is prohibitively expensive.
++ */
++ kasan_populate_shadow(shadow_cea_begin,
++ shadow_cea_per_cpu_begin, 0);
++
+ kasan_populate_early_shadow((void *)shadow_cea_end,
+ kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+--
+2.25.1
+
diff --git a/0023-net-sched-act_mirred-better-wording-on-protection-ag.patch b/0023-net-sched-act_mirred-better-wording-on-protection-ag.patch
new file mode 100644
index 0000000..8065822
--- /dev/null
+++ b/0023-net-sched-act_mirred-better-wording-on-protection-ag.patch
@@ -0,0 +1,97 @@
+From 1420d4aeb4cecca648b494e6d875c222da1d9309 Mon Sep 17 00:00:00 2001
+From: Davide Caratti <dcaratti(a)redhat.com>
+Date: Sat, 18 Mar 2023 16:46:22 +0800
+Subject: [PATCH 23/24] net/sched: act_mirred: better wording on protection
+ against excessive stack growth
+
+mainline inclusion
+from mainline-v6.3-rc1
+commit 78dcdffe0418ac8f3f057f26fe71ccf4d8ed851f
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/kernel/issues/I64END
+CVE: CVE-2022-4269
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+with commit e2ca070f89ec ("net: sched: protect against stack overflow in
+TC act_mirred"), act_mirred protected itself against excessive stack growth
+using per_cpu counter of nested calls to tcf_mirred_act(), and capping it
+to MIRRED_RECURSION_LIMIT. However, such protection does not detect
+recursion/loops in case the packet is enqueued to the backlog (for example,
+when the mirred target device has RPS or skb timestamping enabled). Change
+the wording from "recursion" to "nesting" to make it more clear to readers.
+
+CC: Jamal Hadi Salim <jhs(a)mojatatu.com>
+Signed-off-by: Davide Caratti <dcaratti(a)redhat.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner(a)gmail.com>
+Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
+Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
+Signed-off-by: Ziyang Xuan <william.xuanziyang(a)huawei.com>
+Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ net/sched/act_mirred.c | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
+index b8ad6ae282c0..ded6ee054be1 100644
+--- a/net/sched/act_mirred.c
++++ b/net/sched/act_mirred.c
+@@ -28,8 +28,8 @@
+ static LIST_HEAD(mirred_list);
+ static DEFINE_SPINLOCK(mirred_list_lock);
+
+-#define MIRRED_RECURSION_LIMIT 4
+-static DEFINE_PER_CPU(unsigned int, mirred_rec_level);
++#define MIRRED_NEST_LIMIT 4
++static DEFINE_PER_CPU(unsigned int, mirred_nest_level);
+
+ static bool tcf_mirred_is_act_redirect(int action)
+ {
+@@ -224,7 +224,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ struct sk_buff *skb2 = skb;
+ bool m_mac_header_xmit;
+ struct net_device *dev;
+- unsigned int rec_level;
++ unsigned int nest_level;
+ int retval, err = 0;
+ bool use_reinsert;
+ bool want_ingress;
+@@ -235,11 +235,11 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ int mac_len;
+ bool at_nh;
+
+- rec_level = __this_cpu_inc_return(mirred_rec_level);
+- if (unlikely(rec_level > MIRRED_RECURSION_LIMIT)) {
++ nest_level = __this_cpu_inc_return(mirred_nest_level);
++ if (unlikely(nest_level > MIRRED_NEST_LIMIT)) {
+ net_warn_ratelimited("Packet exceeded mirred recursion limit on dev %s\n",
+ netdev_name(skb->dev));
+- __this_cpu_dec(mirred_rec_level);
++ __this_cpu_dec(mirred_nest_level);
+ return TC_ACT_SHOT;
+ }
+
+@@ -308,7 +308,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ err = tcf_mirred_forward(want_ingress, skb);
+ if (err)
+ tcf_action_inc_overlimit_qstats(&m->common);
+- __this_cpu_dec(mirred_rec_level);
++ __this_cpu_dec(mirred_nest_level);
+ return TC_ACT_CONSUMED;
+ }
+ }
+@@ -320,7 +320,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a,
+ if (tcf_mirred_is_act_redirect(m_eaction))
+ retval = TC_ACT_SHOT;
+ }
+- __this_cpu_dec(mirred_rec_level);
++ __this_cpu_dec(mirred_nest_level);
+
+ return retval;
+ }
+--
+2.25.1
+
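The protection described in the commit message above boils down to a per-CPU depth counter that is incremented on entry to tcf_mirred_act(), checked against a small limit, and decremented on every exit path. A minimal userspace analogue of that pattern, with invented names and a thread-local variable standing in for the kernel's per-CPU counter, might look like this:

/*
 * Userspace analogue of the act_mirred nesting guard: bump a
 * thread-local counter on entry, refuse to go deeper than the limit,
 * and always drop the counter before returning. Names are invented
 * for illustration; only the pattern mirrors the kernel code.
 */
#include <stdio.h>

#define MIRRED_NEST_LIMIT 4

static __thread unsigned int mirred_nest_level;

static int forward_packet(int depth)
{
	int ret = 0;

	if (++mirred_nest_level > MIRRED_NEST_LIMIT) {
		fprintf(stderr, "nesting limit exceeded at depth %d\n", depth);
		mirred_nest_level--;
		return -1;		/* analogous to TC_ACT_SHOT */
	}

	/* a redirected packet may re-enter the same action */
	if (depth < 10)
		ret = forward_packet(depth + 1);

	mirred_nest_level--;
	return ret;
}

int main(void)
{
	if (forward_packet(1) < 0)
		printf("packet dropped once nesting exceeded %d\n",
		       MIRRED_NEST_LIMIT);
	return 0;
}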
diff --git a/0024-act_mirred-use-the-backlog-for-nested-calls-to-mirre.patch b/0024-act_mirred-use-the-backlog-for-nested-calls-to-mirre.patch
new file mode 100644
index 0000000..edfc0ba
--- /dev/null
+++ b/0024-act_mirred-use-the-backlog-for-nested-calls-to-mirre.patch
@@ -0,0 +1,149 @@
+From a6bb3989ccb7d3493c20e709179904733c6db856 Mon Sep 17 00:00:00 2001
+From: Davide Caratti <dcaratti(a)redhat.com>
+Date: Sat, 18 Mar 2023 16:46:40 +0800
+Subject: [PATCH 24/24] act_mirred: use the backlog for nested calls to mirred
+ ingress
+
+mainline inclusion
+from mainline-v6.3-rc1
+commit ca22da2fbd693b54dc8e3b7b54ccc9f7e9ba3640
+category: bugfix
+bugzilla: https://gitee.com/src-openeuler/kernel/issues/I64END
+CVE: CVE-2022-4269
+
+Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
+
+--------------------------------
+
+William reports kernel soft-lockups on some OVS topologies when TC mirred
+egress->ingress action is hit by local TCP traffic [1].
+The same can also be reproduced with SCTP (thanks Xin for verifying), when
+client and server reach themselves through mirred egress to ingress, and
+one of the two peers sends a "heartbeat" packet (from within a timer).
+
+Enqueueing to backlog proved to fix this soft lockup; however, as Cong
+noticed [2], we should preserve - when possible - the current mirred
+behavior that counts as "overlimits" any eventual packet drop subsequent to
+the mirred forwarding action [3]. A compromise solution might use the
+backlog only when tcf_mirred_act() has a nest level greater than one:
+change tcf_mirred_forward() accordingly.
+
+Also, add a kselftest that can reproduce the lockup and verifies TC mirred
+ability to account for further packet drops after TC mirred egress->ingress
+(when the nest level is 1).
+
+ [1] https://lore.kernel.org/netdev/33dc43f587ec1388ba456b4915c75f02a8aae226.166…
+ [2] https://lore.kernel.org/netdev/Y0w%2FWWY60gqrtGLp@pop-os.localdomain/
+ [3] such behavior is not guaranteed: for example, if RPS or skb RX
+ timestamping is enabled on the mirred target device, the kernel
+ can defer receiving the skb and return NET_RX_SUCCESS inside
+ tcf_mirred_forward().
+
+Reported-by: William Zhao <wizhao(a)redhat.com>
+CC: Xin Long <lucien.xin(a)gmail.com>
+Signed-off-by: Davide Caratti <dcaratti(a)redhat.com>
+Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner(a)gmail.com>
+Acked-by: Jamal Hadi Salim <jhs(a)mojatatu.com>
+Signed-off-by: Paolo Abeni <pabeni(a)redhat.com>
+Signed-off-by: Ziyang Xuan <william.xuanziyang(a)huawei.com>
+Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com>
+Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
+---
+ net/sched/act_mirred.c | 7 +++
+ .../selftests/net/forwarding/tc_actions.sh | 49 ++++++++++++++++++-
+ 2 files changed, 55 insertions(+), 1 deletion(-)
+
+diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
+index ded6ee054be1..baeae5e5c8f0 100644
+--- a/net/sched/act_mirred.c
++++ b/net/sched/act_mirred.c
+@@ -205,12 +205,19 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
+ return err;
+ }
+
++static bool is_mirred_nested(void)
++{
++ return unlikely(__this_cpu_read(mirred_nest_level) > 1);
++}
++
+ static int tcf_mirred_forward(bool want_ingress, struct sk_buff *skb)
+ {
+ int err;
+
+ if (!want_ingress)
+ err = tcf_dev_queue_xmit(skb, dev_queue_xmit);
++ else if (is_mirred_nested())
++ err = netif_rx(skb);
+ else
+ err = netif_receive_skb(skb);
+
+diff --git a/tools/testing/selftests/net/forwarding/tc_actions.sh b/tools/testing/selftests/net/forwarding/tc_actions.sh
+index 1e0a62f638fe..919c0dd9fe4b 100755
+--- a/tools/testing/selftests/net/forwarding/tc_actions.sh
++++ b/tools/testing/selftests/net/forwarding/tc_actions.sh
+@@ -3,7 +3,8 @@
+
+ ALL_TESTS="gact_drop_and_ok_test mirred_egress_redirect_test \
+ mirred_egress_mirror_test matchall_mirred_egress_mirror_test \
+- gact_trap_test mirred_egress_to_ingress_test"
++ gact_trap_test mirred_egress_to_ingress_test \
++ mirred_egress_to_ingress_tcp_test"
+ NUM_NETIFS=4
+ source tc_common.sh
+ source lib.sh
+@@ -198,6 +199,52 @@ mirred_egress_to_ingress_test()
+ log_test "mirred_egress_to_ingress ($tcflags)"
+ }
+
++mirred_egress_to_ingress_tcp_test()
++{
++ local tmpfile=$(mktemp) tmpfile1=$(mktemp)
++
++ RET=0
++ dd conv=sparse status=none if=/dev/zero bs=1M count=2 of=$tmpfile
++ tc filter add dev $h1 protocol ip pref 100 handle 100 egress flower \
++ $tcflags ip_proto tcp src_ip 192.0.2.1 dst_ip 192.0.2.2 \
++ action ct commit nat src addr 192.0.2.2 pipe \
++ action ct clear pipe \
++ action ct commit nat dst addr 192.0.2.1 pipe \
++ action ct clear pipe \
++ action skbedit ptype host pipe \
++ action mirred ingress redirect dev $h1
++ tc filter add dev $h1 protocol ip pref 101 handle 101 egress flower \
++ $tcflags ip_proto icmp \
++ action mirred ingress redirect dev $h1
++ tc filter add dev $h1 protocol ip pref 102 handle 102 ingress flower \
++ ip_proto icmp \
++ action drop
++
++ ip vrf exec v$h1 nc --recv-only -w10 -l -p 12345 -o $tmpfile1 &
++ local rpid=$!
++ ip vrf exec v$h1 nc -w1 --send-only 192.0.2.2 12345 <$tmpfile
++ wait -n $rpid
++ cmp -s $tmpfile $tmpfile1
++ check_err $? "server output check failed"
++
++ $MZ $h1 -c 10 -p 64 -a $h1mac -b $h1mac -A 192.0.2.1 -B 192.0.2.1 \
++ -t icmp "ping,id=42,seq=5" -q
++ tc_check_packets "dev $h1 egress" 101 10
++ check_err $? "didn't mirred redirect ICMP"
++ tc_check_packets "dev $h1 ingress" 102 10
++ check_err $? "didn't drop mirred ICMP"
++ local overlimits=$(tc_rule_stats_get ${h1} 101 egress .overlimits)
++ test ${overlimits} = 10
++ check_err $? "wrong overlimits, expected 10 got ${overlimits}"
++
++ tc filter del dev $h1 egress protocol ip pref 100 handle 100 flower
++ tc filter del dev $h1 egress protocol ip pref 101 handle 101 flower
++ tc filter del dev $h1 ingress protocol ip pref 102 handle 102 flower
++
++ rm -f $tmpfile $tmpfile1
++ log_test "mirred_egress_to_ingress_tcp ($tcflags)"
++}
++
+ setup_prepare()
+ {
+ h1=${NETIFS[p1]}
+--
+2.25.1
+
diff --git a/kernel.spec b/kernel.spec
index 0eb5b16..f4bc7a4 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -10,9 +10,9 @@
%global upstream_version 6.1
%global upstream_sublevel 19
-%global devel_release 6
+%global devel_release 7
%global maintenance_release .0.0
-%global pkg_release .16
+%global pkg_release .17
%define with_debuginfo 0
# Do not recompute the build-id of vmlinux in find-debuginfo.sh
@@ -84,6 +84,16 @@ Patch0011: 0011-bpf-Two-helper-functions-are-introduced-to-parse-use.patch
Patch0012: 0012-net-bpf-Add-a-writeable_tracepoint-to-inet_stream_co.patch
Patch0013: 0013-nfs-client-multipath.patch
Patch0014: 0014-nfs-client-multipath-config.patch
+Patch0015: 0015-mm-demotion-fix-NULL-vs-IS_ERR-checking-in-memory_ti.patch
+Patch0016: 0016-x86-mm-Randomize-per-cpu-entry-area.patch
+Patch0017: 0017-x86-kasan-Map-shadow-for-percpu-pages-on-demand.patch
+Patch0018: 0018-x86-mm-Recompute-physical-address-for-every-page-of-.patch
+Patch0019: 0019-x86-mm-Populate-KASAN-shadow-for-entire-per-CPU-rang.patch
+Patch0020: 0020-x86-kasan-Rename-local-CPU_ENTRY_AREA-variables-to-s.patch
+Patch0021: 0021-x86-kasan-Add-helpers-to-align-shadow-addresses-up-a.patch
+Patch0022: 0022-x86-kasan-Populate-shadow-for-shared-chunk-of-the-CP.patch
+Patch0023: 0023-net-sched-act_mirred-better-wording-on-protection-ag.patch
+Patch0024: 0024-act_mirred-use-the-backlog-for-nested-calls-to-mirre.patch
#BuildRequires:
@@ -323,6 +333,16 @@ Applypatches series.conf %{_builddir}/kernel-%{version}/linux-%{KernelVer}
%patch0012 -p1
%patch0013 -p1
%patch0014 -p1
+%patch0015 -p1
+%patch0016 -p1
+%patch0017 -p1
+%patch0018 -p1
+%patch0019 -p1
+%patch0020 -p1
+%patch0021 -p1
+%patch0022 -p1
+%patch0023 -p1
+%patch0024 -p1
touch .scmversion
find . \( -name "*.orig" -o -name "*~" \) -exec rm -f {} \; >/dev/null
@@ -905,6 +925,9 @@ fi
%endif
%changelog
+* Fri Mar 18 2023 Jialin Zhang <zhangjialin11(a)huawei.com> - 6.1.19-7.0.0.17
+- Fix CVE-2023-23005, CVE-2023-0597 and CVE-2022-4269
+
* Fri Mar 17 2023 Zheng Zengkai <zhengzengkai(a)huawei.com> - 6.1.19-6.0.0.16
- Fix kernel rpm build failure that libperf-jvmti.so is missing
--
2.25.1

[PATCH PR openEuler-22.03-LTS-SP1] mm/vmalloc: huge vmalloc backing pages should be split rather than compound
by Jialin Zhang 17 Mar '23
From: Nicholas Piggin <npiggin(a)gmail.com>
mainline inclusion
from mainline-v5.18-rc4
commit 3b8000ae185cb068adbda5f966a3835053c85fd4
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6LD0S
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
Huge vmalloc higher-order backing pages were allocated with __GFP_COMP
in order to allow the sub-pages to be refcounted by callers such as
"remap_vmalloc_page [sic]" (remap_vmalloc_range).
However a similar problem exists for other struct page fields callers
use, for example fb_deferred_io_fault() takes a vmalloc'ed page and
not only refcounts it but uses ->lru, ->mapping, ->index.
This is not compatible with compound sub-pages, and can cause bad page
state issues like
BUG: Bad page state in process swapper/0 pfn:00743
page:(____ptrval____) refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x743
flags: 0x7ffff000000000(node=0|zone=0|lastcpupid=0x7ffff)
raw: 007ffff000000000 c00c00000001d0c8 c00c00000001d0c8 0000000000000000
raw: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: corrupted mapping in tail page
Modules linked in:
CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.18.0-rc3-00082-gfc6fff4a7ce1-dirty #2810
Call Trace:
dump_stack_lvl+0x74/0xa8 (unreliable)
bad_page+0x12c/0x170
free_tail_pages_check+0xe8/0x190
free_pcp_prepare+0x31c/0x4e0
free_unref_page+0x40/0x1b0
__vunmap+0x1d8/0x420
...
The correct approach is to use split high-order pages for the huge
vmalloc backing. These allow callers to treat them in exactly the same
way as individually-allocated order-0 pages.
Link: https://lore.kernel.org/all/14444103-d51b-0fb3-ee63-c3f182f0b546@molgen.mpg…
Signed-off-by: Nicholas Piggin <npiggin(a)gmail.com>
Cc: Paul Menzel <pmenzel(a)molgen.mpg.de>
Cc: Song Liu <songliubraving(a)fb.com>
Cc: Rick Edgecombe <rick.p.edgecombe(a)intel.com>
Signed-off-by: Linus Torvalds <torvalds(a)linux-foundation.org>
conflicts:
mm/vmalloc.c
Signed-off-by: ZhangPeng <zhangpeng362(a)huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com>
Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com>
---
mm/vmalloc.c | 22 +++++++++++++++++-----
1 file changed, 17 insertions(+), 5 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index e27cd716ca95..2ca2c1bc0db9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2641,14 +2641,17 @@ static void __vunmap(const void *addr, int deallocate_pages)
vm_remove_mappings(area, deallocate_pages);
if (deallocate_pages) {
- unsigned int page_order = vm_area_page_order(area);
int i;
- for (i = 0; i < area->nr_pages; i += 1U << page_order) {
+ for (i = 0; i < area->nr_pages; i++) {
struct page *page = area->pages[i];
BUG_ON(!page);
- __free_pages(page, page_order);
+ /*
+ * High-order allocs for huge vmallocs are split, so
+ * can be freed as an array of order-0 allocations
+ */
+ __free_pages(page, 0);
}
atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
@@ -2930,8 +2933,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
int p;
- /* Compound pages required for remap_vmalloc_page */
- page = alloc_pages_node(node, gfp_mask | __GFP_COMP, page_order);
+ page = alloc_pages_node(node, gfp_mask, page_order);
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vfree() */
area->nr_pages = i;
@@ -2943,6 +2945,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
goto fail;
}
+ /*
+ * Higher order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
+ * small-page vmallocs). Some drivers do their own refcounting
+ * on vmalloc_to_page() pages, some use page->mapping,
+ * page->lru, etc.
+ */
+ if (page_order)
+ split_page(page, page_order);
+
for (p = 0; p < (1U << page_order); p++)
area->pages[i + p] = page + p;
--
2.25.1
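The key API in the fix above is split_page(), which turns an order-N allocation into 2^N independently refcounted order-0 pages, so callers can use page->mapping, page->lru and per-page refcounts just as they would for individually allocated pages. A kernel-style sketch of the pattern (hypothetical demo function, error handling trimmed, not the vmalloc code itself):

/*
 * Kernel-style sketch: allocate an order-2 block, split it so each of
 * the four pages has its own refcount and struct page fields, then
 * free them one by one as ordinary order-0 pages.
 */
#include <linux/gfp.h>
#include <linux/mm.h>

static int demo_split_alloc(void)
{
	struct page *page = alloc_pages(GFP_KERNEL, 2);
	int i;

	if (!page)
		return -ENOMEM;

	split_page(page, 2);		/* now 4 independent order-0 pages */

	for (i = 0; i < 4; i++)
		__free_pages(page + i, 0);

	return 0;
}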