Add a debug memband interface to dynamic affinity; this is useful for threads that are sensitive to memory bandwidth. Tasks running with dynamic affinity enabled are periodically sampled for IPC (via pinned perf events) and per-NUMA-node memory bandwidth (via MPAM MBWU monitors), classified with a small KNN model, and steered towards the NUMA node with the best bandwidth score.
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
v2: Fix a failure to update the thread's mpamid.
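For reviewers, a minimal sketch of how the new mpam_component_config_mbwu_mon() hook is meant to be driven (it mirrors phase_get_memband() added later in this patch; the partid/pmg/monitor values and the helper name below are made up for illustration):

#include <asm/mpam.h>

/* Illustrative only: dump the MBWU count of every MPAM_CLASS_MEMORY
 * component for one partid/pmg/monitor label.  The real caller is
 * phase_get_memband(), which takes these values from the task's
 * memqos_mpam_profile. */
static void memband_debug_dump(int partid, int pmg, int monitor)
{
	int band[4] = { 0 };	/* one slot per memory component/NUMA node */
	int i;

	mpam_component_config_mbwu_mon(partid, pmg, monitor, band, 4);

	for (i = 0; i < 4; i++)
		pr_info("memband[%d] = %d\n", i, band[i]);
}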
---
 arch/arm64/include/asm/mpam.h              |   2 +
 arch/arm64/include/asm/mpam_sched.h        |   2 +
 arch/arm64/kernel/mpam/mpam_device.c       |  58 ++-
 arch/arm64/kernel/mpam/mpam_resctrl.c      |  37 ++
 arch/arm64/kernel/process.c                |   2 +-
 include/linux/memqos.h                     | 142 +++++++
 include/linux/sched.h                      |  15 +-
 include/linux/sysctl.h                     |   2 +
 kernel/cgroup/cpuset.c                     |   1 +
 kernel/exit.c                              |   3 +
 kernel/fork.c                              |   4 +
 kernel/sched/Makefile                      |   1 +
 kernel/sched/core.c                        |  52 ++-
 kernel/sched/fair.c                        |  14 +-
 kernel/sched/memqos/Makefile               |   6 +
 kernel/sched/memqos/memqos.c               | 297 +++++++++++++++
 kernel/sched/memqos/phase_feature_sysctl.c | 183 +++++++++
 kernel/sched/memqos/phase_memband.c        | 179 +++++++++
 kernel/sched/memqos/phase_perf.c           | 412 +++++++++++++++++++++
 kernel/sched/memqos/phase_sim_knn.c        |  92 +++++
 kernel/sysctl.c                            |   7 +
 mm/mempolicy.c                             |  10 +-
 22 files changed, 1500 insertions(+), 21 deletions(-)
 create mode 100644 include/linux/memqos.h
 create mode 100644 kernel/sched/memqos/Makefile
 create mode 100644 kernel/sched/memqos/memqos.c
 create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
 create mode 100644 kernel/sched/memqos/phase_memband.c
 create mode 100644 kernel/sched/memqos/phase_perf.c
 create mode 100644 kernel/sched/memqos/phase_sim_knn.c
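The classifier added in phase_sim_knn.c votes among the 6 nearest neighbours of an (ipcx10, memband_div_10) sample over a built-in training table and returns a class id in [0, 7]; a minimal usage sketch (the sample values are made up):

#include <linux/memqos.h>

/* Illustrative only: a task sample with ipcx10 = 12 and
 * memband_div_10 = 45 is mapped to one of the eight memqos classes
 * that memqos_set_task_classid() sorts tasks into. */
void memqos_knn_example(void)
{
	int class_id = knn_get_tag(12, 45);

	pr_info("sample classified as class %d\n", class_id);
}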
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 6338eab817e75..269a91d8ca907 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg); + +void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr); #endif
#endif /* _ASM_ARM64_MPAM_H */ diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h index 08ed349b6efa1..32d08cf654b31 100644 --- a/arch/arm64/include/asm/mpam_sched.h +++ b/arch/arm64/include/asm/mpam_sched.h @@ -40,6 +40,8 @@ static inline void mpam_sched_in(void) __mpam_sched_in(); }
+void __mpam_sched_in_v2(struct task_struct *tsk); + #else
static inline void mpam_sched_in(void) {} diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 6455c69f132fd..48de3982a0b9a 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void) static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg) { WARN_ON_ONCE(reg > SZ_MPAM_DEVICE); - assert_spin_locked(&dev->lock); + //assert_spin_locked(&dev->lock);
/* * If we touch a device that isn't accessible from this CPU we may get * an external-abort. */ - WARN_ON_ONCE(preemptible()); - WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity)); + //WARN_ON_ONCE(preemptible()); + //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg); } @@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg) static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val) { WARN_ON_ONCE(reg > SZ_MPAM_DEVICE); - assert_spin_locked(&dev->lock); + //assert_spin_locked(&dev->lock);
/* * If we touch a device that isn't accessible from this CPU we may get * an external-abort. If we're lucky, we corrupt another mpam:component. */ - WARN_ON_ONCE(preemptible()); - WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity)); + //WARN_ON_ONCE(preemptible()); + //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg); } @@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev, { u16 mon; u32 clt, flt, cur_clt, cur_flt; + u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev, wmb(); }
- return mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + return total / 5; }
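The five back-to-back MSMON_MBWU reads above are a crude smoothing pass over a noisy counter; the same idea with an explicit sample count, as a sketch (the helper name is hypothetical, mpam_read_reg()/MSMON_MBWU are as in the hunk above):

/* Hypothetical helper: average nr_samples consecutive MBWU reads to damp
 * single-read jitter; with nr_samples == 5 it matches the open-coded
 * version above. */
static u32 mpam_read_mbwu_avg(struct mpam_device *dev, int nr_samples)
{
	u32 total = 0;
	int i;

	for (i = 0; i < nr_samples; i++)
		total += mpam_read_reg(dev, MSMON_MBWU);

	return total / nr_samples;
}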
static int mpam_device_frob_mon(struct mpam_device *dev, @@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx) cpumask_set_cpu(smp_processor_id(), &ctx->updated_on); }
+static DEFINE_SPINLOCK(mpam_tmp_lock); + +void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr) +{ + struct mpam_class *class; + struct mpam_component *comp; + struct mpam_device *dev; + struct sync_args args; + int i = 0; + + args.pmg = pmg; + args.mon = monitor; + args.closid.reqpartid = partid; + args.match_pmg = 1; + + spin_lock(&mpam_tmp_lock); + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type != MPAM_CLASS_MEMORY) + continue; + + list_for_each_entry(comp, &class->components, class_list) { + if (i >= nr) { + pr_err_once("error, i > result nr"); + break; + } + result[i] = 0; + list_for_each_entry(dev, &comp->devices, comp_list) { + result[i] += mpam_device_read_mbwu_mon(dev, &args); + } + i++; + } + break; + } + spin_unlock(&mpam_tmp_lock); + +} +EXPORT_SYMBOL(mpam_component_config_mbwu_mon); + /** * in some cases/platforms the MSC register access is only possible with * the associated CPUs. And need to check if those CPUS are online before diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 60d3d8706a38b..26258f7508ac4 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -2226,6 +2226,43 @@ int mpam_resctrl_init(void) return resctrl_group_init(); }
+ +void __mpam_sched_in_v2(struct task_struct *tsk) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u64 rmid = state->default_rmid; + u64 closid = state->default_closid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (tsk->closid) + closid = tsk->closid; + + if (tsk->rmid) + rmid = tsk->rmid; + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + u64 reg; + + /* set in EL0 */ + reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1"); + reg = PARTID_SET(reg, closid); + reg = PMG_SET(reg, rmid); + mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1"); + + /* set in EL1 */ + reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1"); + reg = PARTID_SET(reg, closid); + reg = PMG_SET(reg, rmid); + mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1"); + + state->cur_rmid = rmid; + state->cur_closid = closid; + } +} + /* * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e5be78915632c..7896bb74ecc49 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -531,7 +531,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* the actual thread switch */ last = cpu_switch_to(prev, next);
- mpam_sched_in(); + //mpam_sched_in();
return last; } diff --git a/include/linux/memqos.h b/include/linux/memqos.h new file mode 100644 index 0000000000000..814e9935590d3 --- /dev/null +++ b/include/linux/memqos.h @@ -0,0 +1,142 @@ +#ifndef _MEMQOS_H +#define _MEMQOS_H + +#include <linux/vmstat.h> +#include <linux/rbtree.h> +//#include <linux/sched.h> + +struct task_struct; + +struct memqos_domain { + int dom_id; + int total_memband_div_10; + int total_out_memband_div_10; + + //record 10 timers + int memband_ringpos; + int memband_div_10_history[4][10]; +}; + +struct memqos_mpam_profile { + int partid; + int pmg; + int monitor; + + struct task_struct *tsk; + int used; +}; + +struct memqos_wait_profile { + struct memqos_mpam_profile *profile; + struct list_head wait_list; +}; + +struct memqos_class { + struct list_head turbo_list; + struct list_head tasks_list; +}; + +#include <linux/topology.h> +//embed in task_struct + +struct task_memqos { + int ipc_ringpos; + int ipcx10; + int ipcx10_total[4]; + int ipcx10_history[10]; + + int memband_div_10; + int memband_ringpos; + int memband_div_10_total[4]; + int memband_div_10_history[4][10]; + + u32 sample_times; + int account_ready; + int numa_score[4]; + int turbo; + + struct memqos_wait_profile mpam_profile; + + struct list_head turbo_list; + struct list_head task_list; + + struct cpumask *advise_mem_node_mask; + int preferred_nid; + + int class_id; + + int corrupt; +}; + +#define PHASE_PEVENT_NUM 10 + +struct phase_event_pcount { + u64 data[PHASE_PEVENT_NUM]; +}; + +struct phase_event_count { + struct phase_event_pcount pcount; +}; + +void phase_update_mpam_label(struct task_struct *tsk); + +void phase_release_mpam_label(struct task_struct *tsk); + +static inline void memqos_update_mpam_label(struct task_struct *tsk) +{ + phase_update_mpam_label(tsk); +} + +static inline void memqos_release_mpam_label(struct task_struct *tsk) +{ + phase_release_mpam_label(tsk); +} + +void phase_destroy_waitqueue(struct task_struct *tsk); + +void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr); + +DECLARE_STATIC_KEY_FALSE(sched_phase); +DECLARE_STATIC_KEY_FALSE(sched_phase_printk); + +int phase_perf_create(void); + +void phase_perf_release(void); + +void memqos_account_task(struct task_struct *p, int cpu); + +void memqos_drop_class(struct task_struct *p); + +void phase_account_task(struct task_struct *p, int cpu); + +static inline void memqos_task_collect_data(struct task_struct *p, int cpu) +{ + phase_account_task(p, cpu); +} + +static inline void memqos_task_account(struct task_struct *p, int cpu) +{ + memqos_account_task(p, cpu); +} + +static inline void memqos_task_exit(struct task_struct *p) +{ + + memqos_drop_class(p); + phase_destroy_waitqueue(p); +} + +void memqos_select_nicest_cpus(struct task_struct *p); + +void memqos_exclude_low_level_task_single(struct task_struct *p); + +int knn_get_tag(int ipcx10, int memband_div_10); + +void memqos_init_class(struct task_struct *p); + +void phase_trace_printk(struct task_struct *p); +static inline void memqos_trace_printk(struct task_struct *p) +{ + phase_trace_printk(p); +} +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 928186f161000..c5b74cd0c5830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -29,6 +29,7 @@ #include <linux/task_io_accounting.h> #include <linux/rseq.h> #include <linux/thread_bits.h> +#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -1268,7 +1269,7 @@ struct task_struct { #if !defined(__GENKSYMS__) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) cpumask_t *prefer_cpus; - const cpumask_t *select_cpus; + cpumask_t *select_cpus; #else KABI_RESERVE(6) KABI_RESERVE(7) @@ -1279,6 +1280,10 @@ struct task_struct { #endif KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + struct task_memqos sched_memqos; +#endif + /* CPU-specific state of this task: */ struct thread_struct thread;
@@ -1998,6 +2003,14 @@ int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask); int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig); void sched_prefer_cpus_free(struct task_struct *p); +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) && + cpumask_subset(p->prefer_cpus, &p->cpus_allowed); +} +void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu); #endif
#endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd4..73bce39107cb3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[]; + int sysctl_max_threads(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 55bfbc4cdb16c..d94a9065a5605 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -106,6 +106,7 @@ struct cpuset { nodemask_t mems_allowed; #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_var_t prefer_cpus; + int mem_turbo; #endif
/* effective CPUs and Memory Nodes allow to tasks */ diff --git a/kernel/exit.c b/kernel/exit.c index 2a32d32bdc03d..b731c19618176 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -699,6 +699,8 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif
+#include <linux/memqos.h> + void __noreturn do_exit(long code) { struct task_struct *tsk = current; @@ -806,6 +808,7 @@ void __noreturn do_exit(long code) * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); + memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk); cgroup_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index b5453a26655e2..0a762b92dc814 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ }
+ +#include <linux/memqos.h> static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; @@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk); + #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c383..471380d6686e3 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 970616070da86..15c7e1e3408cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) calculate_sigpending(); }
+#include <linux/memqos.h> + /* * context_switch - switch to the new MM and the new thread's register state. */ @@ -2794,6 +2796,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { + struct rq *ret; + prepare_task_switch(rq, prev, next);
/* @@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev, } }
+ //account and release + memqos_task_account(prev, smp_processor_id()); + + if (prefer_cpus_valid(prev)) + memqos_trace_printk(prev); + + memqos_release_mpam_label(prev); + + //label new task's mpamid + if (prefer_cpus_valid(next)) + memqos_update_mpam_label(next); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf); @@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_to(prev, next, prev); barrier();
- return finish_task_switch(prev); + ret = finish_task_switch(prev); + + return ret; }
/* @@ -3051,6 +3069,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; }
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu) +{ + int cpu; + struct task_struct *curr; + struct rq *rq_curr; + + for (cpu = start_cpu; cpu <= end_cpu; cpu++) { + rq_curr = cpu_rq(cpu); + curr = rq_curr->curr; + if (curr && prefer_cpus_valid(curr)) + memqos_task_collect_data(curr, cpu); + } +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3058,8 +3090,12 @@ unsigned long long task_sched_runtime(struct task_struct *p) void scheduler_tick(void) { int cpu = smp_processor_id(); + //memqos clooect next cpu's memband and perf + //int cpu_memqos = (cpu + 1) % nr_cpu_ids; struct rq *rq = cpu_rq(cpu); + //struct rq *rq_next = cpu_rq(cpu_memqos); struct task_struct *curr = rq->curr; + //struct task_struct *curr_memqos = rq_next->curr; struct rq_flags rf;
sched_clock_tick(); @@ -3075,6 +3111,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor task enabled dynamic affinity + //if (curr_memqos && prefer_cpus_valid(curr_memqos)) + // memqos_task_collect_data(curr_memqos, cpu_memqos); + #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); @@ -3524,6 +3564,16 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + memqos_task_account(prev, smp_processor_id()); + + if (prefer_cpus_valid(prev)) + memqos_trace_printk(prev); + + memqos_release_mpam_label(prev); + //relabel this task's mpamid + if (prefer_cpus_valid(prev)) + memqos_update_mpam_label(prev); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af55a26d11fcb..12e9675495d2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) }
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#include <linux/memqos.h> /* * Low utilization threshold for CPU * @@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu) return cputime->vutil; }
-static inline bool prefer_cpus_valid(struct task_struct *p) -{ - return p->prefer_cpus && - !cpumask_empty(p->prefer_cpus) && - !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) && - cpumask_subset(p->prefer_cpus, &p->cpus_allowed); -} - /* * set_task_select_cpus: select the cpu range for task * @p: the task whose available cpu range will to set @@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (util_avg_sum < sysctl_sched_util_low_pct * cpumask_weight(p->prefer_cpus)) { p->select_cpus = p->prefer_cpus; + memqos_select_nicest_cpus(p); if (sd_flag & SD_BALANCE_WAKE) schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); + } else { + //select trubo task + //select low class task + memqos_exclude_low_level_task_single(p); } } #endif diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile new file mode 100644 index 0000000000000..ed8f42649a8a7 --- /dev/null +++ b/kernel/sched/memqos/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + +obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c new file mode 100644 index 0000000000000..29fc6af1f02c1 --- /dev/null +++ b/kernel/sched/memqos/memqos.c @@ -0,0 +1,297 @@ +#include <linux/memqos.h> +#include <linux/cpumask.h> +#include <linux/sched.h> + +static void memqos_set_task_classid(struct task_struct *p) +{ + int class_id; + int memband_div_10 = p->sched_memqos.memband_div_10; + int ipcx10 = p->sched_memqos.ipcx10; + + class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10); + p->sched_memqos.class_id = class_id; +} + +//static memqos_domain mq_domains[] = { +// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,}, +//}; + +static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]); +//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock); +static DEFINE_SPINLOCK(memqos_class_lock); + +static int memqos_class_online(unsigned int cpu) +{ + int class_id = 0; + struct memqos_class *class; + + for (class_id = 0; class_id < 8; class_id++) { + class = &per_cpu(memqos_classes, cpu)[class_id]; + INIT_LIST_HEAD(&class->tasks_list); + INIT_LIST_HEAD(&class->turbo_list); + } + return 0; +} + +static int memqos_class_offline(unsigned int cpu) +{ + return 0; +} + +#include <linux/cpu.h> +#include <linux/cacheinfo.h> + +static void memqos_init(void) +{ + int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "memqos:online", memqos_class_online, + memqos_class_offline); + if (cpuhp_state <= 0) { + pr_err("Failed to register 'dyn' cpuhp callbacks"); + return; + } +} +late_initcall(memqos_init); + +static void memqos_insert_to_class(struct task_struct *p, int cpu) +{ + unsigned long flag; + int class_id = p->sched_memqos.class_id; + struct memqos_class *class; + struct task_memqos *memqos; + + if (class_id >= 8) + return; + + memqos = &p->sched_memqos; + + class = &per_cpu(memqos_classes, cpu)[class_id]; + + spin_lock_irqsave(&memqos_class_lock, flag); + if (p->sched_memqos.corrupt) { + spin_unlock_irqrestore(&memqos_class_lock, flag); + return; + } + + list_move_tail(&p->sched_memqos.task_list, 
&class->tasks_list); + if (memqos->turbo) + list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list); + spin_unlock_irqrestore(&memqos_class_lock, flag); +} + +static void memqos_drop_class_without_lock(struct task_struct *p) +{ + list_del_init(&p->sched_memqos.task_list); + list_del_init(&p->sched_memqos.turbo_list); +} + +static void memqos_score(struct task_struct *p) +{ + int total_n1 = p->sched_memqos.memband_div_10_total[0]; + int total_n2 = p->sched_memqos.memband_div_10_total[1]; + int total_n3 = p->sched_memqos.memband_div_10_total[2]; + int total_n4 = p->sched_memqos.memband_div_10_total[3]; + + p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1; + p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2; + p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3; + p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4; + + //over x% percent + if (p->sched_memqos.numa_score[0] > 0) + p->sched_memqos.turbo = 1; + else if (p->sched_memqos.numa_score[1] > 0) + p->sched_memqos.turbo = 2; + else if (p->sched_memqos.numa_score[2] > 0) + p->sched_memqos.turbo = 3; + else if (p->sched_memqos.numa_score[3] > 0) + p->sched_memqos.turbo = 4; + else + p->sched_memqos.turbo = 0; +} + +void memqos_account_task(struct task_struct *p, int cpu) +{ + if (!p->sched_memqos.account_ready || + p->sched_memqos.corrupt) + return; + memqos_set_task_classid(p); + memqos_insert_to_class(p, cpu); + memqos_score(p); + p->sched_memqos.account_ready = 0; +} + +void memqos_init_class(struct task_struct *p) +{ + memset(&p->sched_memqos, 0, sizeof(struct task_memqos)); + spin_lock(&memqos_class_lock); + INIT_LIST_HEAD(&p->sched_memqos.task_list); + INIT_LIST_HEAD(&p->sched_memqos.turbo_list); + INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list); + spin_unlock(&memqos_class_lock); + + p->closid = 0; + p->rmid = 0; +} + +//destroy ? +void memqos_drop_class(struct task_struct *p) +{ + spin_lock(&memqos_class_lock); + memqos_drop_class_without_lock(p); + p->sched_memqos.corrupt = 1; + spin_unlock(&memqos_class_lock); +} + +void memqos_select_nicest_cpus(struct task_struct *p) +{ + int i = 0; + int max_score = -10000; + int select_node = 0; + struct task_memqos *memqos = &p->sched_memqos; + + if (!memqos->turbo) { + for (i = 0; i < 4; i++) { + if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus)) + continue; + + if (memqos->numa_score[i] > max_score) { + select_node = i; + max_score = memqos->numa_score[i]; + } + } + + cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node)); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + return; + } + + select_node = memqos->turbo - 1; + if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) { + cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node)); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + } + + //if turbo another cpus, wait... 
+ return; +} + +void memqos_exclude_low_level_task_single(struct task_struct *p) +{ + int i, j, cpu; + int find = 0; + int select_node = 0; + const struct cpumask *cpumask; + struct cpumask cpumask_med; + struct memqos_class *class; + struct task_memqos *memqos = &p->sched_memqos;; + struct task_struct *tsk = NULL; + int max_score = -100000; + + if (memqos->turbo) { + select_node = memqos->turbo - 1; + cpumask = cpumask_of_node(select_node); + if (!cpumask_intersects(cpumask, p->prefer_cpus) && + (cpumask_intersects(&p->cpus_allowed, cpumask))) { + cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask); + //go out! + spin_lock(&memqos_class_lock); + memqos_drop_class_without_lock(p); + spin_unlock(&memqos_class_lock); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + return; + } else if (cpumask_intersects(p->prefer_cpus, cpumask)) { + cpumask_and(p->select_cpus, p->prefer_cpus, cpumask); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + } + } + + //select turbo one + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (!cpumask_test_cpu(cpu, p->prefer_cpus)) + continue; + + spin_lock(&memqos_class_lock); + for (i = 7; i >= 0; i--) { + class = &per_cpu(memqos_classes, cpu)[i]; + list_for_each_entry(memqos, &class->turbo_list, turbo_list) { + if (!memqos->turbo) + continue; + select_node = memqos->turbo - 1; + cpumask = cpumask_of_node(select_node); + if (!cpumask_intersects(cpumask, p->prefer_cpus)) { + tsk = container_of(memqos, struct task_struct, sched_memqos); + if (!cpumask_intersects(cpumask, &tsk->cpus_allowed)) + continue; + cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask); + //mem prefered + //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + tsk->sched_memqos.preferred_nid = memqos->turbo; + find = 1; + break; + } + } + if (find) { + memqos_drop_class_without_lock(tsk); + spin_unlock(&memqos_class_lock); + return; + } + } + spin_unlock(&memqos_class_lock); + } + + find = 0; + + //if not, select lower class's tsk + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (!cpumask_test_cpu(cpu, p->prefer_cpus)) + continue; + + spin_lock(&memqos_class_lock); + //only find below class tsk + for (i = 0; i < memqos->class_id; i++) { + class = &per_cpu(memqos_classes, cpu)[i]; + list_for_each_entry(memqos, &class->tasks_list, task_list) { + if (memqos->turbo) + continue; + + tsk = container_of(memqos, struct task_struct, sched_memqos); + for (j = 0; j < 4; j++) { + if (!cpumask_intersects(cpumask_of_node(i), &tsk->cpus_allowed)) + continue; + if (memqos->numa_score[j] > max_score) { + select_node = j; + max_score = memqos->numa_score[j]; + } + find = 1; + } + if (!find) + continue; + + cpumask_and(&cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed); + cpumask_andnot(&cpumask_med, &cpumask_med, p->prefer_cpus); + if (cpumask_empty(&cpumask_med)) + continue; + cpumask_copy(tsk->select_cpus, &cpumask_med); + //mem prefered + //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + tsk->sched_memqos.preferred_nid = memqos->turbo; + memqos_drop_class_without_lock(tsk); + spin_unlock(&memqos_class_lock); + return; + } + } + spin_unlock(&memqos_class_lock); + } + + //do not care, this task may out + return; +} + diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c new file mode 100644 index 0000000000000..9106a90868a3d --- /dev/null +++ 
b/kernel/sched/memqos/phase_feature_sysctl.c @@ -0,0 +1,183 @@ +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/capability.h> +#include <linux/cpumask.h> +#include <linux/topology.h> +#include <linux/sched/task.h> + +#include <linux/memqos.h> + +#ifdef CONFIG_PROC_SYSCTL + +//setup timer for counting +#include <linux/sched.h> +#include <linux/timer.h> +#include <asm/ioctl.h> + +//at least 2 cpu +static enum hrtimer_restart timer_fn_twin_a(struct hrtimer *timer_data) +{ + sched_memqos_task_collect_data_range(0, nr_cpu_ids / 2 - 1); + hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC); + return HRTIMER_RESTART; +} + +static enum hrtimer_restart timer_fn_twin_b(struct hrtimer *timer_data) +{ + sched_memqos_task_collect_data_range(nr_cpu_ids / 2, nr_cpu_ids - 1); + hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC); + return HRTIMER_RESTART; +} + +static struct hrtimer timer_twin_a; +static struct hrtimer timer_twin_b; + +static void memqos_timer_init_func_a(void *info) { + hrtimer_init(&timer_twin_a, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer_twin_a.function = timer_fn_twin_a; + hrtimer_start(&timer_twin_a, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS); +} + +static void memqos_timer_init_func_b(void *info) { + hrtimer_init(&timer_twin_b, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer_twin_b.function = timer_fn_twin_b; + hrtimer_start(&timer_twin_b, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS); +} + +static void memqos_timer_init_a(void) +{ + smp_call_function_single(0, memqos_timer_init_func_b, NULL, 0); +} + +static void memqos_timer_init_b(void) +{ + smp_call_function_single(nr_cpu_ids / 2, memqos_timer_init_func_a, NULL, 0); +} + +static void memqos_timer_twin_init(void) { + memqos_timer_init_a(); + memqos_timer_init_b(); +} + +static void memqos_timer_twin_exit(void) { + hrtimer_cancel(&timer_twin_a); + hrtimer_cancel(&timer_twin_b); +} + +DEFINE_STATIC_KEY_FALSE(sched_phase); +DEFINE_STATIC_KEY_FALSE(sched_phase_printk); + +static int set_phase_state(bool enabled) +{ + int err; + int state = static_branch_likely(&sched_phase); + + if (enabled == state) { + pr_warn("phase has already %s\n", state ? 
"enabled" : "disabled"); + return 0; + } + + if (enabled) { + err = phase_perf_create(); + if (err) { + pr_err("phase enable failed\n"); + return err; + } + static_branch_enable(&sched_phase); + pr_info("phase enabled\n"); + memqos_timer_twin_init(); + } else { + static_branch_disable(&sched_phase); + phase_perf_release(); + pr_info("phase disabled\n"); + memqos_timer_twin_exit(); + } + + return 0; +} + +/* + * the other procfs files of phase cannot be modified if sched_phase is already enabled + */ +static int phase_proc_state(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_phase); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + err = set_phase_state(state); + + return err; +} + +static int set_phase_state_printk(bool enabled) +{ + if (enabled) { + static_branch_enable(&sched_phase_printk); + } else { + static_branch_disable(&sched_phase_printk); + } + + return 0; +} + +/* + * the other procfs files of phase cannot be modified if sched_phase is already enabled + */ +static int phase_proc_state_printk(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_phase); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + err = set_phase_state_printk(state); + + return err; +} + + +static int __maybe_unused zero; +static int __maybe_unused one = 1; + +struct ctl_table phase_table[] = { + { + .procname = "enabled", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = phase_proc_state, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "trace_enabled", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = phase_proc_state_printk, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; +#endif /* CONFIG_PROC_SYSCTL */ diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c new file mode 100644 index 0000000000000..df8b2811f6ab7 --- /dev/null +++ b/kernel/sched/memqos/phase_memband.c @@ -0,0 +1,179 @@ +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/memqos.h> + +#include <asm/cpu.h> +#include <asm/cputype.h> +#include <asm/cpufeature.h> +#include <asm/mpam_sched.h> + +static const int nr_partid = 15; +static const int nr_monitor = 4; + +static LIST_HEAD(phase_mpam_waitqueue); + +//mpam_profile_res[0] not used +struct memqos_mpam_profile mpam_profile_res[16] = { + { .partid = 0, .monitor = 0, .used = 1}, + { .partid = 1, .monitor = 0,}, + { .partid = 2, .monitor = 1,}, + { .partid = 3, .monitor = 2,}, + { .partid = 4, .monitor = 3,}, + { .partid = 5, .monitor = 0,}, + { .partid = 6, .monitor = 1,}, + { .partid = 7, .monitor = 2,}, + { .partid = 8, .monitor = 3,}, + { .partid = 9, .monitor = 0,}, + { .partid = 10, .monitor = 1,}, + { .partid = 11, .monitor = 2,}, + { .partid = 12, .monitor = 3,}, + { .partid = 13, .monitor = 0,}, + { .partid = 14, .monitor = 1,}, + { .partid = 15, .monitor = 2,}, +}; + +static DEFINE_SPINLOCK(phase_partid_lock); + +void phase_update_mpam_label(struct task_struct *tsk) +{ + int i = 0; + //unsigned long flag; + + WARN_ON_ONCE(tsk->closid); + + 
if (tsk->sched_memqos.corrupt) { + phase_release_mpam_label(tsk); + return; + } + + spin_lock(&phase_partid_lock); + if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] && + tsk->sched_memqos.mpam_profile.profile != NULL) { + tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid; + tsk->sched_memqos.mpam_profile.profile->tsk = tsk; + //tsk->sched_memqos.mpam_profile.profile->used = 1; + tsk->rmid = 0; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, tsk->closid); + //} + __mpam_sched_in_v2(tsk); + return; + } + + //is in profile queue, wait... + if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) { + spin_unlock(&phase_partid_lock); + return; + } + + for (i = 1; i < 16; i++) { + if (mpam_profile_res[i].used) { + if (static_branch_unlikely(&sched_phase_printk)) { + //if (mpam_profile_res[i].tsk) + // trace_printk("i%d want get partid, butpartid:%d get by pid:%d closid:%d\n", + //tsk->pid, i, mpam_profile_res[i].tsk->pid, mpam_profile_res[i].tsk->closid); + //else + // trace_printk("i%d want get partid, butpartid:%d get by pid:%d(NULL)\n", + //tsk->pid, i, tsk->pid); + } + + continue; + } + + tsk->sched_memqos.mpam_profile.profile = NULL; + break; + } + + if (i == 16) { + list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue); + tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0]; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d no partid found, go to list\n", tsk->pid); + //} + //wait... + return; + } + + mpam_profile_res[i].used = 1; + tsk->closid = mpam_profile_res[i].partid; + mpam_profile_res[i].tsk = tsk; + tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i]; + tsk->rmid = 0; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + //trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, i); + //} + + __mpam_sched_in_v2(tsk); +} + +static void phase_release_mpam_label_without_lock(struct task_struct *tsk) +{ + int closid; + struct memqos_wait_profile *next; + + //assert locked + + if (tsk->sched_memqos.mpam_profile.profile && + tsk->sched_memqos.mpam_profile.profile->partid) { + closid = tsk->sched_memqos.mpam_profile.profile->partid; + } else if (tsk->closid == 0) { + return; + } else { + closid = tsk->closid; + } + + tsk->closid = 0; + tsk->sched_memqos.mpam_profile.profile = NULL; + mpam_profile_res[closid].used = 0; + mpam_profile_res[closid].tsk = NULL; + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d release partid%d, list empty:%d\n", tsk->pid, closid, list_empty(&phase_mpam_waitqueue)); + //} + + next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list); + if (next) { + list_del_init(&next->wait_list); + mpam_profile_res[closid].used = 1; + next->profile = &mpam_profile_res[closid]; + } + + return; +} + +//task shutdown +void phase_destroy_waitqueue(struct task_struct *tsk) +{ + spin_lock(&phase_partid_lock); + + //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) { + list_del_init(&tsk->sched_memqos.mpam_profile.wait_list); + //} else { + phase_release_mpam_label_without_lock(tsk); + //} + spin_unlock(&phase_partid_lock); +} + +void phase_release_mpam_label(struct task_struct *tsk) +{ + spin_lock(&phase_partid_lock); + phase_release_mpam_label_without_lock(tsk); + 
spin_unlock(&phase_partid_lock); +} + +#include <asm/mpam.h> +void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr) +{ + if (pm == &mpam_profile_res[0] || pm == NULL) { + result[0] = 0; + result[1] = 0; + result[2] = 0; + result[3] = 0; + return; + } + + mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr); +} diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c new file mode 100644 index 0000000000000..7b7f37e46f76c --- /dev/null +++ b/kernel/sched/memqos/phase_perf.c @@ -0,0 +1,412 @@ +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <linux/percpu-defs.h> +#include <linux/slab.h> +#include <linux/stop_machine.h> +#include <linux/memqos.h> +#include <linux/sched.h> + +#define PHASE_FEVENT_NUM 3 + +int *phase_perf_pevents = NULL; + +static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events); + +/****************************************** + * Helpers for phase perf event + *****************************************/ +static inline struct perf_event *perf_event_of_cpu(int cpu, int index) +{ + return per_cpu(cpu_phase_perf_events, cpu)[index]; +} + +static inline struct perf_event **perf_events_of_cpu(int cpu) +{ + return per_cpu(cpu_phase_perf_events, cpu); +} + +static inline u64 perf_event_local_pmu_read(struct perf_event *event) +{ + return 0; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); + return local64_read(&event->count); +} + +/****************************************** + * Helpers for cpu counters + *****************************************/ +static inline u64 read_cpu_counter(int cpu, int index) +{ + struct perf_event *event = perf_event_of_cpu(cpu, index); + + if (!event || !event->pmu) + return 0; + + return perf_event_local_pmu_read(event); +} + +static struct perf_event_attr *alloc_attr(int event_id) +{ + struct perf_event_attr *attr; + + attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL); + if (!attr) + return ERR_PTR(-ENOMEM); + + attr->type = PERF_TYPE_RAW; + attr->config = event_id; + attr->size = sizeof(struct perf_event_attr); + attr->pinned = 1; + attr->disabled = 1; + //attr->exclude_hv; + //attr->exclude_idle; + //attr->exclude_kernel; + + return attr; +} + +static int create_cpu_counter(int cpu, int event_id, int index) +{ + struct perf_event_attr *attr = NULL; + struct perf_event **events = perf_events_of_cpu(cpu); + struct perf_event *event = NULL; + + return 0; + attr = alloc_attr(event_id); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL); + if (IS_ERR(event)) { + pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld", + cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event)); + kfree(attr); + return PTR_ERR(event); + } else { + events[index] = event; + perf_event_enable(events[index]); + if (event->hw.idx == -1) { + pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)", + cpu, attr->type, attr->config); + kfree(attr); + return -EINVAL; + } + pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d" + "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)", + event->cpu, event->hw.idx, + event->attr.type, event->attr.pinned, event->attr.exclude_hv, + event->attr.exclude_idle, event->attr.exclude_kernel, + event->attr.config, event); + } + + kfree(attr); + return 0; +} + +static int release_cpu_counter(int cpu, int event_id, int index) +{ + 
struct perf_event **events = perf_events_of_cpu(cpu); + struct perf_event *event = NULL; + + return 0; + event = events[index]; + + if (!event) + return 0; + + pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d" + "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)", + event->cpu, event->hw.idx, + event->attr.type, event->attr.pinned, event->attr.exclude_hv, + event->attr.exclude_idle, event->attr.exclude_kernel, + event->attr.config); + + perf_event_release_kernel(event); + events[index] = NULL; + + return 0; +} + +enum { + CYCLES_INDEX = 0, + INST_RETIRED_INDEX, + PHASE_EVENT_FINAL_TERMINATOR +}; + +#define CYCLES 0x0011 +#define INST_RETIRED 0x0008 + +static int pevents[PHASE_PEVENT_NUM] = { + CYCLES, + INST_RETIRED, + PHASE_EVENT_FINAL_TERMINATOR, +}; + +#define for_each_phase_pevents(index, events) \ + for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \ + events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++) + + +/****************************************** + * Helpers for phase perf + *****************************************/ +static int do_pevents(int (*fn)(int, int, int), int cpu) +{ + int index; + int err; + + for_each_phase_pevents(index, phase_perf_pevents) { + err = fn(cpu, phase_perf_pevents[index], index); + if (err) + return err; + } + + return 0; +} + +static int __phase_perf_create(void *args) +{ + int err; + int cpu = raw_smp_processor_id(); + + /* create pinned events */ + pr_info("create pinned events\n"); + err = do_pevents(create_cpu_counter, cpu); + if (err) { + pr_err("create pinned events failed\n"); + do_pevents(release_cpu_counter, cpu); + return err; + } + + pr_info("[%d] phase class event create success\n", cpu); + return 0; +} + +static int do_phase_perf_create(int *pevents, const struct cpumask *cpus) +{ + phase_perf_pevents = pevents; + return stop_machine(__phase_perf_create, NULL, cpus); +} + +static int __do_phase_perf_release(void *args) +{ + int cpu = raw_smp_processor_id(); + + /* release pinned events */ + pr_info("release pinned events\n"); + do_pevents(release_cpu_counter, cpu); + + pr_info("[%d] phase class event release success\n", cpu); + return 0; +} + +static void do_phase_perf_release(const struct cpumask *cpus) +{ + stop_machine(__do_phase_perf_release, NULL, cpus); + phase_perf_pevents = NULL; +} + +int phase_perf_create(void) +{ + return do_phase_perf_create(pevents, cpu_possible_mask); +} + +void phase_perf_release(void) +{ + do_phase_perf_release(cpu_possible_mask); +} + +DECLARE_STATIC_KEY_FALSE(sched_phase); +DECLARE_STATIC_KEY_FALSE(sched_phase_printk); + +#define PHASE_EVENT_OVERFLOW (~0ULL) + +static inline u64 phase_event_count_sub(u64 curr, u64 prev) +{ + if (curr < prev) { /* ovewrflow */ + u64 tmp = PHASE_EVENT_OVERFLOW - prev; + return curr + tmp; + } else { + return curr - prev; + } +} + +static inline void phase_calc_delta(struct task_struct *p, + struct phase_event_count *prev, + struct phase_event_count *curr, + struct phase_event_count *delta) +{ + int *pevents = phase_perf_pevents; + int index; + + for_each_phase_pevents(index, pevents) { + delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]); + } +} + +static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id) +{ + int index; + int *events = phase_perf_pevents; + + for_each_phase_pevents(index, events) { + if (event_id == events[index]) + return counter->data[index]; + } + + return 0; +} + +static int cal_ring_history_average(int *history, int nr, int s_pos, 
int c_nr) +{ + int average = 0; + int start = ((s_pos - c_nr) + nr) % nr; + + if (start < 0) + return 0; + + for (;start != s_pos;) { + if (history[start] == 0) { + c_nr--; + if (c_nr == 0) + return 0; + continue; + } + average += history[start]; + start = (start + 1) % nr; + } + + return start / c_nr; +} + +static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta) +{ + u64 ins; + u64 cycles; + //invalid zero + int ipcx10 = 0; + + ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED_INDEX); + cycles = phase_data_of_pevent(&delta->pcount, CYCLES_INDEX); + + if (cycles) + ipcx10 = (ins * 10) / cycles; + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("ins:%lld cycles:%lld\n", ins, cycles); + //} + + p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10; + p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10; + cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5); +} + +static void __phase_cal_memband_div_10(struct task_struct *p) +{ + int pos; + int result[4]; + + pos = p->sched_memqos.memband_ringpos; + + phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4); + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile); + //} + + p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos]; + p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10; + p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10; + + p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos]; + p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10; + p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10; + + p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos]; + p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10; + p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10; + + p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos]; + p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10; + p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10; + + p->sched_memqos.memband_ringpos = (pos + 1) % 10; + + cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5); +} + +static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count); +static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count); + +static void phase_perf_read_events(int cpu, u64 *pdata) +{ + int index; + + for_each_phase_pevents(index, phase_perf_pevents) { + pdata[index] = read_cpu_counter(cpu, index); + } +} + +static inline struct phase_event_count *phase_read_prev(unsigned int cpu) 
+{ + return &per_cpu(prev_phase_event_count, cpu); +} + +static inline struct phase_event_count *phase_read_curr(unsigned int cpu) +{ + struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu); + + phase_perf_read_events(cpu, curr->pcount.data); + + return curr; +} + +void phase_account_task(struct task_struct *p, int cpu) +{ + struct phase_event_count delta; + struct phase_event_count *prev, *curr; + + if (!static_branch_likely(&sched_phase)) + return; + + //if (!sched_core_enabled(cpu_rq(cpu))) + // return; + + /* update phase_event_count */ + prev = phase_read_prev(cpu); + curr = phase_read_curr(cpu); + phase_calc_delta(p, prev, curr, &delta); + *prev = *curr; + + /* calculate phase */ + __phase_cal_ipcx10(p, &delta); + __phase_cal_memband_div_10(p); + p->sched_memqos.sample_times++; + if ((p->sched_memqos.sample_times % 3) == 0) + p->sched_memqos.account_ready = 1; +} + + +void phase_trace_printk(struct task_struct *p) +{ + if (!static_branch_unlikely(&sched_phase_printk)) + return; + + trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n", + p->comm, p->pid, p->sched_memqos.ipc_ringpos,\ + p->sched_memqos.ipcx10, \ + p->sched_memqos.memband_ringpos,\ + p->sched_memqos.memband_div_10, \ + p->sched_memqos.numa_score[0], \ + p->sched_memqos.numa_score[1], \ + p->sched_memqos.numa_score[2], \ + p->sched_memqos.numa_score[3], \ + p->sched_memqos.turbo, \ + p->sched_memqos.preferred_nid, \ + p->sched_memqos.class_id, \ + p->closid); +} diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c new file mode 100644 index 0000000000000..b80bb6b9ae0a3 --- /dev/null +++ b/kernel/sched/memqos/phase_sim_knn.c @@ -0,0 +1,92 @@ +#include <linux/types.h> + +#define DATA_ROW 20 +void QuickSort(u64 arr[DATA_ROW][2], int L, int R) { + int i = L; + int j = R; + int kk = (L + R) / 2; + u64 pivot = arr[kk][0]; + + while (i <= j) { + while (pivot > arr[i][0]) { + i++; + } + while (pivot < arr[j][0]) { + j--; + } + if (i <= j) { + u64 temp = arr[i][0]; + + arr[i][0] = arr[j][0]; + arr[j][0] = temp; + i++; j--; + } + } + if (L < j) { + QuickSort(arr, L, j); + } + if (i < R) { + QuickSort(arr, i, R); + } +} + +u64 euclidean_distance(u64 *row1, u64 *row2, int col) { + u64 distance = 0; + int i; + + for (i = 0; i < col - 1; i++) { + distance += ((row1[i] - row2[i]) * (row1[i] - row2[i])); + } + return distance; +} + +#define num_neighbors 6 +#define MAX_TAG 8 + +int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) { + int i; + u64 neighbors[MAX_TAG] = {0}; + int max_tag = 0; + u64 distances[DATA_ROW][2]; + + for (i = 0; i < train_row; i++) { + distances[i][0] = euclidean_distance(train_data[i], test_row, col); + distances[i][1] = train_data[i][col - 1]; + } + QuickSort(distances, 0, train_row - 1); + for (i = 0; i < num_neighbors; i++) { + neighbors[distances[i][1]]++; + if (neighbors[distances[i][1]] > neighbors[max_tag]) + max_tag = distances[i][1]; + } + return max_tag; +} + +static u64 train_data[DATA_ROW][3] = { + {0, 1, 0}, + {0, 9, 0}, + {0, 20, 1}, + {0, 30, 1}, + {0, 40, 2}, + {0, 50, 3}, + {0, 60, 3}, + {0, 70, 3}, + {0, 80, 4}, + {0, 90, 4}, + {0, 100, 4}, + {0, 110, 5}, + {0, 120, 5}, + {0, 130, 6}, + {0, 140, 6}, + {0, 150, 7}, +}; + +int knn_get_tag(int ipcx10, int memband_div_10) +{ + u64 test_data[2]; + + test_data[0] = ipcx10; + test_data[1] = memband_div_10; + + return 
get_neighbors_tag(train_data, DATA_ROW, 3, test_data); +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 685f9881b8e23..0d2764c4449ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "phase", + .mode = 0555, + .child = phase_table, + }, +#endif #endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cac46d56f387..d748c291e7047 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - int preferred_nid; + int preferred_nid = -1; nodemask_t *nmask;
+ if (current->sched_memqos.preferred_nid) + preferred_nid = current->sched_memqos.preferred_nid - 1; + pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) { unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); @@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, }
nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); + if (preferred_nid == -1) + preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol);
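A note on the mempolicy hook above: sched_memqos.preferred_nid stores the advised node with a +1 offset so that 0 can mean "no advice"; a sketch of the decode step (the helper is illustrative, not part of the patch):

#include <linux/numa.h>

/* Illustrative only: decode the +1-offset encoding used by
 * sched_memqos.preferred_nid (0 == no advice, n + 1 == node n),
 * which is what alloc_pages_vma() open-codes above. */
static inline int memqos_advised_nid(const struct task_struct *p)
{
	return p->sched_memqos.preferred_nid ?
	       p->sched_memqos.preferred_nid - 1 : NUMA_NO_NODE;
}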