Add a debug memband (memory bandwidth) interface to dynamic affinity. This
is useful for threads that are sensitive to memory bandwidth.
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
v2: Fix failure to update a thread's mpamid.
---
arch/arm64/include/asm/mpam.h | 2 +
arch/arm64/include/asm/mpam_sched.h | 2 +
arch/arm64/kernel/mpam/mpam_device.c | 58 ++-
arch/arm64/kernel/mpam/mpam_resctrl.c | 37 ++
arch/arm64/kernel/process.c | 2 +-
include/linux/memqos.h | 142 +++++++
include/linux/sched.h | 15 +-
include/linux/sysctl.h | 2 +
kernel/cgroup/cpuset.c | 1 +
kernel/exit.c | 3 +
kernel/fork.c | 4 +
kernel/sched/Makefile | 1 +
kernel/sched/core.c | 52 ++-
kernel/sched/fair.c | 14 +-
kernel/sched/memqos/Makefile | 6 +
kernel/sched/memqos/memqos.c | 297 +++++++++++++++
kernel/sched/memqos/phase_feature_sysctl.c | 183 +++++++++
kernel/sched/memqos/phase_memband.c | 179 +++++++++
kernel/sched/memqos/phase_perf.c | 412 +++++++++++++++++++++
kernel/sched/memqos/phase_sim_knn.c | 92 +++++
kernel/sysctl.c | 7 +
mm/mempolicy.c | 10 +-
22 files changed, 1500 insertions(+), 21 deletions(-)
create mode 100644 include/linux/memqos.h
create mode 100644 kernel/sched/memqos/Makefile
create mode 100644 kernel/sched/memqos/memqos.c
create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
create mode 100644 kernel/sched/memqos/phase_memband.c
create mode 100644 kernel/sched/memqos/phase_perf.c
create mode 100644 kernel/sched/memqos/phase_sim_knn.c
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h
index 6338eab817e75..269a91d8ca907 100644
--- a/arch/arm64/include/asm/mpam.h
+++ b/arch/arm64/include/asm/mpam.h
@@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM
extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg);
+
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr);
#endif
#endif /* _ASM_ARM64_MPAM_H */
diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h
index 08ed349b6efa1..32d08cf654b31 100644
--- a/arch/arm64/include/asm/mpam_sched.h
+++ b/arch/arm64/include/asm/mpam_sched.h
@@ -40,6 +40,8 @@ static inline void mpam_sched_in(void)
__mpam_sched_in();
}
+void __mpam_sched_in_v2(struct task_struct *tsk);
+
#else
static inline void mpam_sched_in(void) {}
diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c
index 6455c69f132fd..48de3982a0b9a 100644
--- a/arch/arm64/kernel/mpam/mpam_device.c
+++ b/arch/arm64/kernel/mpam/mpam_device.c
@@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void)
static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg);
}
@@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg)
static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val)
{
WARN_ON_ONCE(reg > SZ_MPAM_DEVICE);
- assert_spin_locked(&dev->lock);
+ //assert_spin_locked(&dev->lock);
/*
* If we touch a device that isn't accessible from this CPU we may get
* an external-abort. If we're lucky, we corrupt another mpam:component.
*/
- WARN_ON_ONCE(preemptible());
- WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
+ //WARN_ON_ONCE(preemptible());
+ //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg);
}
@@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
{
u16 mon;
u32 clt, flt, cur_clt, cur_flt;
+ u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev,
wmb();
}
- return mpam_read_reg(dev, MSMON_MBWU);
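+ /* average several back-to-back reads to smooth the reported bandwidth */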
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ total += mpam_read_reg(dev, MSMON_MBWU);
+ return total / 5;
}
static int mpam_device_frob_mon(struct mpam_device *dev,
@@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx)
cpumask_set_cpu(smp_processor_id(), &ctx->updated_on);
}
+static DEFINE_SPINLOCK(mpam_tmp_lock);
+
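+/*
+ * Debug helper: walk the MPAM memory-class components and read the MBWU
+ * (memory bandwidth usage) monitor for the given partid/pmg/monitor,
+ * storing one bandwidth count per component into result[0..nr-1].
+ */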
+void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr)
+{
+ struct mpam_class *class;
+ struct mpam_component *comp;
+ struct mpam_device *dev;
+ struct sync_args args;
+ int i = 0;
+
+ args.pmg = pmg;
+ args.mon = monitor;
+ args.closid.reqpartid = partid;
+ args.match_pmg = 1;
+
+ spin_lock(&mpam_tmp_lock);
+ list_for_each_entry(class, &mpam_classes, classes_list) {
+ if (class->type != MPAM_CLASS_MEMORY)
+ continue;
+
+ list_for_each_entry(comp, &class->components, class_list) {
+ if (i >= nr) {
+ pr_err_once("error, i > result nr");
+ break;
+ }
+ result[i] = 0;
+ list_for_each_entry(dev, &comp->devices, comp_list) {
+ result[i] += mpam_device_read_mbwu_mon(dev, &args);
+ }
+ i++;
+ }
+ break;
+ }
+ spin_unlock(&mpam_tmp_lock);
+
+}
+EXPORT_SYMBOL(mpam_component_config_mbwu_mon);
+
/**
* in some cases/platforms the MSC register access is only possible with
* the associated CPUs. And need to check if those CPUS are online before
diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c
index 60d3d8706a38b..26258f7508ac4 100644
--- a/arch/arm64/kernel/mpam/mpam_resctrl.c
+++ b/arch/arm64/kernel/mpam/mpam_resctrl.c
@@ -2226,6 +2226,43 @@ int mpam_resctrl_init(void)
return resctrl_group_init();
}
+
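+/*
+ * Variant of __mpam_sched_in() used by the memqos phase code: write the
+ * task's closid (partid) and rmid (pmg) into MPAM0_EL1/MPAM1_EL1 when they
+ * differ from what is currently programmed on this CPU.
+ */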
+void __mpam_sched_in_v2(struct task_struct *tsk)
+{
+ struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
+ u64 rmid = state->default_rmid;
+ u64 closid = state->default_closid;
+
+ /*
+ * If this task has a closid/rmid assigned, use it.
+ * Else use the closid/rmid assigned to this cpu.
+ */
+ if (tsk->closid)
+ closid = tsk->closid;
+
+ if (tsk->rmid)
+ rmid = tsk->rmid;
+
+ if (closid != state->cur_closid || rmid != state->cur_rmid) {
+ u64 reg;
+
+ /* set in EL0 */
+ reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+ reg = PARTID_SET(reg, closid);
+ reg = PMG_SET(reg, rmid);
+ mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1");
+
+ /* set in EL1 */
+ reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+ reg = PARTID_SET(reg, closid);
+ reg = PMG_SET(reg, rmid);
+ mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1");
+
+ state->cur_rmid = rmid;
+ state->cur_closid = closid;
+ }
+}
+
/*
* __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
*
diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index e5be78915632c..7896bb74ecc49 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -531,7 +531,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
/* the actual thread switch */
last = cpu_switch_to(prev, next);
- mpam_sched_in();
+ //mpam_sched_in();
return last;
}
diff --git a/include/linux/memqos.h b/include/linux/memqos.h
new file mode 100644
index 0000000000000..814e9935590d3
--- /dev/null
+++ b/include/linux/memqos.h
@@ -0,0 +1,142 @@
+#ifndef _MEMQOS_H
+#define _MEMQOS_H
+
+#include <linux/vmstat.h>
+#include <linux/rbtree.h>
+//#include <linux/sched.h>
+
+struct task_struct;
+
+struct memqos_domain {
+ int dom_id;
+ int total_memband_div_10;
+ int total_out_memband_div_10;
+
+ //ring of the last 10 samples (one slot per timer tick)
+ int memband_ringpos;
+ int memband_div_10_history[4][10];
+};
+
+struct memqos_mpam_profile {
+ int partid;
+ int pmg;
+ int monitor;
+
+ struct task_struct *tsk;
+ int used;
+};
+
+struct memqos_wait_profile {
+ struct memqos_mpam_profile *profile;
+ struct list_head wait_list;
+};
+
+struct memqos_class {
+ struct list_head turbo_list;
+ struct list_head tasks_list;
+};
+
+#include <linux/topology.h>
+//embedded in task_struct
+
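+/*
+ * Per-task memqos state embedded in task_struct: ring buffers of the last
+ * ten IPC*10 and per-node memband/10 samples, the derived per-node NUMA
+ * scores, the KNN class id, and the MPAM partid/monitor slot this task
+ * currently holds (or is waiting for).
+ */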
+struct task_memqos {
+ int ipc_ringpos;
+ int ipcx10;
+ int ipcx10_total[4];
+ int ipcx10_history[10];
+
+ int memband_div_10;
+ int memband_ringpos;
+ int memband_div_10_total[4];
+ int memband_div_10_history[4][10];
+
+ u32 sample_times;
+ int account_ready;
+ int numa_score[4];
+ int turbo;
+
+ struct memqos_wait_profile mpam_profile;
+
+ struct list_head turbo_list;
+ struct list_head task_list;
+
+ struct cpumask *advise_mem_node_mask;
+ int preferred_nid;
+
+ int class_id;
+
+ int corrupt;
+};
+
+#define PHASE_PEVENT_NUM 10
+
+struct phase_event_pcount {
+ u64 data[PHASE_PEVENT_NUM];
+};
+
+struct phase_event_count {
+ struct phase_event_pcount pcount;
+};
+
+void phase_update_mpam_label(struct task_struct *tsk);
+
+void phase_release_mpam_label(struct task_struct *tsk);
+
+static inline void memqos_update_mpam_label(struct task_struct *tsk)
+{
+ phase_update_mpam_label(tsk);
+}
+
+static inline void memqos_release_mpam_label(struct task_struct *tsk)
+{
+ phase_release_mpam_label(tsk);
+}
+
+void phase_destroy_waitqueue(struct task_struct *tsk);
+
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr);
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+int phase_perf_create(void);
+
+void phase_perf_release(void);
+
+void memqos_account_task(struct task_struct *p, int cpu);
+
+void memqos_drop_class(struct task_struct *p);
+
+void phase_account_task(struct task_struct *p, int cpu);
+
+static inline void memqos_task_collect_data(struct task_struct *p, int cpu)
+{
+ phase_account_task(p, cpu);
+}
+
+static inline void memqos_task_account(struct task_struct *p, int cpu)
+{
+ memqos_account_task(p, cpu);
+}
+
+static inline void memqos_task_exit(struct task_struct *p)
+{
+
+ memqos_drop_class(p);
+ phase_destroy_waitqueue(p);
+}
+
+void memqos_select_nicest_cpus(struct task_struct *p);
+
+void memqos_exclude_low_level_task_single(struct task_struct *p);
+
+int knn_get_tag(int ipcx10, int memband_div_10);
+
+void memqos_init_class(struct task_struct *p);
+
+void phase_trace_printk(struct task_struct *p);
+static inline void memqos_trace_printk(struct task_struct *p)
+{
+ phase_trace_printk(p);
+}
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 928186f161000..c5b74cd0c5830 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -29,6 +29,7 @@
#include <linux/task_io_accounting.h>
#include <linux/rseq.h>
#include <linux/thread_bits.h>
+#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */
struct audit_context;
@@ -1268,7 +1269,7 @@ struct task_struct {
#if !defined(__GENKSYMS__)
#if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY)
cpumask_t *prefer_cpus;
- const cpumask_t *select_cpus;
+ cpumask_t *select_cpus;
#else
KABI_RESERVE(6)
KABI_RESERVE(7)
@@ -1279,6 +1280,10 @@ struct task_struct {
#endif
KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ struct task_memqos sched_memqos;
+#endif
+
/* CPU-specific state of this task: */
struct thread_struct thread;
@@ -1998,6 +2003,14 @@ int set_prefer_cpus_ptr(struct task_struct *p,
const struct cpumask *new_mask);
int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig);
void sched_prefer_cpus_free(struct task_struct *p);
+static inline bool prefer_cpus_valid(struct task_struct *p)
+{
+ return p->prefer_cpus &&
+ !cpumask_empty(p->prefer_cpus) &&
+ !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
+ cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
+}
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu);
#endif
#endif
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index b769ecfcc3bd4..73bce39107cb3 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[];
+
int sysctl_max_threads(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 55bfbc4cdb16c..d94a9065a5605 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -106,6 +106,7 @@ struct cpuset {
nodemask_t mems_allowed;
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
cpumask_var_t prefer_cpus;
+ int mem_turbo;
#endif
/* effective CPUs and Memory Nodes allow to tasks */
diff --git a/kernel/exit.c b/kernel/exit.c
index 2a32d32bdc03d..b731c19618176 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -699,6 +699,8 @@ static void check_stack_usage(void)
static inline void check_stack_usage(void) {}
#endif
+#include <linux/memqos.h>
+
void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
@@ -806,6 +808,7 @@ void __noreturn do_exit(long code)
* because of cgroup mode, must be called before cgroup_exit()
*/
perf_event_exit_task(tsk);
+ memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk);
cgroup_exit(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index b5453a26655e2..0a762b92dc814 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk)
*stackend = STACK_END_MAGIC; /* for overflow detection */
}
+
+#include <linux/memqos.h>
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
@@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk);
+
#ifdef CONFIG_FAULT_INJECTION
tsk->fail_nth = 0;
#endif
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c383..471380d6686e3 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 970616070da86..15c7e1e3408cb 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
calculate_sigpending();
}
+#include <linux/memqos.h>
+
/*
* context_switch - switch to the new MM and the new thread's register state.
*/
@@ -2794,6 +2796,8 @@ static __always_inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next, struct rq_flags *rf)
{
+ struct rq *ret;
+
prepare_task_switch(rq, prev, next);
/*
@@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ //account and release
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+
+ //label new task's mpamid
+ if (prefer_cpus_valid(next))
+ memqos_update_mpam_label(next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
switch_to(prev, next, prev);
barrier();
- return finish_task_switch(prev);
+ ret = finish_task_switch(prev);
+
+ return ret;
}
/*
@@ -3051,6 +3069,20 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu)
+{
+ int cpu;
+ struct task_struct *curr;
+ struct rq *rq_curr;
+
+ for (cpu = start_cpu; cpu <= end_cpu; cpu++) {
+ rq_curr = cpu_rq(cpu);
+ curr = rq_curr->curr;
+ if (curr && prefer_cpus_valid(curr))
+ memqos_task_collect_data(curr, cpu);
+ }
+}
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3058,8 +3090,12 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void scheduler_tick(void)
{
int cpu = smp_processor_id();
+ //memqos collects the next cpu's memband and perf
+ //int cpu_memqos = (cpu + 1) % nr_cpu_ids;
struct rq *rq = cpu_rq(cpu);
+ //struct rq *rq_next = cpu_rq(cpu_memqos);
struct task_struct *curr = rq->curr;
+ //struct task_struct *curr_memqos = rq_next->curr;
struct rq_flags rf;
sched_clock_tick();
@@ -3075,6 +3111,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor tasks that have dynamic affinity enabled
+ //if (curr_memqos && prefer_cpus_valid(curr_memqos))
+ // memqos_task_collect_data(curr_memqos, cpu_memqos);
+
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
@@ -3524,6 +3564,16 @@ static void __sched notrace __schedule(bool preempt)
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
} else {
+ memqos_task_account(prev, smp_processor_id());
+
+ if (prefer_cpus_valid(prev))
+ memqos_trace_printk(prev);
+
+ memqos_release_mpam_label(prev);
+ //relabel this task's mpamid
+ if (prefer_cpus_valid(prev))
+ memqos_update_mpam_label(prev);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
rq_unlock_irq(rq, &rf);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index af55a26d11fcb..12e9675495d2c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+#include <linux/memqos.h>
/*
* Low utilization threshold for CPU
*
@@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu)
return cputime->vutil;
}
-static inline bool prefer_cpus_valid(struct task_struct *p)
-{
- return p->prefer_cpus &&
- !cpumask_empty(p->prefer_cpus) &&
- !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) &&
- cpumask_subset(p->prefer_cpus, &p->cpus_allowed);
-}
-
/*
* set_task_select_cpus: select the cpu range for task
* @p: the task whose available cpu range will to set
@@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu,
if (util_avg_sum < sysctl_sched_util_low_pct *
cpumask_weight(p->prefer_cpus)) {
p->select_cpus = p->prefer_cpus;
+ memqos_select_nicest_cpus(p);
if (sd_flag & SD_BALANCE_WAKE)
schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus);
+ } else {
+ //select turbo task
+ //select low class task
+ memqos_exclude_low_level_task_single(p);
}
}
#endif
diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile
new file mode 100644
index 0000000000000..ed8f42649a8a7
--- /dev/null
+++ b/kernel/sched/memqos/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
+obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o
diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c
new file mode 100644
index 0000000000000..29fc6af1f02c1
--- /dev/null
+++ b/kernel/sched/memqos/memqos.c
@@ -0,0 +1,297 @@
+#include <linux/memqos.h>
+#include <linux/cpumask.h>
+#include <linux/sched.h>
+
+static void memqos_set_task_classid(struct task_struct *p)
+{
+ int class_id;
+ int memband_div_10 = p->sched_memqos.memband_div_10;
+ int ipcx10 = p->sched_memqos.ipcx10;
+
+ class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10);
+ p->sched_memqos.class_id = class_id;
+}
+
+//static memqos_domain mq_domains[] = {
+// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,},
+// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,},
+//};
+
+static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]);
+//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock);
+static DEFINE_SPINLOCK(memqos_class_lock);
+
+static int memqos_class_online(unsigned int cpu)
+{
+ int class_id = 0;
+ struct memqos_class *class;
+
+ for (class_id = 0; class_id < 8; class_id++) {
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+ INIT_LIST_HEAD(&class->tasks_list);
+ INIT_LIST_HEAD(&class->turbo_list);
+ }
+ return 0;
+}
+
+static int memqos_class_offline(unsigned int cpu)
+{
+ return 0;
+}
+
+#include <linux/cpu.h>
+#include <linux/cacheinfo.h>
+
+static void memqos_init(void)
+{
+ int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN,
+ "memqos:online", memqos_class_online,
+ memqos_class_offline);
+ if (cpuhp_state <= 0) {
+ pr_err("Failed to register 'dyn' cpuhp callbacks");
+ return;
+ }
+}
+late_initcall(memqos_init);
+
+static void memqos_insert_to_class(struct task_struct *p, int cpu)
+{
+ unsigned long flag;
+ int class_id = p->sched_memqos.class_id;
+ struct memqos_class *class;
+ struct task_memqos *memqos;
+
+ if (class_id >= 8)
+ return;
+
+ memqos = &p->sched_memqos;
+
+ class = &per_cpu(memqos_classes, cpu)[class_id];
+
+ spin_lock_irqsave(&memqos_class_lock, flag);
+ if (p->sched_memqos.corrupt) {
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+ return;
+ }
+
+ list_move_tail(&p->sched_memqos.task_list, &class->tasks_list);
+ if (memqos->turbo)
+ list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list);
+ spin_unlock_irqrestore(&memqos_class_lock, flag);
+}
+
+static void memqos_drop_class_without_lock(struct task_struct *p)
+{
+ list_del_init(&p->sched_memqos.task_list);
+ list_del_init(&p->sched_memqos.turbo_list);
+}
+
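+/*
+ * Score each NUMA node as ((that node's bandwidth - the sum of the other
+ * nodes') * 10 / that node's bandwidth); the first node with a positive
+ * score becomes the task's "turbo" node, stored as node id + 1 (0 = none).
+ */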
+static void memqos_score(struct task_struct *p)
+{
+ int total_n1 = p->sched_memqos.memband_div_10_total[0];
+ int total_n2 = p->sched_memqos.memband_div_10_total[1];
+ int total_n3 = p->sched_memqos.memband_div_10_total[2];
+ int total_n4 = p->sched_memqos.memband_div_10_total[3];
+
+ p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1;
+ p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2;
+ p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3;
+ p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4;
+
+ //over x% percent
+ if (p->sched_memqos.numa_score[0] > 0)
+ p->sched_memqos.turbo = 1;
+ else if (p->sched_memqos.numa_score[1] > 0)
+ p->sched_memqos.turbo = 2;
+ else if (p->sched_memqos.numa_score[2] > 0)
+ p->sched_memqos.turbo = 3;
+ else if (p->sched_memqos.numa_score[3] > 0)
+ p->sched_memqos.turbo = 4;
+ else
+ p->sched_memqos.turbo = 0;
+}
+
+void memqos_account_task(struct task_struct *p, int cpu)
+{
+ if (!p->sched_memqos.account_ready ||
+ p->sched_memqos.corrupt)
+ return;
+ memqos_set_task_classid(p);
+ memqos_insert_to_class(p, cpu);
+ memqos_score(p);
+ p->sched_memqos.account_ready = 0;
+}
+
+void memqos_init_class(struct task_struct *p)
+{
+ memset(&p->sched_memqos, 0, sizeof(struct task_memqos));
+ spin_lock(&memqos_class_lock);
+ INIT_LIST_HEAD(&p->sched_memqos.task_list);
+ INIT_LIST_HEAD(&p->sched_memqos.turbo_list);
+ INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list);
+ spin_unlock(&memqos_class_lock);
+
+ p->closid = 0;
+ p->rmid = 0;
+}
+
+//destroy ?
+void memqos_drop_class(struct task_struct *p)
+{
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ p->sched_memqos.corrupt = 1;
+ spin_unlock(&memqos_class_lock);
+}
+
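+/*
+ * Restrict p->select_cpus to the best-scoring NUMA node that intersects it;
+ * for a turbo task, prefer the turbo node recorded in memqos->turbo.
+ */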
+void memqos_select_nicest_cpus(struct task_struct *p)
+{
+ int i = 0;
+ int max_score = -10000;
+ int select_node = 0;
+ struct task_memqos *memqos = &p->sched_memqos;
+
+ if (!memqos->turbo) {
+ for (i = 0; i < 4; i++) {
+ if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus))
+ continue;
+
+ if (memqos->numa_score[i] > max_score) {
+ select_node = i;
+ max_score = memqos->numa_score[i];
+ }
+ }
+
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ }
+
+ select_node = memqos->turbo - 1;
+ if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) {
+ cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node));
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+
+ //if turbo another cpus, wait...
+ return;
+}
+
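+/*
+ * Called when the preferred CPUs look saturated: first try to move this task
+ * to its turbo node outside prefer_cpus; otherwise scan the per-CPU class
+ * lists for a turbo task or a lower-class task and push that one off the
+ * preferred CPUs instead.
+ */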
+void memqos_exclude_low_level_task_single(struct task_struct *p)
+{
+ int i, j, cpu;
+ int find = 0;
+ int select_node = 0;
+ const struct cpumask *cpumask;
+ struct cpumask cpumask_med;
+ struct memqos_class *class;
+ struct task_memqos *memqos = &p->sched_memqos;
+ struct task_struct *tsk = NULL;
+ int max_score = -100000;
+
+ if (memqos->turbo) {
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus) &&
+ (cpumask_intersects(&p->cpus_allowed, cpumask))) {
+ cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask);
+ //go out!
+ spin_lock(&memqos_class_lock);
+ memqos_drop_class_without_lock(p);
+ spin_unlock(&memqos_class_lock);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ return;
+ } else if (cpumask_intersects(p->prefer_cpus, cpumask)) {
+ cpumask_and(p->select_cpus, p->prefer_cpus, cpumask);
+ //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ p->sched_memqos.preferred_nid = memqos->turbo;
+ }
+ }
+
+ //select turbo one
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ for (i = 7; i >= 0; i--) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->turbo_list, turbo_list) {
+ if (!memqos->turbo)
+ continue;
+ select_node = memqos->turbo - 1;
+ cpumask = cpumask_of_node(select_node);
+ if (!cpumask_intersects(cpumask, p->prefer_cpus)) {
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ if (!cpumask_intersects(cpumask, &tsk->cpus_allowed))
+ continue;
+ cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ find = 1;
+ break;
+ }
+ }
+ if (find) {
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ find = 0;
+
+ //if not, select lower class's tsk
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
+ if (!cpumask_test_cpu(cpu, p->prefer_cpus))
+ continue;
+
+ spin_lock(&memqos_class_lock);
+ //only find below class tsk
+ for (i = 0; i < memqos->class_id; i++) {
+ class = &per_cpu(memqos_classes, cpu)[i];
+ list_for_each_entry(memqos, &class->tasks_list, task_list) {
+ if (memqos->turbo)
+ continue;
+
+ tsk = container_of(memqos, struct task_struct, sched_memqos);
+ for (j = 0; j < 4; j++) {
+ if (!cpumask_intersects(cpumask_of_node(j), &tsk->cpus_allowed))
+ continue;
+ if (memqos->numa_score[j] > max_score) {
+ select_node = j;
+ max_score = memqos->numa_score[j];
+ }
+ find = 1;
+ }
+ if (!find)
+ continue;
+
+ cpumask_and(&cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed);
+ cpumask_andnot(&cpumask_med, &cpumask_med, p->prefer_cpus);
+ if (cpumask_empty(&cpumask_med))
+ continue;
+ cpumask_copy(tsk->select_cpus, &cpumask_med);
+ //mem prefered
+ //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node);
+ tsk->sched_memqos.preferred_nid = memqos->turbo;
+ memqos_drop_class_without_lock(tsk);
+ spin_unlock(&memqos_class_lock);
+ return;
+ }
+ }
+ spin_unlock(&memqos_class_lock);
+ }
+
+ //do not care, this task may out
+ return;
+}
+
diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c
new file mode 100644
index 0000000000000..9106a90868a3d
--- /dev/null
+++ b/kernel/sched/memqos/phase_feature_sysctl.c
@@ -0,0 +1,183 @@
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/capability.h>
+#include <linux/cpumask.h>
+#include <linux/topology.h>
+#include <linux/sched/task.h>
+
+#include <linux/memqos.h>
+
+#ifdef CONFIG_PROC_SYSCTL
+
+//setup timer for counting
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <asm/ioctl.h>
+
+//at least 2 cpu
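+//Two periodic 1ms hrtimers sample the running task on every CPU: the timer
+//armed on CPU 0 walks the upper half of the CPU ids and the timer armed on
+//CPU nr_cpu_ids/2 walks the lower half.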
+static enum hrtimer_restart timer_fn_twin_a(struct hrtimer *timer_data)
+{
+ sched_memqos_task_collect_data_range(0, nr_cpu_ids / 2 - 1);
+ hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC);
+ return HRTIMER_RESTART;
+}
+
+static enum hrtimer_restart timer_fn_twin_b(struct hrtimer *timer_data)
+{
+ sched_memqos_task_collect_data_range(nr_cpu_ids / 2, nr_cpu_ids - 1);
+ hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC);
+ return HRTIMER_RESTART;
+}
+
+static struct hrtimer timer_twin_a;
+static struct hrtimer timer_twin_b;
+
+static void memqos_timer_init_func_a(void *info) {
+ hrtimer_init(&timer_twin_a, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ timer_twin_a.function = timer_fn_twin_a;
+ hrtimer_start(&timer_twin_a, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS);
+}
+
+static void memqos_timer_init_func_b(void *info) {
+ hrtimer_init(&timer_twin_b, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ timer_twin_b.function = timer_fn_twin_b;
+ hrtimer_start(&timer_twin_b, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS);
+}
+
+static void memqos_timer_init_a(void)
+{
+ smp_call_function_single(0, memqos_timer_init_func_b, NULL, 0);
+}
+
+static void memqos_timer_init_b(void)
+{
+ smp_call_function_single(nr_cpu_ids / 2, memqos_timer_init_func_a, NULL, 0);
+}
+
+static void memqos_timer_twin_init(void) {
+ memqos_timer_init_a();
+ memqos_timer_init_b();
+}
+
+static void memqos_timer_twin_exit(void) {
+ hrtimer_cancel(&timer_twin_a);
+ hrtimer_cancel(&timer_twin_b);
+}
+
+DEFINE_STATIC_KEY_FALSE(sched_phase);
+DEFINE_STATIC_KEY_FALSE(sched_phase_printk);
+
+static int set_phase_state(bool enabled)
+{
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (enabled == state) {
+ pr_warn("phase has already %s\n", state ? "enabled" : "disabled");
+ return 0;
+ }
+
+ if (enabled) {
+ err = phase_perf_create();
+ if (err) {
+ pr_err("phase enable failed\n");
+ return err;
+ }
+ static_branch_enable(&sched_phase);
+ pr_info("phase enabled\n");
+ memqos_timer_twin_init();
+ } else {
+ static_branch_disable(&sched_phase);
+ phase_perf_release();
+ pr_info("phase disabled\n");
+ memqos_timer_twin_exit();
+ }
+
+ return 0;
+}
+
+/*
+ * the other procfs files of phase cannot be modified if sched_phase is already enabled
+ */
+static int phase_proc_state(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state(state);
+
+ return err;
+}
+
+static int set_phase_state_printk(bool enabled)
+{
+ if (enabled) {
+ static_branch_enable(&sched_phase_printk);
+ } else {
+ static_branch_disable(&sched_phase_printk);
+ }
+
+ return 0;
+}
+
+/*
+ * sysctl handler for the phase trace_enabled knob (sched_phase_printk)
+ */
+static int phase_proc_state_printk(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int err;
+ int state = static_branch_likely(&sched_phase_printk);
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ t = *table;
+ t.data = &state;
+ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+ if (write)
+ err = set_phase_state_printk(state);
+
+ return err;
+}
+
+
+static int __maybe_unused zero;
+static int __maybe_unused one = 1;
+
+struct ctl_table phase_table[] = {
+ {
+ .procname = "enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
+ .procname = "trace_enabled",
+ .data = NULL,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = phase_proc_state_printk,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ { }
+};
+#endif /* CONFIG_PROC_SYSCTL */
diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c
new file mode 100644
index 0000000000000..df8b2811f6ab7
--- /dev/null
+++ b/kernel/sched/memqos/phase_memband.c
@@ -0,0 +1,179 @@
+#include <linux/types.h>
+#include <linux/cpu.h>
+#include <linux/memqos.h>
+
+#include <asm/cpu.h>
+#include <asm/cputype.h>
+#include <asm/cpufeature.h>
+#include <asm/mpam_sched.h>
+
+static const int nr_partid = 15;
+static const int nr_monitor = 4;
+
+static LIST_HEAD(phase_mpam_waitqueue);
+
+//mpam_profile_res[0] not used
+struct memqos_mpam_profile mpam_profile_res[16] = {
+ { .partid = 0, .monitor = 0, .used = 1},
+ { .partid = 1, .monitor = 0,},
+ { .partid = 2, .monitor = 1,},
+ { .partid = 3, .monitor = 2,},
+ { .partid = 4, .monitor = 3,},
+ { .partid = 5, .monitor = 0,},
+ { .partid = 6, .monitor = 1,},
+ { .partid = 7, .monitor = 2,},
+ { .partid = 8, .monitor = 3,},
+ { .partid = 9, .monitor = 0,},
+ { .partid = 10, .monitor = 1,},
+ { .partid = 11, .monitor = 2,},
+ { .partid = 12, .monitor = 3,},
+ { .partid = 13, .monitor = 0,},
+ { .partid = 14, .monitor = 1,},
+ { .partid = 15, .monitor = 2,},
+};
+
+static DEFINE_SPINLOCK(phase_partid_lock);
+
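+/*
+ * Attach a free partid/monitor slot from mpam_profile_res[] to the task and
+ * program it via __mpam_sched_in_v2(). Slot 0 is reserved as a "waiting"
+ * marker: tasks that find no free slot are queued on phase_mpam_waitqueue
+ * and pick up a slot when another task releases one.
+ */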
+void phase_update_mpam_label(struct task_struct *tsk)
+{
+ int i = 0;
+ //unsigned long flag;
+
+ WARN_ON_ONCE(tsk->closid);
+
+ if (tsk->sched_memqos.corrupt) {
+ phase_release_mpam_label(tsk);
+ return;
+ }
+
+ spin_lock(&phase_partid_lock);
+ if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] &&
+ tsk->sched_memqos.mpam_profile.profile != NULL) {
+ tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ tsk->sched_memqos.mpam_profile.profile->tsk = tsk;
+ //tsk->sched_memqos.mpam_profile.profile->used = 1;
+ tsk->rmid = 0;
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, tsk->closid);
+ //}
+ __mpam_sched_in_v2(tsk);
+ return;
+ }
+
+ //is in profile queue, wait...
+ if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ spin_unlock(&phase_partid_lock);
+ return;
+ }
+
+ for (i = 1; i < 16; i++) {
+ if (mpam_profile_res[i].used) {
+ if (static_branch_unlikely(&sched_phase_printk)) {
+ //if (mpam_profile_res[i].tsk)
+ // trace_printk("i%d want get partid, butpartid:%d get by pid:%d closid:%d\n",
+ //tsk->pid, i, mpam_profile_res[i].tsk->pid, mpam_profile_res[i].tsk->closid);
+ //else
+ // trace_printk("i%d want get partid, butpartid:%d get by pid:%d(NULL)\n",
+ //tsk->pid, i, tsk->pid);
+ }
+
+ continue;
+ }
+
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ break;
+ }
+
+ if (i == 16) {
+ list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue);
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0];
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d no partid found, go to list\n", tsk->pid);
+ //}
+ //wait...
+ return;
+ }
+
+ mpam_profile_res[i].used = 1;
+ tsk->closid = mpam_profile_res[i].partid;
+ mpam_profile_res[i].tsk = tsk;
+ tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i];
+ tsk->rmid = 0;
+ spin_unlock(&phase_partid_lock);
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ //trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, i);
+ //}
+
+ __mpam_sched_in_v2(tsk);
+}
+
+static void phase_release_mpam_label_without_lock(struct task_struct *tsk)
+{
+ int closid;
+ struct memqos_wait_profile *next;
+
+ //assert locked
+
+ if (tsk->sched_memqos.mpam_profile.profile &&
+ tsk->sched_memqos.mpam_profile.profile->partid) {
+ closid = tsk->sched_memqos.mpam_profile.profile->partid;
+ } else if (tsk->closid == 0) {
+ return;
+ } else {
+ closid = tsk->closid;
+ }
+
+ tsk->closid = 0;
+ tsk->sched_memqos.mpam_profile.profile = NULL;
+ mpam_profile_res[closid].used = 0;
+ mpam_profile_res[closid].tsk = NULL;
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("task pid:%d release partid%d, list empty:%d\n", tsk->pid, closid, list_empty(&phase_mpam_waitqueue));
+ //}
+
+ next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list);
+ if (next) {
+ list_del_init(&next->wait_list);
+ mpam_profile_res[closid].used = 1;
+ next->profile = &mpam_profile_res[closid];
+ }
+
+ return;
+}
+
+//task shutdown
+void phase_destroy_waitqueue(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+
+ //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) {
+ list_del_init(&tsk->sched_memqos.mpam_profile.wait_list);
+ //} else {
+ phase_release_mpam_label_without_lock(tsk);
+ //}
+ spin_unlock(&phase_partid_lock);
+}
+
+void phase_release_mpam_label(struct task_struct *tsk)
+{
+ spin_lock(&phase_partid_lock);
+ phase_release_mpam_label_without_lock(tsk);
+ spin_unlock(&phase_partid_lock);
+}
+
+#include <asm/mpam.h>
+void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr)
+{
+ if (pm == &mpam_profile_res[0] || pm == NULL) {
+ result[0] = 0;
+ result[1] = 0;
+ result[2] = 0;
+ result[3] = 0;
+ return;
+ }
+
+ mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr);
+}
diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c
new file mode 100644
index 0000000000000..7b7f37e46f76c
--- /dev/null
+++ b/kernel/sched/memqos/phase_perf.c
@@ -0,0 +1,412 @@
+#include <linux/kernel.h>
+#include <linux/perf_event.h>
+#include <linux/percpu-defs.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
+#include <linux/memqos.h>
+#include <linux/sched.h>
+
+#define PHASE_FEVENT_NUM 3
+
+int *phase_perf_pevents = NULL;
+
+static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events);
+
+/******************************************
+ * Helpers for phase perf event
+ *****************************************/
+static inline struct perf_event *perf_event_of_cpu(int cpu, int index)
+{
+ return per_cpu(cpu_phase_perf_events, cpu)[index];
+}
+
+static inline struct perf_event **perf_events_of_cpu(int cpu)
+{
+ return per_cpu(cpu_phase_perf_events, cpu);
+}
+
+static inline u64 perf_event_local_pmu_read(struct perf_event *event)
+{
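+ /*
+ * The unconditional return below (and the matching ones in
+ * create_cpu_counter() and release_cpu_counter()) keeps the hardware
+ * counter path disabled in this debug patch.
+ */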
+ return 0;
+ if (event->state == PERF_EVENT_STATE_ACTIVE)
+ event->pmu->read(event);
+ return local64_read(&event->count);
+}
+
+/******************************************
+ * Helpers for cpu counters
+ *****************************************/
+static inline u64 read_cpu_counter(int cpu, int index)
+{
+ struct perf_event *event = perf_event_of_cpu(cpu, index);
+
+ if (!event || !event->pmu)
+ return 0;
+
+ return perf_event_local_pmu_read(event);
+}
+
+static struct perf_event_attr *alloc_attr(int event_id)
+{
+ struct perf_event_attr *attr;
+
+ attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL);
+ if (!attr)
+ return ERR_PTR(-ENOMEM);
+
+ attr->type = PERF_TYPE_RAW;
+ attr->config = event_id;
+ attr->size = sizeof(struct perf_event_attr);
+ attr->pinned = 1;
+ attr->disabled = 1;
+ //attr->exclude_hv;
+ //attr->exclude_idle;
+ //attr->exclude_kernel;
+
+ return attr;
+}
+
+static int create_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event_attr *attr = NULL;
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ return 0;
+ attr = alloc_attr(event_id);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+
+ event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL);
+ if (IS_ERR(event)) {
+ pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld",
+ cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event));
+ kfree(attr);
+ return PTR_ERR(event);
+ } else {
+ events[index] = event;
+ perf_event_enable(events[index]);
+ if (event->hw.idx == -1) {
+ pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)",
+ cpu, attr->type, attr->config);
+ kfree(attr);
+ return -EINVAL;
+ }
+ pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config, event);
+ }
+
+ kfree(attr);
+ return 0;
+}
+
+static int release_cpu_counter(int cpu, int event_id, int index)
+{
+ struct perf_event **events = perf_events_of_cpu(cpu);
+ struct perf_event *event = NULL;
+
+ return 0;
+ event = events[index];
+
+ if (!event)
+ return 0;
+
+ pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d"
+ "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)",
+ event->cpu, event->hw.idx,
+ event->attr.type, event->attr.pinned, event->attr.exclude_hv,
+ event->attr.exclude_idle, event->attr.exclude_kernel,
+ event->attr.config);
+
+ perf_event_release_kernel(event);
+ events[index] = NULL;
+
+ return 0;
+}
+
+enum {
+ CYCLES_INDEX = 0,
+ INST_RETIRED_INDEX,
+ PHASE_EVENT_FINAL_TERMINATOR
+};
+
+#define CYCLES 0x0011
+#define INST_RETIRED 0x0008
+
+static int pevents[PHASE_PEVENT_NUM] = {
+ CYCLES,
+ INST_RETIRED,
+ PHASE_EVENT_FINAL_TERMINATOR,
+};
+
+#define for_each_phase_pevents(index, events) \
+ for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \
+ events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++)
+
+
+/******************************************
+ * Helpers for phase perf
+ *****************************************/
+static int do_pevents(int (*fn)(int, int, int), int cpu)
+{
+ int index;
+ int err;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ err = fn(cpu, phase_perf_pevents[index], index);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int __phase_perf_create(void *args)
+{
+ int err;
+ int cpu = raw_smp_processor_id();
+
+ /* create pinned events */
+ pr_info("create pinned events\n");
+ err = do_pevents(create_cpu_counter, cpu);
+ if (err) {
+ pr_err("create pinned events failed\n");
+ do_pevents(release_cpu_counter, cpu);
+ return err;
+ }
+
+ pr_info("[%d] phase class event create success\n", cpu);
+ return 0;
+}
+
+static int do_phase_perf_create(int *pevents, const struct cpumask *cpus)
+{
+ phase_perf_pevents = pevents;
+ return stop_machine(__phase_perf_create, NULL, cpus);
+}
+
+static int __do_phase_perf_release(void *args)
+{
+ int cpu = raw_smp_processor_id();
+
+ /* release pinned events */
+ pr_info("release pinned events\n");
+ do_pevents(release_cpu_counter, cpu);
+
+ pr_info("[%d] phase class event release success\n", cpu);
+ return 0;
+}
+
+static void do_phase_perf_release(const struct cpumask *cpus)
+{
+ stop_machine(__do_phase_perf_release, NULL, cpus);
+ phase_perf_pevents = NULL;
+}
+
+int phase_perf_create(void)
+{
+ return do_phase_perf_create(pevents, cpu_possible_mask);
+}
+
+void phase_perf_release(void)
+{
+ do_phase_perf_release(cpu_possible_mask);
+}
+
+DECLARE_STATIC_KEY_FALSE(sched_phase);
+DECLARE_STATIC_KEY_FALSE(sched_phase_printk);
+
+#define PHASE_EVENT_OVERFLOW (~0ULL)
+
+static inline u64 phase_event_count_sub(u64 curr, u64 prev)
+{
+ if (curr < prev) { /* ovewrflow */
+ u64 tmp = PHASE_EVENT_OVERFLOW - prev;
+ return curr + tmp;
+ } else {
+ return curr - prev;
+ }
+}
+
+static inline void phase_calc_delta(struct task_struct *p,
+ struct phase_event_count *prev,
+ struct phase_event_count *curr,
+ struct phase_event_count *delta)
+{
+ int *pevents = phase_perf_pevents;
+ int index;
+
+ for_each_phase_pevents(index, pevents) {
+ delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]);
+ }
+}
+
+static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id)
+{
+ int index;
+ int *events = phase_perf_pevents;
+
+ for_each_phase_pevents(index, events) {
+ if (event_id == events[index])
+ return counter->data[index];
+ }
+
+ return 0;
+}
+
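+/*
+ * Average the last c_nr non-zero entries of a ring buffer of size nr ending
+ * at s_pos (exclusive). Returns 0 if there are no usable entries.
+ */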
+static int cal_ring_history_average(int *history, int nr, int s_pos, int c_nr)
+{
+ int average = 0;
+ int start = ((s_pos - c_nr) + nr) % nr;
+
+ if (start < 0)
+ return 0;
+
+ for (; start != s_pos; start = (start + 1) % nr) {
+ if (history[start] == 0) {
+ /* skip empty slots and shrink the divisor accordingly */
+ c_nr--;
+ if (c_nr == 0)
+ return 0;
+ continue;
+ }
+ average += history[start];
+ }
+
+ return average / c_nr;
+}
+
+static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta)
+{
+ u64 ins;
+ u64 cycles;
+ //0 means this sample is invalid
+ int ipcx10 = 0;
+
+ ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED);
+ cycles = phase_data_of_pevent(&delta->pcount, CYCLES);
+
+ if (cycles)
+ ipcx10 = (ins * 10) / cycles;
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("ins:%lld cycles:%lld\n", ins, cycles);
+ //}
+
+ p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10;
+ p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10;
+ cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5);
+}
+
+static void __phase_cal_memband_div_10(struct task_struct *p)
+{
+ int pos;
+ int result[4];
+
+ pos = p->sched_memqos.memband_ringpos;
+
+ phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4);
+
+ //if (static_branch_unlikely(&sched_phase_printk)) {
+ // trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile);
+ //}
+
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos];
+ p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10;
+ p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10;
+
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos];
+ p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10;
+ p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10;
+
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos];
+ p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10;
+ p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10;
+
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos];
+ p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10;
+ p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10;
+
+ p->sched_memqos.memband_ringpos = (pos + 1) % 10;
+
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5);
+ cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5);
+}
+
+static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count);
+static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count);
+
+static void phase_perf_read_events(int cpu, u64 *pdata)
+{
+ int index;
+
+ for_each_phase_pevents(index, phase_perf_pevents) {
+ pdata[index] = read_cpu_counter(cpu, index);
+ }
+}
+
+static inline struct phase_event_count *phase_read_prev(unsigned int cpu)
+{
+ return &per_cpu(prev_phase_event_count, cpu);
+}
+
+static inline struct phase_event_count *phase_read_curr(unsigned int cpu)
+{
+ struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu);
+
+ phase_perf_read_events(cpu, curr->pcount.data);
+
+ return curr;
+}
+
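+/*
+ * Per-sample accounting: diff the pinned per-CPU counters against the last
+ * snapshot, update the task's IPC*10 and per-node memband/10 ring buffers,
+ * and mark the task ready for classification every third sample.
+ */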
+void phase_account_task(struct task_struct *p, int cpu)
+{
+ struct phase_event_count delta;
+ struct phase_event_count *prev, *curr;
+
+ if (!static_branch_likely(&sched_phase))
+ return;
+
+ //if (!sched_core_enabled(cpu_rq(cpu)))
+ // return;
+
+ /* update phase_event_count */
+ prev = phase_read_prev(cpu);
+ curr = phase_read_curr(cpu);
+ phase_calc_delta(p, prev, curr, &delta);
+ *prev = *curr;
+
+ /* calculate phase */
+ __phase_cal_ipcx10(p, &delta);
+ __phase_cal_memband_div_10(p);
+ p->sched_memqos.sample_times++;
+ if ((p->sched_memqos.sample_times % 3) == 0)
+ p->sched_memqos.account_ready = 1;
+}
+
+
+void phase_trace_printk(struct task_struct *p)
+{
+ if (!static_branch_unlikely(&sched_phase_printk))
+ return;
+
+ trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n",
+ p->comm, p->pid, p->sched_memqos.ipc_ringpos,\
+ p->sched_memqos.ipcx10, \
+ p->sched_memqos.memband_ringpos,\
+ p->sched_memqos.memband_div_10, \
+ p->sched_memqos.numa_score[0], \
+ p->sched_memqos.numa_score[1], \
+ p->sched_memqos.numa_score[2], \
+ p->sched_memqos.numa_score[3], \
+ p->sched_memqos.turbo, \
+ p->sched_memqos.preferred_nid, \
+ p->sched_memqos.class_id, \
+ p->closid);
+}
diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c
new file mode 100644
index 0000000000000..b80bb6b9ae0a3
--- /dev/null
+++ b/kernel/sched/memqos/phase_sim_knn.c
@@ -0,0 +1,92 @@
+#include <linux/types.h>
+
+#define DATA_ROW 16
+void QuickSort(u64 arr[DATA_ROW][2], int L, int R) {
+ int i = L;
+ int j = R;
+ int kk = (L + R) / 2;
+ u64 pivot = arr[kk][0];
+
+ while (i <= j) {
+ while (pivot > arr[i][0]) {
+ i++;
+ }
+ while (pivot < arr[j][0]) {
+ j--;
+ }
+ if (i <= j) {
+ u64 temp = arr[i][0];
+
+ arr[i][0] = arr[j][0];
+ arr[j][0] = temp;
+ i++; j--;
+ }
+ }
+ if (L < j) {
+ QuickSort(arr, L, j);
+ }
+ if (i < R) {
+ QuickSort(arr, i, R);
+ }
+}
+
+u64 euclidean_distance(u64 *row1, u64 *row2, int col) {
+ u64 distance = 0;
+ int i;
+
+ for (i = 0; i < col - 1; i++) {
+ distance += ((row1[i] - row2[i]) * (row1[i] - row2[i]));
+ }
+ return distance;
+}
+
+#define num_neighbors 6
+#define MAX_TAG 8
+
+int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) {
+ int i;
+ u64 neighbors[MAX_TAG] = {0};
+ int max_tag = 0;
+ u64 distances[DATA_ROW][2];
+
+ for (i = 0; i < train_row; i++) {
+ distances[i][0] = euclidean_distance(train_data[i], test_row, col);
+ distances[i][1] = train_data[i][col - 1];
+ }
+ QuickSort(distances, 0, train_row - 1);
+ for (i = 0; i < num_neighbors; i++) {
+ neighbors[distances[i][1]]++;
+ if (neighbors[distances[i][1]] > neighbors[max_tag])
+ max_tag = distances[i][1];
+ }
+ return max_tag;
+}
+
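+/*
+ * Hand-written training set: {ipcx10, memband_div_10, class tag}. Only the
+ * bandwidth column varies, so the KNN effectively bins tasks into eight
+ * classes by memory bandwidth.
+ */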
+static u64 train_data[DATA_ROW][3] = {
+ {0, 1, 0},
+ {0, 9, 0},
+ {0, 20, 1},
+ {0, 30, 1},
+ {0, 40, 2},
+ {0, 50, 3},
+ {0, 60, 3},
+ {0, 70, 3},
+ {0, 80, 4},
+ {0, 90, 4},
+ {0, 100, 4},
+ {0, 110, 5},
+ {0, 120, 5},
+ {0, 130, 6},
+ {0, 140, 6},
+ {0, 150, 7},
+};
+
+int knn_get_tag(int ipcx10, int memband_div_10)
+{
+ u64 test_data[2];
+
+ test_data[0] = ipcx10;
+ test_data[1] = memband_div_10;
+
+ return get_neighbors_tag(train_data, DATA_ROW, 3, test_data);
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 685f9881b8e23..0d2764c4449ce 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
+ {
+ .procname = "phase",
+ .mode = 0555,
+ .child = phase_table,
+ },
+#endif
#endif /* CONFIG_SCHED_DEBUG */
{
.procname = "sched_rt_period_us",
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4cac46d56f387..d748c291e7047 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
{
struct mempolicy *pol;
struct page *page;
- int preferred_nid;
+ int preferred_nid = -1;
nodemask_t *nmask;
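+ /* sched_memqos.preferred_nid stores nid + 1; 0 means no preference */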
+ if (current->sched_memqos.preferred_nid)
+ preferred_nid = current->sched_memqos.preferred_nid - 1;
+
pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) {
+ if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
@@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
nmask = policy_nodemask(gfp, pol);
- preferred_nid = policy_node(gfp, pol, node);
+ if (preferred_nid == -1)
+ preferred_nid = policy_node(gfp, pol, node);
page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
mark_vma_cdm(nmask, page, vma);
mpol_cond_put(pol);
--
2.25.1