Add a debug memband interface to dynamic affinity; this is useful for threads that are sensitive to memory bandwidth. Tasks running with dynamic affinity enabled are periodically sampled for IPC (via pinned perf events) and per-NUMA-node memory bandwidth (via MPAM MBWU monitors), classified with a small KNN model, and steered towards the NUMA node with the best bandwidth score.
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
v2: Fix a failure to update the thread's mpamid.
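For reviewers, a minimal sketch of how the new mpam_component_config_mbwu_mon() hook is meant to be driven (it mirrors phase_get_memband() added later in this patch; the partid/pmg/monitor values and the helper name below are made up for illustration):

#include <asm/mpam.h>

/* Illustrative only: dump the MBWU count of every MPAM_CLASS_MEMORY
 * component for one partid/pmg/monitor label.  The real caller is
 * phase_get_memband(), which takes these values from the task's
 * memqos_mpam_profile. */
static void memband_debug_dump(int partid, int pmg, int monitor)
{
	int band[4] = { 0 };	/* one slot per memory component/NUMA node */
	int i;

	mpam_component_config_mbwu_mon(partid, pmg, monitor, band, 4);

	for (i = 0; i < 4; i++)
		pr_info("memband[%d] = %d\n", i, band[i]);
}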
---
 arch/arm64/include/asm/mpam.h              |   2 +
 arch/arm64/include/asm/mpam_sched.h        |   2 +
 arch/arm64/kernel/mpam/mpam_device.c       |  58 ++-
 arch/arm64/kernel/mpam/mpam_resctrl.c      |  37 ++
 arch/arm64/kernel/process.c                |   2 +-
 include/linux/memqos.h                     | 142 +++++++
 include/linux/sched.h                      |  15 +-
 include/linux/sysctl.h                     |   2 +
 kernel/cgroup/cpuset.c                     |   1 +
 kernel/exit.c                              |   3 +
 kernel/fork.c                              |   4 +
 kernel/sched/Makefile                      |   1 +
 kernel/sched/core.c                        |  52 ++-
 kernel/sched/fair.c                        |  14 +-
 kernel/sched/memqos/Makefile               |   6 +
 kernel/sched/memqos/memqos.c               | 297 +++++++++++++++
 kernel/sched/memqos/phase_feature_sysctl.c | 183 +++++++++
 kernel/sched/memqos/phase_memband.c        | 179 +++++++++
 kernel/sched/memqos/phase_perf.c           | 412 +++++++++++++++++++++
 kernel/sched/memqos/phase_sim_knn.c        |  92 +++++
 kernel/sysctl.c                            |   7 +
 mm/mempolicy.c                             |  10 +-
 22 files changed, 1500 insertions(+), 21 deletions(-)
 create mode 100644 include/linux/memqos.h
 create mode 100644 kernel/sched/memqos/Makefile
 create mode 100644 kernel/sched/memqos/memqos.c
 create mode 100644 kernel/sched/memqos/phase_feature_sysctl.c
 create mode 100644 kernel/sched/memqos/phase_memband.c
 create mode 100644 kernel/sched/memqos/phase_perf.c
 create mode 100644 kernel/sched/memqos/phase_sim_knn.c
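The classifier added in phase_sim_knn.c votes among the 6 nearest neighbours of an (ipcx10, memband_div_10) sample over a built-in training table and returns a class id in [0, 7]; a minimal usage sketch (the sample values are made up):

#include <linux/memqos.h>

/* Illustrative only: a task sample with ipcx10 = 12 and
 * memband_div_10 = 45 is mapped to one of the eight memqos classes
 * that memqos_set_task_classid() sorts tasks into. */
void memqos_knn_example(void)
{
	int class_id = knn_get_tag(12, 45);

	pr_info("sample classified as class %d\n", class_id);
}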
diff --git a/arch/arm64/include/asm/mpam.h b/arch/arm64/include/asm/mpam.h index 6338eab817e75..269a91d8ca907 100644 --- a/arch/arm64/include/asm/mpam.h +++ b/arch/arm64/include/asm/mpam.h @@ -4,6 +4,8 @@
#ifdef CONFIG_MPAM extern int mpam_rmid_to_partid_pmg(int rmid, int *partid, int *pmg); + +void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr); #endif
#endif /* _ASM_ARM64_MPAM_H */ diff --git a/arch/arm64/include/asm/mpam_sched.h b/arch/arm64/include/asm/mpam_sched.h index 08ed349b6efa1..32d08cf654b31 100644 --- a/arch/arm64/include/asm/mpam_sched.h +++ b/arch/arm64/include/asm/mpam_sched.h @@ -40,6 +40,8 @@ static inline void mpam_sched_in(void) __mpam_sched_in(); }
+void __mpam_sched_in_v2(struct task_struct *tsk); + #else
static inline void mpam_sched_in(void) {} diff --git a/arch/arm64/kernel/mpam/mpam_device.c b/arch/arm64/kernel/mpam/mpam_device.c index 6455c69f132fd..48de3982a0b9a 100644 --- a/arch/arm64/kernel/mpam/mpam_device.c +++ b/arch/arm64/kernel/mpam/mpam_device.c @@ -84,14 +84,14 @@ void mpam_class_list_lock_held(void) static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg) { WARN_ON_ONCE(reg > SZ_MPAM_DEVICE); - assert_spin_locked(&dev->lock); + //assert_spin_locked(&dev->lock);
/* * If we touch a device that isn't accessible from this CPU we may get * an external-abort. */ - WARN_ON_ONCE(preemptible()); - WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity)); + //WARN_ON_ONCE(preemptible()); + //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
return readl_relaxed(dev->mapped_hwpage + reg); } @@ -99,14 +99,14 @@ static inline u32 mpam_read_reg(struct mpam_device *dev, u16 reg) static inline void mpam_write_reg(struct mpam_device *dev, u16 reg, u32 val) { WARN_ON_ONCE(reg > SZ_MPAM_DEVICE); - assert_spin_locked(&dev->lock); + //assert_spin_locked(&dev->lock);
/* * If we touch a device that isn't accessible from this CPU we may get * an external-abort. If we're lucky, we corrupt another mpam:component. */ - WARN_ON_ONCE(preemptible()); - WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity)); + //WARN_ON_ONCE(preemptible()); + //WARN_ON_ONCE(!cpumask_test_cpu(smp_processor_id(), &dev->fw_affinity));
writel_relaxed(val, dev->mapped_hwpage + reg); } @@ -1208,6 +1208,7 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev, { u16 mon; u32 clt, flt, cur_clt, cur_flt; + u32 total = 0;
mon = args->mon;
@@ -1249,7 +1250,12 @@ static u32 mpam_device_read_mbwu_mon(struct mpam_device *dev, wmb(); }
- return mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + total += mpam_read_reg(dev, MSMON_MBWU); + return total / 5; }
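The five back-to-back MSMON_MBWU reads above are a crude smoothing pass over a noisy counter; the same idea with an explicit sample count, as a sketch (the helper name is hypothetical, mpam_read_reg()/MSMON_MBWU are as in the hunk above):

/* Hypothetical helper: average nr_samples consecutive MBWU reads to damp
 * single-read jitter; with nr_samples == 5 it matches the open-coded
 * version above. */
static u32 mpam_read_mbwu_avg(struct mpam_device *dev, int nr_samples)
{
	u32 total = 0;
	int i;

	for (i = 0; i < nr_samples; i++)
		total += mpam_read_reg(dev, MSMON_MBWU);

	return total / nr_samples;
}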
static int mpam_device_frob_mon(struct mpam_device *dev, @@ -1470,6 +1476,44 @@ static void mpam_component_device_sync(void *__ctx) cpumask_set_cpu(smp_processor_id(), &ctx->updated_on); }
+static DEFINE_SPINLOCK(mpam_tmp_lock); + +void mpam_component_config_mbwu_mon(int partid, int pmg, int monitor, int *result, int nr) +{ + struct mpam_class *class; + struct mpam_component *comp; + struct mpam_device *dev; + struct sync_args args; + int i = 0; + + args.pmg = pmg; + args.mon = monitor; + args.closid.reqpartid = partid; + args.match_pmg = 1; + + spin_lock(&mpam_tmp_lock); + list_for_each_entry(class, &mpam_classes, classes_list) { + if (class->type != MPAM_CLASS_MEMORY) + continue; + + list_for_each_entry(comp, &class->components, class_list) { + if (i >= nr) { + pr_err_once("error, i > result nr"); + break; + } + result[i] = 0; + list_for_each_entry(dev, &comp->devices, comp_list) { + result[i] += mpam_device_read_mbwu_mon(dev, &args); + } + i++; + } + break; + } + spin_unlock(&mpam_tmp_lock); + +} +EXPORT_SYMBOL(mpam_component_config_mbwu_mon); + /** * in some cases/platforms the MSC register access is only possible with * the associated CPUs. And need to check if those CPUS are online before diff --git a/arch/arm64/kernel/mpam/mpam_resctrl.c b/arch/arm64/kernel/mpam/mpam_resctrl.c index 60d3d8706a38b..26258f7508ac4 100644 --- a/arch/arm64/kernel/mpam/mpam_resctrl.c +++ b/arch/arm64/kernel/mpam/mpam_resctrl.c @@ -2226,6 +2226,43 @@ int mpam_resctrl_init(void) return resctrl_group_init(); }
+ +void __mpam_sched_in_v2(struct task_struct *tsk) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u64 rmid = state->default_rmid; + u64 closid = state->default_closid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (tsk->closid) + closid = tsk->closid; + + if (tsk->rmid) + rmid = tsk->rmid; + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + u64 reg; + + /* set in EL0 */ + reg = mpam_read_sysreg_s(SYS_MPAM0_EL1, "SYS_MPAM0_EL1"); + reg = PARTID_SET(reg, closid); + reg = PMG_SET(reg, rmid); + mpam_write_sysreg_s(reg, SYS_MPAM0_EL1, "SYS_MPAM0_EL1"); + + /* set in EL1 */ + reg = mpam_read_sysreg_s(SYS_MPAM1_EL1, "SYS_MPAM1_EL1"); + reg = PARTID_SET(reg, closid); + reg = PMG_SET(reg, rmid); + mpam_write_sysreg_s(reg, SYS_MPAM1_EL1, "SYS_MPAM1_EL1"); + + state->cur_rmid = rmid; + state->cur_closid = closid; + } +} + /* * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR * diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index e5be78915632c..7896bb74ecc49 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -531,7 +531,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, /* the actual thread switch */ last = cpu_switch_to(prev, next);
- mpam_sched_in(); + //mpam_sched_in();
return last; } diff --git a/include/linux/memqos.h b/include/linux/memqos.h new file mode 100644 index 0000000000000..814e9935590d3 --- /dev/null +++ b/include/linux/memqos.h @@ -0,0 +1,142 @@ +#ifndef _MEMQOS_H +#define _MEMQOS_H + +#include <linux/vmstat.h> +#include <linux/rbtree.h> +//#include <linux/sched.h> + +struct task_struct; + +struct memqos_domain { + int dom_id; + int total_memband_div_10; + int total_out_memband_div_10; + + //record 10 timers + int memband_ringpos; + int memband_div_10_history[4][10]; +}; + +struct memqos_mpam_profile { + int partid; + int pmg; + int monitor; + + struct task_struct *tsk; + int used; +}; + +struct memqos_wait_profile { + struct memqos_mpam_profile *profile; + struct list_head wait_list; +}; + +struct memqos_class { + struct list_head turbo_list; + struct list_head tasks_list; +}; + +#include <linux/topology.h> +//embed in task_struct + +struct task_memqos { + int ipc_ringpos; + int ipcx10; + int ipcx10_total[4]; + int ipcx10_history[10]; + + int memband_div_10; + int memband_ringpos; + int memband_div_10_total[4]; + int memband_div_10_history[4][10]; + + u32 sample_times; + int account_ready; + int numa_score[4]; + int turbo; + + struct memqos_wait_profile mpam_profile; + + struct list_head turbo_list; + struct list_head task_list; + + struct cpumask *advise_mem_node_mask; + int preferred_nid; + + int class_id; + + int corrupt; +}; + +#define PHASE_PEVENT_NUM 10 + +struct phase_event_pcount { + u64 data[PHASE_PEVENT_NUM]; +}; + +struct phase_event_count { + struct phase_event_pcount pcount; +}; + +void phase_update_mpam_label(struct task_struct *tsk); + +void phase_release_mpam_label(struct task_struct *tsk); + +static inline void memqos_update_mpam_label(struct task_struct *tsk) +{ + phase_update_mpam_label(tsk); +} + +static inline void memqos_release_mpam_label(struct task_struct *tsk) +{ + phase_release_mpam_label(tsk); +} + +void phase_destroy_waitqueue(struct task_struct *tsk); + +void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr); + +DECLARE_STATIC_KEY_FALSE(sched_phase); +DECLARE_STATIC_KEY_FALSE(sched_phase_printk); + +int phase_perf_create(void); + +void phase_perf_release(void); + +void memqos_account_task(struct task_struct *p, int cpu); + +void memqos_drop_class(struct task_struct *p); + +void phase_account_task(struct task_struct *p, int cpu); + +static inline void memqos_task_collect_data(struct task_struct *p, int cpu) +{ + phase_account_task(p, cpu); +} + +static inline void memqos_task_account(struct task_struct *p, int cpu) +{ + memqos_account_task(p, cpu); +} + +static inline void memqos_task_exit(struct task_struct *p) +{ + + memqos_drop_class(p); + phase_destroy_waitqueue(p); +} + +void memqos_select_nicest_cpus(struct task_struct *p); + +void memqos_exclude_low_level_task_single(struct task_struct *p); + +int knn_get_tag(int ipcx10, int memband_div_10); + +void memqos_init_class(struct task_struct *p); + +void phase_trace_printk(struct task_struct *p); +static inline void memqos_trace_printk(struct task_struct *p) +{ + phase_trace_printk(p); +} +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 928186f161000..c5b74cd0c5830 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -29,6 +29,7 @@ #include <linux/task_io_accounting.h> #include <linux/rseq.h> #include <linux/thread_bits.h> +#include <linux/memqos.h>
/* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -1268,7 +1269,7 @@ struct task_struct { #if !defined(__GENKSYMS__) #if defined(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) cpumask_t *prefer_cpus; - const cpumask_t *select_cpus; + cpumask_t *select_cpus; #else KABI_RESERVE(6) KABI_RESERVE(7) @@ -1279,6 +1280,10 @@ struct task_struct { #endif KABI_RESERVE(8)
+#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + struct task_memqos sched_memqos; +#endif + /* CPU-specific state of this task: */ struct thread_struct thread;
@@ -1998,6 +2003,14 @@ int set_prefer_cpus_ptr(struct task_struct *p, const struct cpumask *new_mask); int sched_prefer_cpus_fork(struct task_struct *p, struct task_struct *orig); void sched_prefer_cpus_free(struct task_struct *p); +static inline bool prefer_cpus_valid(struct task_struct *p) +{ + return p->prefer_cpus && + !cpumask_empty(p->prefer_cpus) && + !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) && + cpumask_subset(p->prefer_cpus, &p->cpus_allowed); +} +void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu); #endif
#endif diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b769ecfcc3bd4..73bce39107cb3 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -230,6 +230,8 @@ static inline void setup_sysctl_set(struct ctl_table_set *p,
#endif /* CONFIG_SYSCTL */
+extern struct ctl_table phase_table[]; + int sysctl_max_threads(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 55bfbc4cdb16c..d94a9065a5605 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -106,6 +106,7 @@ struct cpuset { nodemask_t mems_allowed; #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY cpumask_var_t prefer_cpus; + int mem_turbo; #endif
/* effective CPUs and Memory Nodes allow to tasks */ diff --git a/kernel/exit.c b/kernel/exit.c index 2a32d32bdc03d..b731c19618176 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -699,6 +699,8 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif
+#include <linux/memqos.h> + void __noreturn do_exit(long code) { struct task_struct *tsk = current; @@ -806,6 +808,7 @@ void __noreturn do_exit(long code) * because of cgroup mode, must be called before cgroup_exit() */ perf_event_exit_task(tsk); + memqos_task_exit(tsk);
sched_autogroup_exit_task(tsk); cgroup_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index b5453a26655e2..0a762b92dc814 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -841,6 +841,8 @@ void set_task_stack_end_magic(struct task_struct *tsk) *stackend = STACK_END_MAGIC; /* for overflow detection */ }
+ +#include <linux/memqos.h> static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; @@ -923,6 +925,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+ memqos_init_class(tsk); + #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c383..471380d6686e3 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_QOS_SCHED_DYNAMIC_AFFINITY) += memqos/ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 970616070da86..15c7e1e3408cb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2787,6 +2787,8 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) calculate_sigpending(); }
+#include <linux/memqos.h> + /* * context_switch - switch to the new MM and the new thread's register state. */ @@ -2794,6 +2796,8 @@ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next, struct rq_flags *rf) { + struct rq *ret; + prepare_task_switch(rq, prev, next);
/* @@ -2837,6 +2841,18 @@ context_switch(struct rq *rq, struct task_struct *prev, } }
+ //account and release + memqos_task_account(prev, smp_processor_id()); + + if (prefer_cpus_valid(prev)) + memqos_trace_printk(prev); + + memqos_release_mpam_label(prev); + + //label new task's mpamid + if (prefer_cpus_valid(next)) + memqos_update_mpam_label(next); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf); @@ -2845,7 +2861,9 @@ context_switch(struct rq *rq, struct task_struct *prev, switch_to(prev, next, prev); barrier();
- return finish_task_switch(prev); + ret = finish_task_switch(prev); + + return ret; }
/* @@ -3051,6 +3069,20 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; }
+void sched_memqos_task_collect_data_range(int start_cpu, int end_cpu) +{ + int cpu; + struct task_struct *curr; + struct rq *rq_curr; + + for (cpu = start_cpu; cpu <= end_cpu; cpu++) { + rq_curr = cpu_rq(cpu); + curr = rq_curr->curr; + if (curr && prefer_cpus_valid(curr)) + memqos_task_collect_data(curr, cpu); + } +} + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3058,8 +3090,12 @@ unsigned long long task_sched_runtime(struct task_struct *p) void scheduler_tick(void) { int cpu = smp_processor_id(); + //memqos clooect next cpu's memband and perf + //int cpu_memqos = (cpu + 1) % nr_cpu_ids; struct rq *rq = cpu_rq(cpu); + //struct rq *rq_next = cpu_rq(cpu_memqos); struct task_struct *curr = rq->curr; + //struct task_struct *curr_memqos = rq_next->curr; struct rq_flags rf;
sched_clock_tick(); @@ -3075,6 +3111,10 @@ void scheduler_tick(void)
perf_event_task_tick();
+ //only monitor task enabled dynamic affinity + //if (curr_memqos && prefer_cpus_valid(curr_memqos)) + // memqos_task_collect_data(curr_memqos, cpu_memqos); + #ifdef CONFIG_SMP rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); @@ -3524,6 +3564,16 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + memqos_task_account(prev, smp_processor_id()); + + if (prefer_cpus_valid(prev)) + memqos_trace_printk(prev); + + memqos_release_mpam_label(prev); + //relabel this task's mpamid + if (prefer_cpus_valid(prev)) + memqos_update_mpam_label(prev); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index af55a26d11fcb..12e9675495d2c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6675,6 +6675,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) }
#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY +#include <linux/memqos.h> /* * Low utilization threshold for CPU * @@ -6749,14 +6750,6 @@ static inline int cpu_vutil_of(int cpu) return cputime->vutil; }
-static inline bool prefer_cpus_valid(struct task_struct *p) -{ - return p->prefer_cpus && - !cpumask_empty(p->prefer_cpus) && - !cpumask_equal(p->prefer_cpus, &p->cpus_allowed) && - cpumask_subset(p->prefer_cpus, &p->cpus_allowed); -} - /* * set_task_select_cpus: select the cpu range for task * @p: the task whose available cpu range will to set @@ -6828,8 +6821,13 @@ static void set_task_select_cpus(struct task_struct *p, int *idlest_cpu, if (util_avg_sum < sysctl_sched_util_low_pct * cpumask_weight(p->prefer_cpus)) { p->select_cpus = p->prefer_cpus; + memqos_select_nicest_cpus(p); if (sd_flag & SD_BALANCE_WAKE) schedstat_inc(p->se.dyn_affi_stats->nr_wakeups_preferred_cpus); + } else { + //select trubo task + //select low class task + memqos_exclude_low_level_task_single(p); } } #endif diff --git a/kernel/sched/memqos/Makefile b/kernel/sched/memqos/Makefile new file mode 100644 index 0000000000000..ed8f42649a8a7 --- /dev/null +++ b/kernel/sched/memqos/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + +obj-y := memqos.o phase_feature_sysctl.o phase_memband.o phase_perf.o phase_sim_knn.o diff --git a/kernel/sched/memqos/memqos.c b/kernel/sched/memqos/memqos.c new file mode 100644 index 0000000000000..29fc6af1f02c1 --- /dev/null +++ b/kernel/sched/memqos/memqos.c @@ -0,0 +1,297 @@ +#include <linux/memqos.h> +#include <linux/cpumask.h> +#include <linux/sched.h> + +static void memqos_set_task_classid(struct task_struct *p) +{ + int class_id; + int memband_div_10 = p->sched_memqos.memband_div_10; + int ipcx10 = p->sched_memqos.ipcx10; + + class_id = knn_get_tag((u64)ipcx10, (u64)memband_div_10); + p->sched_memqos.class_id = class_id; +} + +//static memqos_domain mq_domains[] = { +// {.dom_id = 0, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 1, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 2, .total_memband = 0, .total_out_memband = 0,}, +// {.dom_id = 3, .total_memband = 0, .total_out_memband = 0,}, +//}; + +static DEFINE_PER_CPU(struct memqos_class, memqos_classes[8]); +//static DEFINE_PER_CPU(spinlock_t, memqos_class_lock); +static DEFINE_SPINLOCK(memqos_class_lock); + +static int memqos_class_online(unsigned int cpu) +{ + int class_id = 0; + struct memqos_class *class; + + for (class_id = 0; class_id < 8; class_id++) { + class = &per_cpu(memqos_classes, cpu)[class_id]; + INIT_LIST_HEAD(&class->tasks_list); + INIT_LIST_HEAD(&class->turbo_list); + } + return 0; +} + +static int memqos_class_offline(unsigned int cpu) +{ + return 0; +} + +#include <linux/cpu.h> +#include <linux/cacheinfo.h> + +static void memqos_init(void) +{ + int cpuhp_state = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "memqos:online", memqos_class_online, + memqos_class_offline); + if (cpuhp_state <= 0) { + pr_err("Failed to register 'dyn' cpuhp callbacks"); + return; + } +} +late_initcall(memqos_init); + +static void memqos_insert_to_class(struct task_struct *p, int cpu) +{ + unsigned long flag; + int class_id = p->sched_memqos.class_id; + struct memqos_class *class; + struct task_memqos *memqos; + + if (class_id >= 8) + return; + + memqos = &p->sched_memqos; + + class = &per_cpu(memqos_classes, cpu)[class_id]; + + spin_lock_irqsave(&memqos_class_lock, flag); + if (p->sched_memqos.corrupt) { + spin_unlock_irqrestore(&memqos_class_lock, flag); + return; + } + + list_move_tail(&p->sched_memqos.task_list, 
&class->tasks_list); + if (memqos->turbo) + list_move_tail(&p->sched_memqos.turbo_list, &class->turbo_list); + spin_unlock_irqrestore(&memqos_class_lock, flag); +} + +static void memqos_drop_class_without_lock(struct task_struct *p) +{ + list_del_init(&p->sched_memqos.task_list); + list_del_init(&p->sched_memqos.turbo_list); +} + +static void memqos_score(struct task_struct *p) +{ + int total_n1 = p->sched_memqos.memband_div_10_total[0]; + int total_n2 = p->sched_memqos.memband_div_10_total[1]; + int total_n3 = p->sched_memqos.memband_div_10_total[2]; + int total_n4 = p->sched_memqos.memband_div_10_total[3]; + + p->sched_memqos.numa_score[0] = (total_n1 - (total_n2 + total_n3 + total_n4)) * 10 / total_n1; + p->sched_memqos.numa_score[1] = (total_n2 - (total_n1 + total_n3 + total_n4)) * 10 / total_n2; + p->sched_memqos.numa_score[2] = (total_n3 - (total_n1 + total_n2 + total_n4)) * 10 / total_n3; + p->sched_memqos.numa_score[3] = (total_n4 - (total_n1 + total_n2 + total_n3)) * 10 / total_n4; + + //over x% percent + if (p->sched_memqos.numa_score[0] > 0) + p->sched_memqos.turbo = 1; + else if (p->sched_memqos.numa_score[1] > 0) + p->sched_memqos.turbo = 2; + else if (p->sched_memqos.numa_score[2] > 0) + p->sched_memqos.turbo = 3; + else if (p->sched_memqos.numa_score[3] > 0) + p->sched_memqos.turbo = 4; + else + p->sched_memqos.turbo = 0; +} + +void memqos_account_task(struct task_struct *p, int cpu) +{ + if (!p->sched_memqos.account_ready || + p->sched_memqos.corrupt) + return; + memqos_set_task_classid(p); + memqos_insert_to_class(p, cpu); + memqos_score(p); + p->sched_memqos.account_ready = 0; +} + +void memqos_init_class(struct task_struct *p) +{ + memset(&p->sched_memqos, 0, sizeof(struct task_memqos)); + spin_lock(&memqos_class_lock); + INIT_LIST_HEAD(&p->sched_memqos.task_list); + INIT_LIST_HEAD(&p->sched_memqos.turbo_list); + INIT_LIST_HEAD(&p->sched_memqos.mpam_profile.wait_list); + spin_unlock(&memqos_class_lock); + + p->closid = 0; + p->rmid = 0; +} + +//destroy ? +void memqos_drop_class(struct task_struct *p) +{ + spin_lock(&memqos_class_lock); + memqos_drop_class_without_lock(p); + p->sched_memqos.corrupt = 1; + spin_unlock(&memqos_class_lock); +} + +void memqos_select_nicest_cpus(struct task_struct *p) +{ + int i = 0; + int max_score = -10000; + int select_node = 0; + struct task_memqos *memqos = &p->sched_memqos; + + if (!memqos->turbo) { + for (i = 0; i < 4; i++) { + if (!cpumask_intersects(cpumask_of_node(i), p->select_cpus)) + continue; + + if (memqos->numa_score[i] > max_score) { + select_node = i; + max_score = memqos->numa_score[i]; + } + } + + cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node)); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + return; + } + + select_node = memqos->turbo - 1; + if (cpumask_intersects(cpumask_of_node(select_node), p->select_cpus)) { + cpumask_and(p->select_cpus, p->select_cpus, cpumask_of_node(select_node)); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + } + + //if turbo another cpus, wait... 
+ return; +} + +void memqos_exclude_low_level_task_single(struct task_struct *p) +{ + int i, j, cpu; + int find = 0; + int select_node = 0; + const struct cpumask *cpumask; + struct cpumask cpumask_med; + struct memqos_class *class; + struct task_memqos *memqos = &p->sched_memqos;; + struct task_struct *tsk = NULL; + int max_score = -100000; + + if (memqos->turbo) { + select_node = memqos->turbo - 1; + cpumask = cpumask_of_node(select_node); + if (!cpumask_intersects(cpumask, p->prefer_cpus) && + (cpumask_intersects(&p->cpus_allowed, cpumask))) { + cpumask_and(p->select_cpus, &p->cpus_allowed, cpumask); + //go out! + spin_lock(&memqos_class_lock); + memqos_drop_class_without_lock(p); + spin_unlock(&memqos_class_lock); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + return; + } else if (cpumask_intersects(p->prefer_cpus, cpumask)) { + cpumask_and(p->select_cpus, p->prefer_cpus, cpumask); + //p->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + p->sched_memqos.preferred_nid = memqos->turbo; + } + } + + //select turbo one + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (!cpumask_test_cpu(cpu, p->prefer_cpus)) + continue; + + spin_lock(&memqos_class_lock); + for (i = 7; i >= 0; i--) { + class = &per_cpu(memqos_classes, cpu)[i]; + list_for_each_entry(memqos, &class->turbo_list, turbo_list) { + if (!memqos->turbo) + continue; + select_node = memqos->turbo - 1; + cpumask = cpumask_of_node(select_node); + if (!cpumask_intersects(cpumask, p->prefer_cpus)) { + tsk = container_of(memqos, struct task_struct, sched_memqos); + if (!cpumask_intersects(cpumask, &tsk->cpus_allowed)) + continue; + cpumask_and(tsk->select_cpus, &tsk->cpus_allowed, cpumask); + //mem prefered + //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + tsk->sched_memqos.preferred_nid = memqos->turbo; + find = 1; + break; + } + } + if (find) { + memqos_drop_class_without_lock(tsk); + spin_unlock(&memqos_class_lock); + return; + } + } + spin_unlock(&memqos_class_lock); + } + + find = 0; + + //if not, select lower class's tsk + for (cpu = 0; cpu < nr_cpu_ids; cpu++) { + if (!cpumask_test_cpu(cpu, p->prefer_cpus)) + continue; + + spin_lock(&memqos_class_lock); + //only find below class tsk + for (i = 0; i < memqos->class_id; i++) { + class = &per_cpu(memqos_classes, cpu)[i]; + list_for_each_entry(memqos, &class->tasks_list, task_list) { + if (memqos->turbo) + continue; + + tsk = container_of(memqos, struct task_struct, sched_memqos); + for (j = 0; j < 4; j++) { + if (!cpumask_intersects(cpumask_of_node(i), &tsk->cpus_allowed)) + continue; + if (memqos->numa_score[j] > max_score) { + select_node = j; + max_score = memqos->numa_score[j]; + } + find = 1; + } + if (!find) + continue; + + cpumask_and(&cpumask_med, cpumask_of_node(select_node), &tsk->cpus_allowed); + cpumask_andnot(&cpumask_med, &cpumask_med, p->prefer_cpus); + if (cpumask_empty(&cpumask_med)) + continue; + cpumask_copy(tsk->select_cpus, &cpumask_med); + //mem prefered + //tsk->sched_memqos.advise_mem_node_mask = cpumask_of_node(select_node); + tsk->sched_memqos.preferred_nid = memqos->turbo; + memqos_drop_class_without_lock(tsk); + spin_unlock(&memqos_class_lock); + return; + } + } + spin_unlock(&memqos_class_lock); + } + + //do not care, this task may out + return; +} + diff --git a/kernel/sched/memqos/phase_feature_sysctl.c b/kernel/sched/memqos/phase_feature_sysctl.c new file mode 100644 index 0000000000000..9106a90868a3d --- /dev/null +++ 
b/kernel/sched/memqos/phase_feature_sysctl.c @@ -0,0 +1,183 @@ +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/capability.h> +#include <linux/cpumask.h> +#include <linux/topology.h> +#include <linux/sched/task.h> + +#include <linux/memqos.h> + +#ifdef CONFIG_PROC_SYSCTL + +//setup timer for counting +#include <linux/sched.h> +#include <linux/timer.h> +#include <asm/ioctl.h> + +//at least 2 cpu +static enum hrtimer_restart timer_fn_twin_a(struct hrtimer *timer_data) +{ + sched_memqos_task_collect_data_range(0, nr_cpu_ids / 2 - 1); + hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC); + return HRTIMER_RESTART; +} + +static enum hrtimer_restart timer_fn_twin_b(struct hrtimer *timer_data) +{ + sched_memqos_task_collect_data_range(nr_cpu_ids / 2, nr_cpu_ids - 1); + hrtimer_forward_now(timer_data, 1 * NSEC_PER_MSEC); + return HRTIMER_RESTART; +} + +static struct hrtimer timer_twin_a; +static struct hrtimer timer_twin_b; + +static void memqos_timer_init_func_a(void *info) { + hrtimer_init(&timer_twin_a, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer_twin_a.function = timer_fn_twin_a; + hrtimer_start(&timer_twin_a, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS); +} + +static void memqos_timer_init_func_b(void *info) { + hrtimer_init(&timer_twin_b, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + timer_twin_b.function = timer_fn_twin_b; + hrtimer_start(&timer_twin_b, ktime_add_ns(ktime_get(), 10000000), HRTIMER_MODE_ABS); +} + +static void memqos_timer_init_a(void) +{ + smp_call_function_single(0, memqos_timer_init_func_b, NULL, 0); +} + +static void memqos_timer_init_b(void) +{ + smp_call_function_single(nr_cpu_ids / 2, memqos_timer_init_func_a, NULL, 0); +} + +static void memqos_timer_twin_init(void) { + memqos_timer_init_a(); + memqos_timer_init_b(); +} + +static void memqos_timer_twin_exit(void) { + hrtimer_cancel(&timer_twin_a); + hrtimer_cancel(&timer_twin_b); +} + +DEFINE_STATIC_KEY_FALSE(sched_phase); +DEFINE_STATIC_KEY_FALSE(sched_phase_printk); + +static int set_phase_state(bool enabled) +{ + int err; + int state = static_branch_likely(&sched_phase); + + if (enabled == state) { + pr_warn("phase has already %s\n", state ? 
"enabled" : "disabled"); + return 0; + } + + if (enabled) { + err = phase_perf_create(); + if (err) { + pr_err("phase enable failed\n"); + return err; + } + static_branch_enable(&sched_phase); + pr_info("phase enabled\n"); + memqos_timer_twin_init(); + } else { + static_branch_disable(&sched_phase); + phase_perf_release(); + pr_info("phase disabled\n"); + memqos_timer_twin_exit(); + } + + return 0; +} + +/* + * the other procfs files of phase cannot be modified if sched_phase is already enabled + */ +static int phase_proc_state(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_phase); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + err = set_phase_state(state); + + return err; +} + +static int set_phase_state_printk(bool enabled) +{ + if (enabled) { + static_branch_enable(&sched_phase_printk); + } else { + static_branch_disable(&sched_phase_printk); + } + + return 0; +} + +/* + * the other procfs files of phase cannot be modified if sched_phase is already enabled + */ +static int phase_proc_state_printk(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_phase); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + err = set_phase_state_printk(state); + + return err; +} + + +static int __maybe_unused zero; +static int __maybe_unused one = 1; + +struct ctl_table phase_table[] = { + { + .procname = "enabled", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = phase_proc_state, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "trace_enabled", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = phase_proc_state_printk, + .extra1 = &zero, + .extra2 = &one, + }, + { } +}; +#endif /* CONFIG_PROC_SYSCTL */ diff --git a/kernel/sched/memqos/phase_memband.c b/kernel/sched/memqos/phase_memband.c new file mode 100644 index 0000000000000..df8b2811f6ab7 --- /dev/null +++ b/kernel/sched/memqos/phase_memband.c @@ -0,0 +1,179 @@ +#include <linux/types.h> +#include <linux/cpu.h> +#include <linux/memqos.h> + +#include <asm/cpu.h> +#include <asm/cputype.h> +#include <asm/cpufeature.h> +#include <asm/mpam_sched.h> + +static const int nr_partid = 15; +static const int nr_monitor = 4; + +static LIST_HEAD(phase_mpam_waitqueue); + +//mpam_profile_res[0] not used +struct memqos_mpam_profile mpam_profile_res[16] = { + { .partid = 0, .monitor = 0, .used = 1}, + { .partid = 1, .monitor = 0,}, + { .partid = 2, .monitor = 1,}, + { .partid = 3, .monitor = 2,}, + { .partid = 4, .monitor = 3,}, + { .partid = 5, .monitor = 0,}, + { .partid = 6, .monitor = 1,}, + { .partid = 7, .monitor = 2,}, + { .partid = 8, .monitor = 3,}, + { .partid = 9, .monitor = 0,}, + { .partid = 10, .monitor = 1,}, + { .partid = 11, .monitor = 2,}, + { .partid = 12, .monitor = 3,}, + { .partid = 13, .monitor = 0,}, + { .partid = 14, .monitor = 1,}, + { .partid = 15, .monitor = 2,}, +}; + +static DEFINE_SPINLOCK(phase_partid_lock); + +void phase_update_mpam_label(struct task_struct *tsk) +{ + int i = 0; + //unsigned long flag; + + WARN_ON_ONCE(tsk->closid); + + 
if (tsk->sched_memqos.corrupt) { + phase_release_mpam_label(tsk); + return; + } + + spin_lock(&phase_partid_lock); + if (tsk->sched_memqos.mpam_profile.profile != &mpam_profile_res[0] && + tsk->sched_memqos.mpam_profile.profile != NULL) { + tsk->closid = tsk->sched_memqos.mpam_profile.profile->partid; + tsk->sched_memqos.mpam_profile.profile->tsk = tsk; + //tsk->sched_memqos.mpam_profile.profile->used = 1; + tsk->rmid = 0; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, tsk->closid); + //} + __mpam_sched_in_v2(tsk); + return; + } + + //is in profile queue, wait... + if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) { + spin_unlock(&phase_partid_lock); + return; + } + + for (i = 1; i < 16; i++) { + if (mpam_profile_res[i].used) { + if (static_branch_unlikely(&sched_phase_printk)) { + //if (mpam_profile_res[i].tsk) + // trace_printk("i%d want get partid, butpartid:%d get by pid:%d closid:%d\n", + //tsk->pid, i, mpam_profile_res[i].tsk->pid, mpam_profile_res[i].tsk->closid); + //else + // trace_printk("i%d want get partid, butpartid:%d get by pid:%d(NULL)\n", + //tsk->pid, i, tsk->pid); + } + + continue; + } + + tsk->sched_memqos.mpam_profile.profile = NULL; + break; + } + + if (i == 16) { + list_move_tail(&tsk->sched_memqos.mpam_profile.wait_list, &phase_mpam_waitqueue); + tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[0]; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d no partid found, go to list\n", tsk->pid); + //} + //wait... + return; + } + + mpam_profile_res[i].used = 1; + tsk->closid = mpam_profile_res[i].partid; + mpam_profile_res[i].tsk = tsk; + tsk->sched_memqos.mpam_profile.profile = &mpam_profile_res[i]; + tsk->rmid = 0; + spin_unlock(&phase_partid_lock); + //if (static_branch_unlikely(&sched_phase_printk)) { + //trace_printk("task pid:%d get partid%d succeed\n", tsk->pid, i); + //} + + __mpam_sched_in_v2(tsk); +} + +static void phase_release_mpam_label_without_lock(struct task_struct *tsk) +{ + int closid; + struct memqos_wait_profile *next; + + //assert locked + + if (tsk->sched_memqos.mpam_profile.profile && + tsk->sched_memqos.mpam_profile.profile->partid) { + closid = tsk->sched_memqos.mpam_profile.profile->partid; + } else if (tsk->closid == 0) { + return; + } else { + closid = tsk->closid; + } + + tsk->closid = 0; + tsk->sched_memqos.mpam_profile.profile = NULL; + mpam_profile_res[closid].used = 0; + mpam_profile_res[closid].tsk = NULL; + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("task pid:%d release partid%d, list empty:%d\n", tsk->pid, closid, list_empty(&phase_mpam_waitqueue)); + //} + + next = list_first_entry_or_null(&phase_mpam_waitqueue, struct memqos_wait_profile, wait_list); + if (next) { + list_del_init(&next->wait_list); + mpam_profile_res[closid].used = 1; + next->profile = &mpam_profile_res[closid]; + } + + return; +} + +//task shutdown +void phase_destroy_waitqueue(struct task_struct *tsk) +{ + spin_lock(&phase_partid_lock); + + //if (tsk->sched_memqos.mpam_profile.profile == &mpam_profile_res[0]) { + list_del_init(&tsk->sched_memqos.mpam_profile.wait_list); + //} else { + phase_release_mpam_label_without_lock(tsk); + //} + spin_unlock(&phase_partid_lock); +} + +void phase_release_mpam_label(struct task_struct *tsk) +{ + spin_lock(&phase_partid_lock); + phase_release_mpam_label_without_lock(tsk); + 
spin_unlock(&phase_partid_lock); +} + +#include <asm/mpam.h> +void phase_get_memband(struct memqos_mpam_profile *pm, int *result, int nr) +{ + if (pm == &mpam_profile_res[0] || pm == NULL) { + result[0] = 0; + result[1] = 0; + result[2] = 0; + result[3] = 0; + return; + } + + mpam_component_config_mbwu_mon(pm->partid, pm->pmg, pm->monitor, result, nr); +} diff --git a/kernel/sched/memqos/phase_perf.c b/kernel/sched/memqos/phase_perf.c new file mode 100644 index 0000000000000..7b7f37e46f76c --- /dev/null +++ b/kernel/sched/memqos/phase_perf.c @@ -0,0 +1,412 @@ +#include <linux/kernel.h> +#include <linux/perf_event.h> +#include <linux/percpu-defs.h> +#include <linux/slab.h> +#include <linux/stop_machine.h> +#include <linux/memqos.h> +#include <linux/sched.h> + +#define PHASE_FEVENT_NUM 3 + +int *phase_perf_pevents = NULL; + +static DEFINE_PER_CPU(__typeof__(struct perf_event *)[PHASE_PEVENT_NUM], cpu_phase_perf_events); + +/****************************************** + * Helpers for phase perf event + *****************************************/ +static inline struct perf_event *perf_event_of_cpu(int cpu, int index) +{ + return per_cpu(cpu_phase_perf_events, cpu)[index]; +} + +static inline struct perf_event **perf_events_of_cpu(int cpu) +{ + return per_cpu(cpu_phase_perf_events, cpu); +} + +static inline u64 perf_event_local_pmu_read(struct perf_event *event) +{ + return 0; + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); + return local64_read(&event->count); +} + +/****************************************** + * Helpers for cpu counters + *****************************************/ +static inline u64 read_cpu_counter(int cpu, int index) +{ + struct perf_event *event = perf_event_of_cpu(cpu, index); + + if (!event || !event->pmu) + return 0; + + return perf_event_local_pmu_read(event); +} + +static struct perf_event_attr *alloc_attr(int event_id) +{ + struct perf_event_attr *attr; + + attr = kzalloc(sizeof(struct perf_event_attr), GFP_KERNEL); + if (!attr) + return ERR_PTR(-ENOMEM); + + attr->type = PERF_TYPE_RAW; + attr->config = event_id; + attr->size = sizeof(struct perf_event_attr); + attr->pinned = 1; + attr->disabled = 1; + //attr->exclude_hv; + //attr->exclude_idle; + //attr->exclude_kernel; + + return attr; +} + +static int create_cpu_counter(int cpu, int event_id, int index) +{ + struct perf_event_attr *attr = NULL; + struct perf_event **events = perf_events_of_cpu(cpu); + struct perf_event *event = NULL; + + return 0; + attr = alloc_attr(event_id); + if (IS_ERR(attr)) + return PTR_ERR(attr); + + event = perf_event_create_kernel_counter(attr, cpu, NULL, NULL, NULL); + if (IS_ERR(event)) { + pr_err("unable to create perf event (cpu:%i-type:%d-pinned:%d-config:0x%llx) : %ld", + cpu, attr->type, attr->pinned, attr->config, PTR_ERR(event)); + kfree(attr); + return PTR_ERR(event); + } else { + events[index] = event; + perf_event_enable(events[index]); + if (event->hw.idx == -1) { + pr_err("pinned event unable to get onto hardware, perf event (cpu:%i-type:%d-config:0x%llx)", + cpu, attr->type, attr->config); + kfree(attr); + return -EINVAL; + } + pr_info("create perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d" + "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx-addr:%px)", + event->cpu, event->hw.idx, + event->attr.type, event->attr.pinned, event->attr.exclude_hv, + event->attr.exclude_idle, event->attr.exclude_kernel, + event->attr.config, event); + } + + kfree(attr); + return 0; +} + +static int release_cpu_counter(int cpu, int event_id, int index) +{ + 
struct perf_event **events = perf_events_of_cpu(cpu); + struct perf_event *event = NULL; + + return 0; + event = events[index]; + + if (!event) + return 0; + + pr_info("release perf_event (cpu:%i-idx:%d-type:%d-pinned:%d-exclude_hv:%d" + "-exclude_idle:%d-exclude_kernel:%d-config:0x%llx)", + event->cpu, event->hw.idx, + event->attr.type, event->attr.pinned, event->attr.exclude_hv, + event->attr.exclude_idle, event->attr.exclude_kernel, + event->attr.config); + + perf_event_release_kernel(event); + events[index] = NULL; + + return 0; +} + +enum { + CYCLES_INDEX = 0, + INST_RETIRED_INDEX, + PHASE_EVENT_FINAL_TERMINATOR +}; + +#define CYCLES 0x0011 +#define INST_RETIRED 0x0008 + +static int pevents[PHASE_PEVENT_NUM] = { + CYCLES, + INST_RETIRED, + PHASE_EVENT_FINAL_TERMINATOR, +}; + +#define for_each_phase_pevents(index, events) \ + for (index = 0; events != NULL && index < PHASE_PEVENT_NUM && \ + events[index] != PHASE_EVENT_FINAL_TERMINATOR; index++) + + +/****************************************** + * Helpers for phase perf + *****************************************/ +static int do_pevents(int (*fn)(int, int, int), int cpu) +{ + int index; + int err; + + for_each_phase_pevents(index, phase_perf_pevents) { + err = fn(cpu, phase_perf_pevents[index], index); + if (err) + return err; + } + + return 0; +} + +static int __phase_perf_create(void *args) +{ + int err; + int cpu = raw_smp_processor_id(); + + /* create pinned events */ + pr_info("create pinned events\n"); + err = do_pevents(create_cpu_counter, cpu); + if (err) { + pr_err("create pinned events failed\n"); + do_pevents(release_cpu_counter, cpu); + return err; + } + + pr_info("[%d] phase class event create success\n", cpu); + return 0; +} + +static int do_phase_perf_create(int *pevents, const struct cpumask *cpus) +{ + phase_perf_pevents = pevents; + return stop_machine(__phase_perf_create, NULL, cpus); +} + +static int __do_phase_perf_release(void *args) +{ + int cpu = raw_smp_processor_id(); + + /* release pinned events */ + pr_info("release pinned events\n"); + do_pevents(release_cpu_counter, cpu); + + pr_info("[%d] phase class event release success\n", cpu); + return 0; +} + +static void do_phase_perf_release(const struct cpumask *cpus) +{ + stop_machine(__do_phase_perf_release, NULL, cpus); + phase_perf_pevents = NULL; +} + +int phase_perf_create(void) +{ + return do_phase_perf_create(pevents, cpu_possible_mask); +} + +void phase_perf_release(void) +{ + do_phase_perf_release(cpu_possible_mask); +} + +DECLARE_STATIC_KEY_FALSE(sched_phase); +DECLARE_STATIC_KEY_FALSE(sched_phase_printk); + +#define PHASE_EVENT_OVERFLOW (~0ULL) + +static inline u64 phase_event_count_sub(u64 curr, u64 prev) +{ + if (curr < prev) { /* ovewrflow */ + u64 tmp = PHASE_EVENT_OVERFLOW - prev; + return curr + tmp; + } else { + return curr - prev; + } +} + +static inline void phase_calc_delta(struct task_struct *p, + struct phase_event_count *prev, + struct phase_event_count *curr, + struct phase_event_count *delta) +{ + int *pevents = phase_perf_pevents; + int index; + + for_each_phase_pevents(index, pevents) { + delta->pcount.data[index] = phase_event_count_sub(curr->pcount.data[index], prev->pcount.data[index]); + } +} + +static inline u64 phase_data_of_pevent(struct phase_event_pcount *counter, int event_id) +{ + int index; + int *events = phase_perf_pevents; + + for_each_phase_pevents(index, events) { + if (event_id == events[index]) + return counter->data[index]; + } + + return 0; +} + +static int cal_ring_history_average(int *history, int nr, int s_pos, 
int c_nr) +{ + int average = 0; + int start = ((s_pos - c_nr) + nr) % nr; + + if (start < 0) + return 0; + + for (;start != s_pos;) { + if (history[start] == 0) { + c_nr--; + if (c_nr == 0) + return 0; + continue; + } + average += history[start]; + start = (start + 1) % nr; + } + + return start / c_nr; +} + +static void __phase_cal_ipcx10(struct task_struct *p, struct phase_event_count *delta) +{ + u64 ins; + u64 cycles; + //invalid zero + int ipcx10 = 0; + + ins = phase_data_of_pevent(&delta->pcount, INST_RETIRED_INDEX); + cycles = phase_data_of_pevent(&delta->pcount, CYCLES_INDEX); + + if (cycles) + ipcx10 = (ins * 10) / cycles; + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("ins:%lld cycles:%lld\n", ins, cycles); + //} + + p->sched_memqos.ipcx10_history[p->sched_memqos.ipc_ringpos] = ipcx10; + p->sched_memqos.ipc_ringpos = (p->sched_memqos.ipc_ringpos + 1) % 10; + cal_ring_history_average(p->sched_memqos.ipcx10_history, 10, p->sched_memqos.ipc_ringpos, 5); +} + +static void __phase_cal_memband_div_10(struct task_struct *p) +{ + int pos; + int result[4]; + + pos = p->sched_memqos.memband_ringpos; + + phase_get_memband(p->sched_memqos.mpam_profile.profile, result, 4); + + //if (static_branch_unlikely(&sched_phase_printk)) { + // trace_printk("memband:%d %d %d %d profile:%llx\n", result[0], result[1], result[2], result[3], p->sched_memqos.mpam_profile.profile); + //} + + p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] - p->sched_memqos.memband_div_10_history[0][pos]; + p->sched_memqos.memband_div_10_total[0] = p->sched_memqos.memband_div_10_total[0] + result[0] / 10; + p->sched_memqos.memband_div_10_history[0][p->sched_memqos.memband_ringpos] = result[0] / 10; + + p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] - p->sched_memqos.memband_div_10_history[1][pos]; + p->sched_memqos.memband_div_10_total[1] = p->sched_memqos.memband_div_10_total[1] + result[1] / 10; + p->sched_memqos.memband_div_10_history[1][p->sched_memqos.memband_ringpos] = result[1] / 10; + + p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] - p->sched_memqos.memband_div_10_history[2][pos]; + p->sched_memqos.memband_div_10_total[2] = p->sched_memqos.memband_div_10_total[2] + result[2] / 10; + p->sched_memqos.memband_div_10_history[2][p->sched_memqos.memband_ringpos] = result[2] / 10; + + p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] - p->sched_memqos.memband_div_10_history[3][pos]; + p->sched_memqos.memband_div_10_total[3] = p->sched_memqos.memband_div_10_total[3] + result[3] / 10; + p->sched_memqos.memband_div_10_history[3][p->sched_memqos.memband_ringpos] = result[3] / 10; + + p->sched_memqos.memband_ringpos = (pos + 1) % 10; + + cal_ring_history_average(p->sched_memqos.memband_div_10_history[0], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[1], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[2], 10, pos, 5); + cal_ring_history_average(p->sched_memqos.memband_div_10_history[3], 10, pos, 5); +} + +static DEFINE_PER_CPU(struct phase_event_count, prev_phase_event_count); +static DEFINE_PER_CPU(struct phase_event_count, curr_phase_event_count); + +static void phase_perf_read_events(int cpu, u64 *pdata) +{ + int index; + + for_each_phase_pevents(index, phase_perf_pevents) { + pdata[index] = read_cpu_counter(cpu, index); + } +} + +static inline struct phase_event_count *phase_read_prev(unsigned int cpu) 
+{ + return &per_cpu(prev_phase_event_count, cpu); +} + +static inline struct phase_event_count *phase_read_curr(unsigned int cpu) +{ + struct phase_event_count *curr = &per_cpu(curr_phase_event_count, cpu); + + phase_perf_read_events(cpu, curr->pcount.data); + + return curr; +} + +void phase_account_task(struct task_struct *p, int cpu) +{ + struct phase_event_count delta; + struct phase_event_count *prev, *curr; + + if (!static_branch_likely(&sched_phase)) + return; + + //if (!sched_core_enabled(cpu_rq(cpu))) + // return; + + /* update phase_event_count */ + prev = phase_read_prev(cpu); + curr = phase_read_curr(cpu); + phase_calc_delta(p, prev, curr, &delta); + *prev = *curr; + + /* calculate phase */ + __phase_cal_ipcx10(p, &delta); + __phase_cal_memband_div_10(p); + p->sched_memqos.sample_times++; + if ((p->sched_memqos.sample_times % 3) == 0) + p->sched_memqos.account_ready = 1; +} + + +void phase_trace_printk(struct task_struct *p) +{ + if (!static_branch_unlikely(&sched_phase_printk)) + return; + + trace_printk("p->comm:%s(%d) ipcpos:%d ipcx10:%d membandpos:%d memband_div_10:%d numa_score[0]:%d numa_score[1]:%d numa_score[2]:%d numa_score[3]:%d turbo:%d prefered_nid:%d classid:%d partid:%d\n", + p->comm, p->pid, p->sched_memqos.ipc_ringpos,\ + p->sched_memqos.ipcx10, \ + p->sched_memqos.memband_ringpos,\ + p->sched_memqos.memband_div_10, \ + p->sched_memqos.numa_score[0], \ + p->sched_memqos.numa_score[1], \ + p->sched_memqos.numa_score[2], \ + p->sched_memqos.numa_score[3], \ + p->sched_memqos.turbo, \ + p->sched_memqos.preferred_nid, \ + p->sched_memqos.class_id, \ + p->closid); +} diff --git a/kernel/sched/memqos/phase_sim_knn.c b/kernel/sched/memqos/phase_sim_knn.c new file mode 100644 index 0000000000000..b80bb6b9ae0a3 --- /dev/null +++ b/kernel/sched/memqos/phase_sim_knn.c @@ -0,0 +1,92 @@ +#include <linux/types.h> + +#define DATA_ROW 20 +void QuickSort(u64 arr[DATA_ROW][2], int L, int R) { + int i = L; + int j = R; + int kk = (L + R) / 2; + u64 pivot = arr[kk][0]; + + while (i <= j) { + while (pivot > arr[i][0]) { + i++; + } + while (pivot < arr[j][0]) { + j--; + } + if (i <= j) { + u64 temp = arr[i][0]; + + arr[i][0] = arr[j][0]; + arr[j][0] = temp; + i++; j--; + } + } + if (L < j) { + QuickSort(arr, L, j); + } + if (i < R) { + QuickSort(arr, i, R); + } +} + +u64 euclidean_distance(u64 *row1, u64 *row2, int col) { + u64 distance = 0; + int i; + + for (i = 0; i < col - 1; i++) { + distance += ((row1[i] - row2[i]) * (row1[i] - row2[i])); + } + return distance; +} + +#define num_neighbors 6 +#define MAX_TAG 8 + +int get_neighbors_tag(u64 train_data[DATA_ROW][3], int train_row, int col, u64 *test_row) { + int i; + u64 neighbors[MAX_TAG] = {0}; + int max_tag = 0; + u64 distances[DATA_ROW][2]; + + for (i = 0; i < train_row; i++) { + distances[i][0] = euclidean_distance(train_data[i], test_row, col); + distances[i][1] = train_data[i][col - 1]; + } + QuickSort(distances, 0, train_row - 1); + for (i = 0; i < num_neighbors; i++) { + neighbors[distances[i][1]]++; + if (neighbors[distances[i][1]] > neighbors[max_tag]) + max_tag = distances[i][1]; + } + return max_tag; +} + +static u64 train_data[DATA_ROW][3] = { + {0, 1, 0}, + {0, 9, 0}, + {0, 20, 1}, + {0, 30, 1}, + {0, 40, 2}, + {0, 50, 3}, + {0, 60, 3}, + {0, 70, 3}, + {0, 80, 4}, + {0, 90, 4}, + {0, 100, 4}, + {0, 110, 5}, + {0, 120, 5}, + {0, 130, 6}, + {0, 140, 6}, + {0, 150, 7}, +}; + +int knn_get_tag(int ipcx10, int memband_div_10) +{ + u64 test_data[2]; + + test_data[0] = ipcx10; + test_data[1] = memband_div_10; + + return 
get_neighbors_tag(train_data, DATA_ROW, 3, test_data); +} diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 685f9881b8e23..0d2764c4449ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -465,6 +465,13 @@ static struct ctl_table kern_table[] = { .extra2 = &one, }, #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY + { + .procname = "phase", + .mode = 0555, + .child = phase_table, + }, +#endif #endif /* CONFIG_SCHED_DEBUG */ { .procname = "sched_rt_period_us", diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cac46d56f387..d748c291e7047 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2164,12 +2164,15 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, { struct mempolicy *pol; struct page *page; - int preferred_nid; + int preferred_nid = -1; nodemask_t *nmask;
+ if (current->sched_memqos.preferred_nid) + preferred_nid = current->sched_memqos.preferred_nid - 1; + pol = get_vma_policy(vma, addr);
- if (pol->mode == MPOL_INTERLEAVE) { + if (pol->mode == MPOL_INTERLEAVE && preferred_nid != -1) { unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); @@ -2233,7 +2236,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, }
nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); + if (preferred_nid == -1) + preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mark_vma_cdm(nmask, page, vma); mpol_cond_put(pol);
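A note on the mempolicy hook above: sched_memqos.preferred_nid stores the advised node with a +1 offset so that 0 can mean "no advice"; a sketch of the decode step (the helper is illustrative, not part of the patch):

#include <linux/numa.h>

/* Illustrative only: decode the +1-offset encoding used by
 * sched_memqos.preferred_nid (0 == no advice, n + 1 == node n),
 * which is what alloc_pages_vma() open-codes above. */
static inline int memqos_advised_nid(const struct task_struct *p)
{
	return p->sched_memqos.preferred_nid ?
	       p->sched_memqos.preferred_nid - 1 : NUMA_NO_NODE;
}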