As the smart grid scheduler (SGS) may shrink resources and thus affect task QoS, we provide methods for evaluating task QoS in the smart grid. We group the problems into two categories:
1. Evaluate whether resources (such as CPU or memory) meet the task's demand.
2. Ensure the least impact on tasks when cooperating with the cpufreq and cpuidle governors.
To tackle these questions, we have summarized several sampling methods that obtain tasks' characteristics while introducing as little scheduling noise as possible:
1. We detect the key factors that determine how sensitive a process is to cpufreq or cpuidle adjustment, and use them to guide the cpufreq/cpuidle governor.
2. We dynamically monitor process memory bandwidth and adjust memory allocation to minimize remote memory access.
3. We provide a variety of load tracking mechanisms to adapt to different types of task load change (a minimal sketch of the sampling machinery follows this list).
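To make the sampling concrete, here is a minimal sketch (illustration only, not part of this patch) of how the per-task sample ring buffer declared in grid_qos.h below could be filled and summarized. qos_ring_buffer_push matches the push callback signature reserved in the header; qos_ring_buffer_avg is a hypothetical helper, and a real implementation would likely weight recent samples:

	static void qos_ring_buffer_push(u64 *data, int stepsize,
			struct sched_grid_qos_ring_buffer *ring_buffer)
	{
		int i;

		/* Copy 'stepsize' new samples, wrapping at the window end. */
		for (i = 0; i < stepsize; i++) {
			ring_buffer->vecs[ring_buffer->head] = data[i];
			ring_buffer->head = (ring_buffer->head + 1) %
					SCHED_GRID_QOS_RING_BUFFER_MAXLEN;
		}
	}

	static u64 qos_ring_buffer_avg(struct sched_grid_qos_ring_buffer *ring_buffer)
	{
		u64 sum = 0;
		int i;

		/* Average the window as a crude estimate, e.g. of memory
		 * bandwidth; div_u64() from <linux/math64.h> keeps the
		 * division safe on 32-bit.
		 */
		for (i = 0; i < SCHED_GRID_QOS_RING_BUFFER_MAXLEN; i++)
			sum += ring_buffer->vecs[i];

		return div_u64(sum, SCHED_GRID_QOS_RING_BUFFER_MAXLEN);
	}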
 ---------------------------------    -----------------
|            class A              |  |     class B     |
|  --------       --------        |  |    --------     |
| | group0 |     | group1 |       |--|   | group2 |    |----------+
|  --------       --------        |  |    --------     |          |
|   CPU/memory sensitive type     |  |   balance type  |          |
 ----------------+----------------    --------+--------           |
                 v                            v                   | (target cpufreq)
 -----------------------------------------------------            | (sensitivity)
|              Not satisfied with QOS?                |           |
 --------------------------+--------------------------            |
                           v                                      v
 -----------------------------------------------------      ----------------
|             expand or shrink resource               |<---|  energy model  |
 ---------------------------+-------------------------      ----------------
                            v                                     |
  -----------   -----------   ------------                        v
 |           | |           | |            |                ---------------
 |   GRID0   +-+   GRID1   +-+   GRID2    |<---------------|   governor   |
 |           | |           | |            |                ---------------
  -----------   -----------   ------------
We will introduce the energy model in a follow-up implementation, and will consider dynamic affinity adjustment between the divided grids at runtime.
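The (target cpufreq)/(sensitivity) path on the right of the diagram is not wired up yet; as a hypothetical illustration only, a governor could consume the sched_grid_qos_power hints along these lines (assuming cpufreq_sense_ratio is expressed as a percentage, which this patch does not define yet):

	static unsigned int qos_effective_freq(struct sched_grid_qos *qos,
			unsigned int base_freq)
	{
		/* A per-task precomputed target, if any, takes precedence. */
		if (qos->power.target_cpufreq > 0)
			return qos->power.target_cpufreq;

		/* Otherwise scale the request by the task's sensitivity. */
		return base_freq * qos->power.cpufreq_sense_ratio / 100;
	}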
Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
---
 include/linux/sched.h          |  9 ++++
 include/linux/sched/grid_qos.h | 77 ++++++++++++++++++++++++++++++++++
 kernel/fork.c                  |  9 ++++
 kernel/sched/Makefile          |  1 +
 kernel/sched/fair.c            |  9 ++++
 kernel/sched/grid/Makefile     |  2 +
 kernel/sched/grid/power.c      |  8 ++++
 kernel/sched/grid/qos.c        | 46 ++++++++++++++++++++
 kernel/sched/grid/stat.c       | 11 +++++
 mm/mempolicy.c                 | 25 +++++++++++
 10 files changed, 197 insertions(+)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8be102dc9a193..464c329c5f8ec 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1277,7 +1277,16 @@ struct task_struct {
 	KABI_RESERVE(6)
 	KABI_RESERVE(7)
 #endif
+
+#if !defined(__GENKSYMS__)
+#if defined(CONFIG_QOS_SCHED_SMART_GRID)
+	struct sched_grid_qos *qos;
+#else
 	KABI_RESERVE(8)
+#endif
+#else
+	KABI_RESERVE(8)
+#endif
 	/* CPU-specific state of this task: */
 	struct thread_struct		thread;
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
new file mode 100644
index 0000000000000..5439d66ee51aa
--- /dev/null
+++ b/include/linux/sched/grid_qos.h
@@ -0,0 +1,77 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_GRID_QOS_H
+#define _LINUX_SCHED_GRID_QOS_H
+#include <linux/nodemask.h>
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+enum sched_grid_qos_class {
+	SCHED_GRID_QOS_CLASS_LEVEL_1 = 0,
+	SCHED_GRID_QOS_CLASS_LEVEL_2 = 1,
+	SCHED_GRID_QOS_CLASS_LEVEL_3 = 2,
+	SCHED_GRID_QOS_CLASS_LEVEL_4 = 3,
+	SCHED_GRID_QOS_CLASS_LEVEL_5 = 4,
+	SCHED_GRID_QOS_CLASS_LEVEL_6 = 5,
+	SCHED_GRID_QOS_CLASS_LEVEL_7 = 6,
+	SCHED_GRID_QOS_CLASS_LEVEL_8 = 7,
+	SCHED_GRID_QOS_CLASS_LEVEL_NR
+};
+
+enum {
+	SCHED_GRID_QOS_IPS_INDEX = 0,
+	SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1,
+	SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2,
+	SCHED_GRID_QOS_SAMPLE_NR
+};
+
+#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100
+
+struct sched_grid_qos_ring_buffer {
+	u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN];
+	unsigned int head;
+	void (*push)(u64 *data, int stepsize, struct sched_grid_qos_ring_buffer *ring_buffer);
+};
+
+struct sched_grid_qos_sample {
+	const char *name;
+	int index;
+	int sample_bypass;
+	int sample_times;
+	struct sched_grid_qos_ring_buffer ring_buffer;
+	u64 pred_target[MAX_NUMNODES];
+	void (*cal_target)(int stepsize, struct sched_grid_qos_ring_buffer *ring_buffer);
+
+	int account_ready;
+	int (*start)(void *arg);
+	int (*account)(void *arg);
+};
+
+struct sched_grid_qos_stat {
+	enum sched_grid_qos_class class_lvl;
+	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat);
+	struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR];
+};
+
+struct sched_grid_qos_power {
+	int cpufreq_sense_ratio;
+	int target_cpufreq;
+	int cstate_sense_ratio;
+};
+
+struct sched_grid_qos_affinity {
+	nodemask_t mem_preferred_node_mask;
+};
+
+struct task_struct;
+struct sched_grid_qos {
+	struct sched_grid_qos_stat stat;
+	struct sched_grid_qos_power power;
+	struct sched_grid_qos_affinity affinity;
+
+	int (*affinity_set)(struct task_struct *p);
+};
+
+int sched_grid_qos_init(struct task_struct *p);
+void sched_grid_qos_free(struct task_struct *p);
+#endif
+
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index c256525d4ce5e..e85c8afbdfbb4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
@@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk)
 	free_kthread_struct(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	sched_prefer_cpus_free(tsk);
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	sched_grid_qos_free(tsk);
 #endif
 	free_task_struct(tsk);
 }
@@ -1876,6 +1880,11 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_free;
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	retval = sched_grid_qos_init(p);
+	if (retval)
+		goto bad_fork_free;
+#endif
/* * If multiple threads are within copy_process(), then this check diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7fe183404c383..0612af002ae57 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o obj-$(CONFIG_CPU_ISOLATION) += isolation.o +obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 093dc714090c6..fbfe7921df8da 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -28,6 +28,9 @@ #include <linux/delay.h> #include <linux/tracehook.h> #endif +#ifdef CONFIG_QOS_SCHED_SMART_GRID +#include <linux/sched/grid_qos.h> +#endif #include <trace/events/sched.h>
 /*
@@ -5316,6 +5319,8 @@ static void affinity_domain_up(struct task_group *tg)
 	css_task_iter_start(&tg->css, 0, &it);
 	while ((task = css_task_iter_next(&it))) {
 		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		if (likely(task->qos))
+			task->qos->affinity_set(task);
 	}
 	css_task_iter_end(&it);
 }
@@ -5333,6 +5338,8 @@ static void affinity_domain_down(struct task_group *tg)
 	css_task_iter_start(&tg->css, 0, &it);
 	while ((task = css_task_iter_next(&it))) {
 		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		if (likely(task->qos))
+			task->qos->affinity_set(task);
 	}
 	css_task_iter_end(&it);
 }
@@ -5403,6 +5410,8 @@ static int tg_update_affinity_domain_down(struct task_group *tg, void *data)
 	css_task_iter_start(&tg->css, 0, &it);
 	while ((task = css_task_iter_next(&it))) {
 		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		if (likely(task->qos))
+			task->qos->affinity_set(task);
 	}
 	css_task_iter_end(&it);
 	return 0;
diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile
new file mode 100644
index 0000000000000..82f2a09c3c309
--- /dev/null
+++ b/kernel/sched/grid/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QOS_SCHED_SMART_GRID) += qos.o power.o stat.o
diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c
new file mode 100644
index 0000000000000..dc14eec27ecec
--- /dev/null
+++ b/kernel/sched/grid/power.c
@@ -0,0 +1,8 @@
+#include <linux/sched/grid_qos.h>
+
+void qos_power_init(struct sched_grid_qos_power *power)
+{
+	power->cpufreq_sense_ratio = 0;
+	power->target_cpufreq = 0;
+	power->cstate_sense_ratio = 0;
+}
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
new file mode 100644
index 0000000000000..60ed83b0e6c68
--- /dev/null
+++ b/kernel/sched/grid/qos.c
@@ -0,0 +1,46 @@
+#include <linux/nodemask.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/grid_qos.h>
+
+extern void qos_power_init(struct sched_grid_qos_power *power);
+extern void qos_stat_init(struct sched_grid_qos_stat *stat);
+
+static int qos_affinity_set(struct task_struct *p)
+{
+	int n;
+	struct sched_grid_qos_affinity *affinity = &p->qos->affinity;
+
+	/*
+	 * We want the memory allocation to be as close to the CPU
+	 * as possible, and adjust after getting memory bandwidth usage.
+	 */
+	for (n = 0; n < nr_node_ids; n++)
+		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
+			node_set(n, affinity->mem_preferred_node_mask);
+
+	return 0;
+}
+
+int sched_grid_qos_init(struct task_struct *p)
+{
+	struct sched_grid_qos *qos;
+
+	qos = kzalloc(sizeof(*qos), GFP_KERNEL);
+	if (!qos)
+		return -ENOMEM;
+
+	qos_power_init(&qos->power);
+	qos_stat_init(&qos->stat);
+
+	nodes_clear(qos->affinity.mem_preferred_node_mask);
+	qos->affinity_set = qos_affinity_set;
+	p->qos = qos;
+
+	return 0;
+}
+
+void sched_grid_qos_free(struct task_struct *p)
+{
+	kfree(p->qos);
+}
diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c
new file mode 100644
index 0000000000000..055b1c9d18ef8
--- /dev/null
+++ b/kernel/sched/grid/stat.c
@@ -0,0 +1,11 @@
+#include <linux/sched/grid_qos.h>
+
+void qos_stat_init(struct sched_grid_qos_stat *stat)
+{
+	stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips";
+	stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX;
+	stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio";
+	stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index = SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX;
+	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width";
+	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index = SCHED_GRID_QOS_MEMBANDWIDTH_INDEX;
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4769ed2ed7f38..46b8906d12c21 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -76,6 +76,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
@@ -1882,8 +1883,22 @@ static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	nodemask_t nodemask = policy->v.nodes;
+	struct sched_grid_qos *qos = me->qos;
 
+	/*
+	 * We perceive the actual consumption of memory bandwidth
+	 * in each node and interleave in more appropriate range.
+	 */
+	if (likely(qos) && nodes_intersects(nodemask, qos->affinity.mem_preferred_node_mask))
+		nodes_and(nodemask, nodemask, qos->affinity.mem_preferred_node_mask);
+
+	next = next_node_in(me->il_prev, nodemask);
+#else
 	next = next_node_in(me->il_prev, policy->v.nodes);
+#endif
+
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
 	return next;
@@ -1946,6 +1961,16 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	unsigned int target, nnodes;
 	int i;
 	int nid;
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	struct sched_grid_qos *qos = current->qos;
+
+	/*
+	 * We perceive the actual consumption of memory bandwidth
+	 * in each node and interleave in more appropriate range.
+	 */
+	if (likely(qos) && nodes_intersects(nodemask, qos->affinity.mem_preferred_node_mask))
+		nodes_and(nodemask, nodemask, qos->affinity.mem_preferred_node_mask);
+#endif
 	/*
 	 * The barrier will stabilize the nodemask in a register or on
 	 * the stack so that it will stop changing under the code.