[PATCH openEuler-1.0-LTS v8 2/2] sched: smart grid: init sched_grid_qos structure on QOS purpose

8 Jun 2023

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7BQZ0
CVE: NA

----------------------------------------

As smart grid scheduling (SGS) may shrink resources and affect task QOS,
we provide methods for evaluating task QOS in each divided grid. We mainly
focus on the following two aspects:

   1. Evaluate whether resources (such as CPU or memory) meet our demand
   2. Ensure the least impact when working with the cpufreq and cpuidle governors

To tackle these questions, we have summarized several sampling methods
to obtain tasks' characteristics while at the same time reducing scheduling
noise as much as possible:

  1. We detect the key factors that determine how sensitive a process is to
     cpufreq or cpuidle adjustment, and use them to guide the cpufreq/cpuidle
     governor
  2. We dynamically monitor process memory bandwidth and adjust memory
     allocation to minimize cross-remote memory access
  3. We provide a variety of load tracking mechanisms to adapt to different
     types of task load changes

     ---------------------------------     -----------------
    |            class A              |   |     class B     |
    |    --------        --------     |   |     --------    |
    |   | group0 |      | group1 |    |---|    | group2 |   |----------+
    |    --------        --------     |   |     --------    |          |
    |    CPU/memory sensitive type    |   |   balance type  |          |
     ----------------+----------------     --------+--------           |
                     v                             v                   | (target cpufreq)
     -------------------------------------------------------           | (sensitivity)
    |              Not satisfied with QOS?                  |          |
     --------------------------+----------------------------           |
                               v                                       v
     -------------------------------------------------------     ----------------
    |              expand or shrink resource                |<--|  energy model  |
     ----------------------------+--------------------------     ----------------
                                 v                                     |
     -----------          -----------          ------------            v
    |           |        |           |        |            |     ---------------
    |   GRID0   +--------+   GRID1   +--------+   GRID2    |<-- |   governor    |
    |           |        |           |        |            |     ---------------
     -----------          -----------          ------------
                   \            |            /
                    \  -------------------  /
                      |  pages migration  |
                       -------------------

We will introduce the energy model in a follow-up implementation, and consider
dynamic affinity adjustment between the divided grids at runtime.

Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
---
 include/linux/sched.h          |   9 +++
 include/linux/sched/grid_qos.h |  92 +++++++++++++++++++++
 kernel/fork.c                  |   9 +++
 kernel/sched/Makefile          |   1 +
 kernel/sched/fair.c            |   5 ++
 kernel/sched/grid/Makefile     |   2 +
 kernel/sched/grid/internal.h   |   6 ++
 kernel/sched/grid/power.c      |  27 +++++++
 kernel/sched/grid/qos.c        | 143 +++++++++++++++++++++++++++++++++
 kernel/sched/grid/stat.c       |  32 ++++++++
 mm/mempolicy.c                 |  12 ++-
 11 files changed, 337 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/internal.h
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8346294d25838..f561defa325e4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1277,7 +1277,16 @@ struct task_struct {
 	KABI_RESERVE(6)
 	KABI_RESERVE(7)
 #endif
+
+#if !defined(__GENKSYMS__)
+#if defined(CONFIG_QOS_SCHED_SMART_GRID)
+	struct sched_grid_qos *grid_qos;
+#else
 	KABI_RESERVE(8)
+#endif
+#else
+	KABI_RESERVE(8)
+#endif
 
 	/* CPU-specific state of this task: */
 	struct thread_struct		thread;
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
new file mode 100644
index 0000000000000..5a6b0de8c3d20
--- /dev/null
+++ b/include/linux/sched/grid_qos.h
@@ -0,0 +1,107 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_GRID_QOS_H
+#define _LINUX_SCHED_GRID_QOS_H
+#include <linux/nodemask.h>
+
+/*
+ * Only pointers to these types are used below.  Declaring the tags at file
+ * scope keeps the prototypes valid even when this header is included before
+ * the headers that define them.
+ */
+struct mempolicy;
+struct task_struct;
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+/* QOS class levels 1..8 (values 0..7); _NR is the number of levels. */
+enum sched_grid_qos_class {
+	SCHED_GRID_QOS_CLASS_LEVEL_1 = 0,
+	SCHED_GRID_QOS_CLASS_LEVEL_2 = 1,
+	SCHED_GRID_QOS_CLASS_LEVEL_3 = 2,
+	SCHED_GRID_QOS_CLASS_LEVEL_4 = 3,
+	SCHED_GRID_QOS_CLASS_LEVEL_5 = 4,
+	SCHED_GRID_QOS_CLASS_LEVEL_6 = 5,
+	SCHED_GRID_QOS_CLASS_LEVEL_7 = 6,
+	SCHED_GRID_QOS_CLASS_LEVEL_8 = 7,
+	SCHED_GRID_QOS_CLASS_LEVEL_NR
+};
+
+/* Indices into sched_grid_qos_stat.sample[]; _NR sizes that array. */
+enum {
+	SCHED_GRID_QOS_IPS_INDEX = 0,
+	SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1,
+	SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2,
+	SCHED_GRID_QOS_SAMPLE_NR
+};
+
+#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100
+
+/* Fixed-size array of u64 values with a head index and a push callback. */
+struct sched_grid_qos_ring_buffer {
+	u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN];
+	unsigned int head;
+	void (*push)(u64 *data, int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);
+};
+
+/* One sample slot: name, index, bookkeeping, ring buffer and callbacks. */
+struct sched_grid_qos_sample {
+	const char *name;
+	int index;
+	int sample_bypass;
+	int sample_times;
+	struct sched_grid_qos_ring_buffer ring_buffer;
+	u64 pred_target[MAX_NUMNODES];
+	void (*cal_target)(int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);
+
+	int account_ready;
+	int (*start)(void *arg);
+	int (*account)(void *arg);
+};
+
+/* QOS statistics: a class level plus one sample slot per metric index. */
+struct sched_grid_qos_stat {
+	enum sched_grid_qos_class class_lvl;
+	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat);
+	struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR];
+};
+
+/* Power-related QOS fields; qos_power_init() sets all of them to zero. */
+struct sched_grid_qos_power {
+	int cpufreq_sense_ratio;
+	int target_cpufreq;
+	int cstate_sense_ratio;
+};
+
+/* Nodes preferred for memory allocation, see sched_grid_preferred_nid(). */
+struct sched_grid_qos_affinity {
+	nodemask_t mem_preferred_node_mask;
+};
+
+/* Per-task smart grid QOS state, referenced by task_struct->grid_qos. */
+struct sched_grid_qos {
+	struct sched_grid_qos_stat stat;
+	struct sched_grid_qos_power power;
+	struct sched_grid_qos_affinity affinity;
+
+	int (*affinity_set)(struct task_struct *p);
+};
+
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
+void sched_grid_qos_free(struct task_struct *p);
+
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy);
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask);
+#else
+static inline int
+sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+	return -1;
+}
+static inline int
+sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	return preferred_nid;
+}
+#endif
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index c256525d4ce5e..db95586d745ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -21,6 +21,7 @@
 #include <linux/sched/task.h>
 #include <linux/sched/task_stack.h>
 #include <linux/sched/cputime.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/rtmutex.h>
 #include <linux/init.h>
 #include <linux/unistd.h>
@@ -461,6 +462,9 @@ void free_task(struct task_struct *tsk)
 		free_kthread_struct(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	sched_prefer_cpus_free(tsk);
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	sched_grid_qos_free(tsk);
 #endif
 	free_task_struct(tsk);
 }
@@ -1876,6 +1880,11 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval)
 		goto bad_fork_free;
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	retval = sched_grid_qos_fork(p, current);
+	if (retval)
+		goto bad_fork_free;
+#endif
 
 	/*
 	 * If multiple threads are within copy_process(), then this check
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7fe183404c383..0612af002ae57 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
 obj-$(CONFIG_MEMBARRIER) += membarrier.o
 obj-$(CONFIG_CPU_ISOLATION) += isolation.o
+obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7bb92e081d88e..7c306c2bce65b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,6 +28,9 @@
 #include <linux/delay.h>
 #include <linux/tracehook.h>
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#endif
 #include <trace/events/sched.h>
 
 /*
@@ -5322,6 +5325,8 @@ static void tg_update_task_prefer_cpus(struct task_group *tg)
 			continue;
 
 		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		/* grid_qos must not be NULL */
+		task->grid_qos->affinity_set(task);
 	}
 	css_task_iter_end(&it);
 }
diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile
new file mode 100644
index 0000000000000..82f2a09c3c309
--- /dev/null
+++ b/kernel/sched/grid/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QOS_SCHED_SMART_GRID)  += qos.o power.o stat.o
diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h
new file mode 100644
index 0000000000000..743f72aaffbfc
--- /dev/null
+++ b/kernel/sched/grid/internal.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H
+#define _LINUX_SCHED_SMART_GRID_INTERNAL_H
+
+/* Forward declarations so this header does not depend on include order. */
+struct sched_grid_qos_power;
+struct sched_grid_qos_stat;
+
+/* Initialisers for the power and stat parts of a struct sched_grid_qos. */
+void qos_power_init(struct sched_grid_qos_power *power);
+void qos_stat_init(struct sched_grid_qos_stat *stat);
+
+#endif
diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c
new file mode 100644
index 0000000000000..f916cd3801ad7
--- /dev/null
+++ b/kernel/sched/grid/power.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+/* Reset every field of @power to zero. */
+void qos_power_init(struct sched_grid_qos_power *power)
+{
+	*power = (struct sched_grid_qos_power) {
+		.cpufreq_sense_ratio = 0,
+		.target_cpufreq = 0,
+		.cstate_sense_ratio = 0,
+	};
+}
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
new file mode 100644
index 0000000000000..c334ae7e0bf9b
--- /dev/null
+++ b/kernel/sched/grid/qos.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for Smart Grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+/*
+ * Rebuild @p's preferred memory node mask from its preferred CPUs: a node
+ * is preferred when at least one of its CPUs is in p->prefer_cpus.
+ * Always returns 0.
+ */
+static int qos_affinity_set(struct task_struct *p)
+{
+	int n;
+	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;
+
+	nodes_clear(affinity->mem_preferred_node_mask);
+	/*
+	 * We want the memory allocation to be as close to the CPU
+	 * as possible, and adjust after getting memory bandwidth usage.
+	 */
+	for (n = 0; n < nr_node_ids; n++)
+		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
+			node_set(n, affinity->mem_preferred_node_mask);
+
+	return 0;
+}
+
+/*
+ * Allocate and initialise the grid QOS state of the new task @p, inheriting
+ * the memory affinity of @orig when @orig has QOS state.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig)
+{
+	struct sched_grid_qos *qos;
+
+	/*
+	 * NOTE(review): @p is presumably still a byte copy of its parent here
+	 * (dup_task_struct()), so p->grid_qos aliases the parent's state.
+	 * Clear it first: if the allocation below fails, the fork error path
+	 * presumably ends in free_task() -> sched_grid_qos_free(), which must
+	 * not free the parent's structure.
+	 */
+	p->grid_qos = NULL;
+
+	qos = kzalloc(sizeof(*qos), GFP_KERNEL);
+	if (!qos)
+		return -ENOMEM;
+
+	qos_power_init(&qos->power);
+	qos_stat_init(&qos->stat);
+
+	nodes_clear(qos->affinity.mem_preferred_node_mask);
+	if (likely(orig->grid_qos))
+		qos->affinity = orig->grid_qos->affinity;
+	qos->affinity_set = qos_affinity_set;
+	p->grid_qos = qos;
+
+	return 0;
+}
+
+/* Free @p's grid QOS state and clear the now-dangling pointer. */
+void sched_grid_qos_free(struct task_struct *p)
+{
+	kfree(p->grid_qos);
+	p->grid_qos = NULL;
+}
+
+/*
+ * Dynamically select a more appropriate interleave nid for the current task.
+ *
+ * Returns the node following current->il_prev within the intersection of
+ * @policy's nodes and the task's preferred nodes, recording it in il_prev.
+ * Returns NUMA_NO_NODE when the task has no QOS state, @policy is NULL, the
+ * two masks are equal, or they do not intersect.
+ */
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+	nodemask_t nmask;
+	unsigned int next;
+	struct task_struct *me = current;
+	nodemask_t *preferred_nmask = NULL;
+
+	if (likely(me->grid_qos))
+		preferred_nmask =
+			&me->grid_qos->affinity.mem_preferred_node_mask;
+
+	if (!preferred_nmask || !policy)
+		return NUMA_NO_NODE;
+
+	if (nodes_equal(policy->v.nodes, *preferred_nmask))
+		return NUMA_NO_NODE;
+	/*
+	 * We perceive the actual consumption of memory bandwidth
+	 * in each node and post a preferred interleave nid in
+	 * more appropriate range.
+	 */
+	nodes_and(nmask, policy->v.nodes, *preferred_nmask);
+	if (nodes_empty(nmask))
+		return NUMA_NO_NODE;
+
+	next = next_node_in(me->il_prev, nmask);
+	if (next < MAX_NUMNODES)
+		me->il_prev = next;
+	return next;
+}
+
+/*
+ * Dynamically select a more appropriate preferred nid for the current task.
+ *
+ * The candidates are the task's preferred nodes, restricted to @nodemask
+ * when one is given.  Returns @preferred_nid unchanged when the task has no
+ * QOS state, when @nodemask equals the preferred mask, or when
+ * @preferred_nid is itself a candidate.  Otherwise returns the local node if
+ * it is a candidate, else the first candidate, else @preferred_nid.
+ */
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	int nd = preferred_nid;
+	nodemask_t nmask, ndmask;
+	nodemask_t *preferred_nmask = NULL;
+
+	if (likely(current->grid_qos))
+		preferred_nmask =
+			&current->grid_qos->affinity.mem_preferred_node_mask;
+
+	if (!preferred_nmask)
+		return preferred_nid;
+
+	/*
+	 * We perceive the actual consumption of memory bandwidth
+	 * in each node and post a preferred nid in more appropriate
+	 * range.
+	 */
+	nmask = *preferred_nmask;
+	if (nodemask) {
+		if (nodes_equal(*nodemask, nmask))
+			return preferred_nid;
+
+		nodes_and(nmask, nmask, *nodemask);
+	}
+
+	if (node_isset(preferred_nid, nmask))
+		return preferred_nid;
+
+	/*
+	 * We prefer the numa node we're running, if there is no limit
+	 * to nodemask, we select preferred nid in preferred range or
+	 * in restricted range if not.
+	 */
+	init_nodemask_of_node(&ndmask, numa_node_id());
+	nodes_and(ndmask, nmask, ndmask);
+	if (!nodes_empty(ndmask))
+		nd = first_node(ndmask);
+	else if (!nodes_empty(nmask))
+		nd = first_node(nmask);
+
+	return nd;
+}
diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c
new file mode 100644
index 0000000000000..b40c751456081
--- /dev/null
+++ b/kernel/sched/grid/stat.c
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+/* Give every sample slot of @stat its name and its own index. */
+void qos_stat_init(struct sched_grid_qos_stat *stat)
+{
+	static const char * const names[SCHED_GRID_QOS_SAMPLE_NR] = {
+		[SCHED_GRID_QOS_IPS_INDEX] = "ips",
+		[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX] = "membound_ratio",
+		[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX] = "memband_width",
+	};
+	int i;
+
+	for (i = 0; i < SCHED_GRID_QOS_SAMPLE_NR; i++) {
+		stat->sample[i].name = names[i];
+		stat->sample[i].index = i;
+	}
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4769ed2ed7f38..701988dc02f66 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -76,6 +76,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
@@ -2172,7 +2173,14 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (pol->mode == MPOL_INTERLEAVE) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+		if (smart_grid_used()) {
+			nid = sched_grid_preferred_interleave_nid(pol);
+			nid = (nid == NUMA_NO_NODE) ?
+				interleave_nid(pol, vma, addr, PAGE_SHIFT + order) : nid;
+		} else {
+			nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+		}
+
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		goto out;
@@ -2234,6 +2242,8 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
+	if (smart_grid_used())
+		preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask);
 	page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask);
 	mark_vma_cdm(nmask, page, vma);
 	mpol_cond_put(pol);
-- 
2.25.1