[PATCH OLK-5.10 v6 02/20] sched: smart grid: init sched_grid_qos structure on QOS purpose

24 Nov 2023

From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I7BQZ0
CVE: NA

----------------------------------------

As smart grid scheduling (SGS) may shrink resources and affect task QOS,
we provide methods for evaluating task QOS in a divided grid. We mainly
focus on the following two aspects:

   1. Evaluate whether resources (such as CPU or memory) meet our demand
   2. Ensure the least impact when working with governors (cpufreq and
      cpuidle)

To tackle these questions, we have summarized several sampling methods
to obtain tasks' characteristics while at the same time reducing
scheduling noise as much as possible:

  1. We detect the key factors that determine how sensitive a process is
     to cpufreq or cpuidle adjustment, to guide the cpufreq/cpuidle governor
  2. We dynamically monitor process memory bandwidth and adjust memory
     allocation to minimize remote (cross-node) memory access
  3. We provide a variety of load tracking mechanisms to adapt to different
     types of task load change

     ---------------------------------     -----------------
    |            class A              |   |     class B     |
    |    --------        --------     |   |     --------    |
    |   | group0 |      | group1 |    |---|    | group2 |   |
    |    --------        --------     |   |     --------    |
    |    CPU/memory sensitive type    |   |   balance type  |
     ----------------+----------------     ------+-------+--
                     v                           v       | (target cpufreq)
     ----------------------------------------------      | (sensitivity)
    |              Not satisfied with QOS?         |     |
     --------------------------+-------------------      |
                               v                         v
     ----------------------------------------------     ----------------
    |              expand or shrink resource       |<--|  energy model  |
     ------------------------+---------------------     ----------------
                             v                           |
     -----------      -----------      ------------      v
    |           |    |           |    |            |    ---------------
    |   GRID0   +----+   GRID1   +----+   GRID2    |<--|   governor    |
    |           |    |           |    |            |    ---------------
     -----------      -----------      ------------
                   \            |            /
                    \  -------------------  /
                      |  pages migration  |
                       -------------------

We will introduce the energy model in a follow-up implementation, and
consider dynamic affinity adjustment between the divided grids at
runtime.

Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 include/linux/sched.h          |   9 ++
 include/linux/sched/grid_qos.h |  92 ++++++++++++++++++++
 kernel/fork.c                  |  13 ++-
 kernel/sched/Makefile          |   1 +
 kernel/sched/fair.c            |   5 ++
 kernel/sched/grid/Makefile     |   2 +
 kernel/sched/grid/internal.h   |   6 ++
 kernel/sched/grid/power.c      |  27 ++++++
 kernel/sched/grid/qos.c        | 149 +++++++++++++++++++++++++++++++++
 kernel/sched/grid/stat.c       |  32 +++++++
 mm/mempolicy.c                 |  12 ++-
 11 files changed, 346 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/internal.h
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 44db39317cef..7e5af98957d9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1449,7 +1449,16 @@ struct task_struct {
 	KABI_RESERVE(10)
 	KABI_RESERVE(11)
 #endif
+
+#if !defined(__GENKSYMS__)
+#if defined(CONFIG_QOS_SCHED_SMART_GRID)
+	struct sched_grid_qos *grid_qos;
+#else
+	KABI_RESERVE(12)
+#endif
+#else
 	KABI_RESERVE(12)
+#endif
 	KABI_RESERVE(13)
 	KABI_RESERVE(14)
 	KABI_RESERVE(15)
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
new file mode 100644
index 000000000000..cea2bf651880
--- /dev/null
+++ b/include/linux/sched/grid_qos.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_GRID_QOS_H
+#define _LINUX_SCHED_GRID_QOS_H
+#include <linux/nodemask.h>
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+enum sched_grid_qos_class {	/* type of sched_grid_qos_stat.class_lvl */
+	SCHED_GRID_QOS_CLASS_LEVEL_1 = 0,
+	SCHED_GRID_QOS_CLASS_LEVEL_2 = 1,
+	SCHED_GRID_QOS_CLASS_LEVEL_3 = 2,
+	SCHED_GRID_QOS_CLASS_LEVEL_4 = 3,
+	SCHED_GRID_QOS_CLASS_LEVEL_5 = 4,
+	SCHED_GRID_QOS_CLASS_LEVEL_6 = 5,
+	SCHED_GRID_QOS_CLASS_LEVEL_7 = 6,
+	SCHED_GRID_QOS_CLASS_LEVEL_8 = 7,
+	SCHED_GRID_QOS_CLASS_LEVEL_NR	/* number of levels (8), not a level */
+};
+
+enum {	/* slots of sched_grid_qos_stat.sample[] */
+	SCHED_GRID_QOS_IPS_INDEX = 0,	/* named "ips" */
+	SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1,	/* named "membound_ratio" */
+	SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2,	/* named "memband_width" */
+	SCHED_GRID_QOS_SAMPLE_NR	/* length of sample[] */
+};
+
+#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100
+
+struct sched_grid_qos_ring_buffer {
+	u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN];	/* fixed-size storage */
+	unsigned int head;	/* presumably an index into vecs[] - TODO confirm */
+	void (*push)(u64 *data, int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);	/* not set in this patch */
+};
+
+struct sched_grid_qos_sample {	/* one element of sched_grid_qos_stat.sample[] */
+	const char *name;	/* label assigned by qos_stat_init() */
+	int index;	/* own slot number, assigned by qos_stat_init() */
+	int sample_bypass;
+	int sample_times;
+	struct sched_grid_qos_ring_buffer ring_buffer;
+	u64 pred_target[MAX_NUMNODES];	/* MAX_NUMNODES entries */
+	void (*cal_target)(int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);
+
+	int account_ready;
+	int (*start)(void *arg);	/* callbacks are not set in this patch */
+	int (*account)(void *arg);
+};
+
+struct sched_grid_qos_stat {	/* embedded as sched_grid_qos.stat */
+	enum sched_grid_qos_class class_lvl;
+	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat);	/* not set in this patch */
+	struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR];	/* see SCHED_GRID_QOS_*_INDEX */
+};
+
+struct sched_grid_qos_power {	/* every field is set to 0 by qos_power_init() */
+	int cpufreq_sense_ratio;
+	int target_cpufreq;
+	int cstate_sense_ratio;
+};
+
+struct sched_grid_qos_affinity {
+	nodemask_t mem_preferred_node_mask;	/* read by sched_grid_preferred_*nid() */
+};
+
+struct task_struct;
+struct sched_grid_qos {	/* per-task state, reached via task_struct.grid_qos */
+	struct sched_grid_qos_stat stat;
+	struct sched_grid_qos_power power;
+	struct sched_grid_qos_affinity affinity;
+
+	int (*affinity_set)(struct task_struct *p);	/* qos_affinity_set() after fork */
+};
+
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
+void sched_grid_qos_free(struct task_struct *p);
+
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy);
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask);
+#else
+static inline int
+sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+	return NUMA_NO_NODE;	/* smart grid compiled out: no node suggested */
+}
+static inline int
+sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	return preferred_nid;	/* smart grid compiled out: keep the caller's nid */
+}
+#endif
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a531fd38d111..adc37960c01b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -97,7 +97,9 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/sched/mm.h>
-
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#endif
 #include <linux/share_pool.h>
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -470,6 +472,9 @@ void free_task(struct task_struct *tsk)
 		free_kthread_struct(tsk);
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	sched_prefer_cpus_free(tsk);
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	sched_grid_qos_free(tsk);
 #endif
 	free_task_struct(tsk);
 }
@@ -2083,6 +2088,12 @@ static __latent_entropy struct task_struct *copy_process(
 	if (retval < 0)
 		goto bad_fork_free;
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	retval = sched_grid_qos_fork(p, current);
+	if (retval)
+		goto bad_fork_cleanup_count;
+#endif
+
 	/*
 	 * If multiple threads are within copy_process(), then this check
 	 * triggers too late. This doesn't hurt, the check is only there
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 6f3106774d05..a6fe0ee09917 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -39,3 +39,4 @@ obj-$(CONFIG_PSI) += psi.o
 obj-$(CONFIG_SCHED_CORE) += core_sched.o
 obj-$(CONFIG_BPF_SCHED) += bpf_sched.o
 obj-$(CONFIG_BPF_SCHED) += bpf_topology.o
+obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db18ebdd313a..2e7d27824c3a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -28,6 +28,9 @@
 #include <linux/delay.h>
 #include <linux/tracehook.h>
 #endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#endif
 #include <linux/bpf_sched.h>
 
 /*
@@ -5845,6 +5848,8 @@ static void tg_update_task_prefer_cpus(struct task_group *tg)
 			continue;
 
 		set_prefer_cpus_ptr(task, ad->domains[ad->curr_level]);
+		/* grid_qos must not be NULL */
+		task->grid_qos->affinity_set(task);
 	}
 	css_task_iter_end(&it);
 }
diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile
new file mode 100644
index 000000000000..82f2a09c3c30
--- /dev/null
+++ b/kernel/sched/grid/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QOS_SCHED_SMART_GRID)  += qos.o power.o stat.o
diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h
new file mode 100644
index 000000000000..743f72aaffbf
--- /dev/null
+++ b/kernel/sched/grid/internal.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H
+#define _LINUX_SCHED_SMART_GRID_INTERNAL_H
+void qos_power_init(struct sched_grid_qos_power *power);
+void qos_stat_init(struct sched_grid_qos_stat *stat);
+#endif
diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c
new file mode 100644
index 000000000000..f916cd3801ad
--- /dev/null
+++ b/kernel/sched/grid/power.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+void qos_power_init(struct sched_grid_qos_power *power)
+{
+	power->cpufreq_sense_ratio = 0;	/* all three fields start at 0 */
+	power->target_cpufreq = 0;
+	power->cstate_sense_ratio = 0;
+}
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
new file mode 100644
index 000000000000..3fb433d213fd
--- /dev/null
+++ b/kernel/sched/grid/qos.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for Smart Grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/numa.h>
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+static int qos_affinity_set(struct task_struct *p)
+{
+	int n;
+	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;
+
+	nodes_clear(affinity->mem_preferred_node_mask);	/* rebuild from scratch */
+	/*
+	 * Keep allocations close to the CPUs: prefer each node owning a CPU of
+	 * p->prefer_cpus (to be adjusted later from memory bandwidth usage).
+	 */
+	for (n = 0; n < nr_node_ids; n++)
+		if (cpumask_intersects(cpumask_of_node(n), p->prefer_cpus))
+			node_set(n, affinity->mem_preferred_node_mask);
+
+	return 0;	/* no failure path */
+}
+
+/* Give @p private QOS state with @orig's affinity; returns 0 or -ENOMEM. */
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig)
+{
+	struct sched_grid_qos *qos = kzalloc(sizeof(*qos), GFP_KERNEL);
+
+	/* Drop any inherited pointer so a failed fork cannot kfree() it. */
+	p->grid_qos = NULL;
+	if (!qos)
+		return -ENOMEM;
+
+	qos_power_init(&qos->power);
+	qos_stat_init(&qos->stat);
+	nodes_clear(qos->affinity.mem_preferred_node_mask);
+	if (likely(orig->grid_qos))
+		qos->affinity = orig->grid_qos->affinity;
+	qos->affinity_set = qos_affinity_set;
+	p->grid_qos = qos;
+	return 0;
+}
+
+void sched_grid_qos_free(struct task_struct *p)
+{
+	kfree(p->grid_qos);	/* called from free_task(); kfree(NULL) is a no-op */
+	p->grid_qos = NULL;	/* leave no dangling pointer behind */
+}
+
+/* Next interleave nid within current's preferred nodes, else NUMA_NO_NODE */
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+#ifndef CONFIG_NUMA
+	return NUMA_NO_NODE;
+#else
+	nodemask_t nmask;
+	unsigned int next;
+	struct task_struct *me = current;
+	nodemask_t *preferred_nmask = NULL;
+
+	if (likely(me->grid_qos))
+		preferred_nmask =
+			&me->grid_qos->affinity.mem_preferred_node_mask;
+
+	if (!preferred_nmask || !policy)
+		return NUMA_NO_NODE;
+
+	if (nodes_equal(policy->v.nodes, *preferred_nmask))
+		return NUMA_NO_NODE;	/* masks match: nothing to narrow */
+	/*
+	 * Interleave only over policy nodes that are also preferred
+	 * by the grid, taking the one after me->il_prev; an empty
+	 * intersection returns NUMA_NO_NODE.
+	 */
+	nodes_and(nmask, policy->v.nodes, *preferred_nmask);
+	if (nodes_empty(nmask))
+		return NUMA_NO_NODE;
+
+	next = next_node_in(me->il_prev, nmask);
+	if (next < MAX_NUMNODES)
+		me->il_prev = next;
+	return next;
+#endif
+}
+
+/* Steer @preferred_nid into current's grid-preferred nodes when possible */
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	int nd = preferred_nid;
+	nodemask_t nmask, ndmask;
+	nodemask_t *preferred_nmask = NULL;
+
+	if (likely(current->grid_qos))
+		preferred_nmask =
+			&current->grid_qos->affinity.mem_preferred_node_mask;
+
+	if (!preferred_nmask)
+		return preferred_nid;
+
+	/*
+	 * Candidates are the grid-preferred nodes, narrowed by the
+	 * caller's @nodemask when one is given. If both masks are
+	 * identical there is nothing to steer.
+	 */
+	nmask = *preferred_nmask;
+	if (nodemask) {
+		if (nodes_equal(*nodemask, nmask))
+			return preferred_nid;
+
+		nodes_and(nmask, nmask, *nodemask);
+	}
+
+	if (node_isset(preferred_nid, nmask))
+		return preferred_nid;
+
+	/*
+	 * Prefer the node we are running on if it is a candidate,
+	 * otherwise the first candidate; with no candidate at all
+	 * keep @preferred_nid.
+	 */
+	init_nodemask_of_node(&ndmask, numa_node_id());
+	nodes_and(ndmask, nmask, ndmask);
+	if (!nodes_empty(ndmask))
+		nd = first_node(ndmask);
+	else if (!nodes_empty(nmask))
+		nd = first_node(nmask);
+
+	return nd;
+}
diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c
new file mode 100644
index 000000000000..b40c75145608
--- /dev/null
+++ b/kernel/sched/grid/stat.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+void qos_stat_init(struct sched_grid_qos_stat *stat)
+{
+	stat->sample[SCHED_GRID_QOS_IPS_INDEX].name = "ips";	/* name + own index per slot */
+	stat->sample[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX;
+	stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio";
+	stat->sample[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index =
+		SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX;
+	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width";
+	stat->sample[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index =
+		SCHED_GRID_QOS_MEMBANDWIDTH_INDEX;	/* other fields are left untouched */
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f83491d21656..659c6f0d146e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -79,6 +79,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
@@ -2373,7 +2374,14 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 	if (pol->mode == MPOL_INTERLEAVE) {
 		unsigned nid;
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+		if (smart_grid_used()) {
+			nid = sched_grid_preferred_interleave_nid(pol);
+			nid = (nid == NUMA_NO_NODE) ?
+				interleave_nid(pol, vma, addr, PAGE_SHIFT + order) : nid;
+		} else {
+			nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+		}
+
 		mpol_cond_put(pol);
 		page = alloc_page_interleave(gfp, order, nid);
 		goto out;
@@ -2427,6 +2435,8 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
+	if (smart_grid_used())
+		preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask);
 	page = __alloc_pages(gfp, order, preferred_nid, nmask);
 	mark_vma_cdm(nmask, page, vma);
 	mpol_cond_put(pol);
-- 
2.34.1