[PATCH OLK-6.6 v6 2/8] sched: smart grid: init sched_grid_qos structure on QOS purpose

17 Jan 2024

From: Wang ShaoBo <bobo.shaobowang@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8WMOG
CVE: NA

----------------------------------------

As smart grid scheduling (SGS) may shrink resources and affect task QOS,
we provide methods for evaluating task QOS in a divided grid. We mainly
focus on the following two aspects:

   1. Evaluate whether resources (such as CPU or memory) meet our demand
   2. Ensure the least impact when working with the cpufreq and cpuidle
      governors

To tackle these questions, we have summarized several sampling methods
to obtain tasks' characteristics while reducing scheduling noise as much
as possible:

  1. We detect the key factors that determine how sensitive a process is
     to cpufreq or cpuidle adjustment, and use them to guide the
     cpufreq/cpuidle governor
  2. We dynamically monitor process memory bandwidth and adjust memory
     allocation to minimize remote (cross-node) memory access
  3. We provide a variety of load tracking mechanisms to adapt to the
     load changes of different types of tasks

     ---------------------------------     -----------------
    |            class A              |   |     class B     |
    |    --------        --------     |   |     --------    |
    |   | group0 |      | group1 |    |---|    | group2 |   |
    |    --------        --------     |   |     --------    |
    |    CPU/memory sensitive type    |   |   balance type  |
     ----------------+----------------     ------+-------+--
                     v                           v       | (target cpufreq)
     ----------------------------------------------      | (sensitivity)
    |              Not satisfied with QOS?         |     |
     --------------------------+-------------------      |
                               v                         v
     ----------------------------------------------     ----------------
    |              expand or shrink resource       |<--|  energy model  |
     ------------------------+---------------------     ----------------
                             v                           |
     -----------      -----------      ------------      v
    |           |    |           |    |            |    ---------------
    |   GRID0   +----+   GRID1   +----+   GRID2    |<--|   governor    |
    |           |    |           |    |            |    ---------------
     -----------      -----------      ------------
                   \            |            /
                    \  -------------------  /
                      |  pages migration  |
                       -------------------

We will introduce the energy model in a follow-up implementation, and
consider dynamic affinity adjustment between the divided grids at
runtime.

Signed-off-by: Wang ShaoBo <bobo.shaobowang@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Zhang Changzhong <zhangchangzhong@huawei.com>
Signed-off-by: Yipeng Zou <zouyipeng@huawei.com>
---
 include/linux/sched.h          |   5 ++
 include/linux/sched/grid_qos.h | 104 ++++++++++++++++++++++
 kernel/fork.c                  |  12 +++
 kernel/sched/Makefile          |   1 +
 kernel/sched/grid/Makefile     |   2 +
 kernel/sched/grid/internal.h   |   6 ++
 kernel/sched/grid/power.c      |  27 ++++++
 kernel/sched/grid/qos.c        | 156 +++++++++++++++++++++++++++++++++
 kernel/sched/grid/stat.c       |  32 +++++++
 mm/mempolicy.c                 |  11 ++-
 10 files changed, 354 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/sched/grid_qos.h
 create mode 100644 kernel/sched/grid/Makefile
 create mode 100644 kernel/sched/grid/internal.h
 create mode 100644 kernel/sched/grid/power.c
 create mode 100644 kernel/sched/grid/qos.c
 create mode 100644 kernel/sched/grid/stat.c

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b43345aac49d..569653f9b420 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1555,6 +1555,11 @@ struct task_struct {
 #ifdef CONFIG_PSI_FINE_GRAINED
 	int memstall_type;
 #endif
+
+#if defined(CONFIG_QOS_SCHED_SMART_GRID) && !defined(__GENKSYMS__)
+		struct sched_grid_qos *grid_qos;
+#endif
+
 	/*
 	 * New fields for task_struct should be added above here, so that
 	 * they are included in the randomized portion of task_struct.
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
new file mode 100644
index 000000000000..23d08dbb6ae6
--- /dev/null
+++ b/include/linux/sched/grid_qos.h
@@ -0,0 +1,104 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_GRID_QOS_H
+#define _LINUX_SCHED_GRID_QOS_H
+#include <linux/nodemask.h>
+#include <linux/sched.h>
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+enum sched_grid_qos_class {
+	SCHED_GRID_QOS_CLASS_LEVEL_1 = 0,
+	SCHED_GRID_QOS_CLASS_LEVEL_2 = 1,
+	SCHED_GRID_QOS_CLASS_LEVEL_3 = 2,
+	SCHED_GRID_QOS_CLASS_LEVEL_4 = 3,
+	SCHED_GRID_QOS_CLASS_LEVEL_5 = 4,
+	SCHED_GRID_QOS_CLASS_LEVEL_6 = 5,
+	SCHED_GRID_QOS_CLASS_LEVEL_7 = 6,
+	SCHED_GRID_QOS_CLASS_LEVEL_8 = 7,
+	SCHED_GRID_QOS_CLASS_LEVEL_NR
+};
+
+enum {
+	SCHED_GRID_QOS_IPS_INDEX = 0,
+	SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX = 1,
+	SCHED_GRID_QOS_MEMBANDWIDTH_INDEX = 2,
+	SCHED_GRID_QOS_SAMPLE_NR
+};
+
+#define SCHED_GRID_QOS_RING_BUFFER_MAXLEN 100
+
+struct sched_grid_qos_ring_buffer {
+	u64 vecs[SCHED_GRID_QOS_RING_BUFFER_MAXLEN];
+	unsigned int head;
+	void (*push)(u64 *data, int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);
+};
+
+struct sched_grid_qos_sample {
+	const char *name;
+	int index;
+	int sample_bypass;
+	int sample_times;
+	struct sched_grid_qos_ring_buffer ring_buffer;
+	u64 pred_target[MAX_NUMNODES];
+	void (*cal_target)(int stepsize,
+		struct sched_grid_qos_ring_buffer *ring_buffer);
+
+	int account_ready;
+	int (*start)(void *arg);
+	int (*account)(void *arg);
+};
+
+struct sched_grid_qos_stat {
+	enum sched_grid_qos_class class_lvl;
+	int (*set_class_lvl)(struct sched_grid_qos_stat *qos_stat);
+	struct sched_grid_qos_sample sample[SCHED_GRID_QOS_SAMPLE_NR];
+};
+
+struct sched_grid_qos_power {
+	int cpufreq_sense_ratio;
+	int target_cpufreq;
+	int cstate_sense_ratio;
+};
+
+struct sched_grid_qos_affinity {
+	nodemask_t mem_preferred_node_mask;	/* nodes preferred for memory */
+	const struct cpumask *prefer_cpus;	/* CPUs the mask was built for */
+};
+
+struct task_struct;
+struct sched_grid_qos {
+	struct sched_grid_qos_stat stat;	/* set up by qos_stat_init() */
+	struct sched_grid_qos_power power;	/* set up by qos_power_init() */
+	struct sched_grid_qos_affinity affinity; /* inherited at fork if any */
+
+	int (*affinity_set)(struct task_struct *p); /* hook set at fork time */
+};
+
+static inline int sched_qos_affinity_set(struct task_struct *p)
+{
+	return p->grid_qos ? p->grid_qos->affinity_set(p) : 0; /* NULL-safe */
+}
+
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
+void sched_grid_qos_free(struct task_struct *p);
+
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy);
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask);
+#else
+static inline int
+sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+	return NUMA_NO_NODE;
+}
+static inline int
+sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	return preferred_nid;
+}
+
+static inline int sched_qos_affinity_set(struct task_struct *p)
+{
+	return 0;
+}
+#endif
+#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index a1cd8930c3e1..72247526a384 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -99,6 +99,9 @@
 #include <linux/stackprotector.h>
 #include <linux/user_events.h>
 #include <linux/iommu.h>
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+#include <linux/sched/grid_qos.h>
+#endif
 #include <linux/share_pool.h>
 
 #include <asm/pgalloc.h>
@@ -628,6 +631,9 @@ void free_task(struct task_struct *tsk)
 #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY
 	if (dynamic_affinity_enabled())
 		sched_prefer_cpus_free(tsk);
+#endif
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	sched_grid_qos_free(tsk);
 #endif
 	free_task_struct(tsk);
 }
@@ -2389,6 +2395,12 @@ __latent_entropy struct task_struct *copy_process(
 	}
 	current->flags &= ~PF_NPROC_EXCEEDED;
 
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	retval = sched_grid_qos_fork(p, current);
+	if (retval)
+		goto bad_fork_cleanup_count;
+#endif
+
 	/*
 	 * If multiple threads are within copy_process(), then this check
 	 * triggers too late. This doesn't hurt, the check is only there
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 976092b7bd45..cd0be22a94fd 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -32,3 +32,4 @@ obj-y += core.o
 obj-y += fair.o
 obj-y += build_policy.o
 obj-y += build_utility.o
+obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
diff --git a/kernel/sched/grid/Makefile b/kernel/sched/grid/Makefile
new file mode 100644
index 000000000000..82f2a09c3c30
--- /dev/null
+++ b/kernel/sched/grid/Makefile
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_QOS_SCHED_SMART_GRID)  += qos.o power.o stat.o
diff --git a/kernel/sched/grid/internal.h b/kernel/sched/grid/internal.h
new file mode 100644
index 000000000000..743f72aaffbf
--- /dev/null
+++ b/kernel/sched/grid/internal.h
@@ -0,0 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_SMART_GRID_INTERNAL_H
+#define _LINUX_SCHED_SMART_GRID_INTERNAL_H
+void qos_power_init(struct sched_grid_qos_power *power);
+void qos_stat_init(struct sched_grid_qos_stat *stat);
+#endif
diff --git a/kernel/sched/grid/power.c b/kernel/sched/grid/power.c
new file mode 100644
index 000000000000..f916cd3801ad
--- /dev/null
+++ b/kernel/sched/grid/power.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+void qos_power_init(struct sched_grid_qos_power *power)
+{
+	const struct sched_grid_qos_power zeroed = { 0 };
+
+	*power = zeroed;	/* every power hint starts out as zero */
+}
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
new file mode 100644
index 000000000000..4d36c3640753
--- /dev/null
+++ b/kernel/sched/grid/qos.c
@@ -0,0 +1,156 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for Smart Grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/nodemask.h>
+#include <linux/mempolicy.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/numa.h>
+#include <linux/sched/cputime.h>
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+static inline int qos_affinity_set(struct task_struct *p)
+{
+	struct sched_grid_qos_affinity *aff = &p->grid_qos->affinity;
+	int nid;
+
+	if (likely(aff->prefer_cpus == p->select_cpus))
+		return 0;	/* mask already built for these CPUs */
+
+	/*
+	 * Keep memory allocation as close to the selected CPUs as
+	 * possible; adjust after getting memory bandwidth usage.
+	 */
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		if (!cpumask_intersects(cpumask_of_node(nid), p->select_cpus))
+			node_clear(nid, aff->mem_preferred_node_mask);
+		else
+			node_set(nid, aff->mem_preferred_node_mask);
+	}
+
+	aff->prefer_cpus = p->select_cpus;
+	return 0;
+}
+
+int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig)
+{
+	struct sched_grid_qos *qos;
+
+	/* free_task() kfree()s p->grid_qos: drop any stale pointer first. */
+	p->grid_qos = NULL;
+	qos = kzalloc(sizeof(*qos), GFP_KERNEL);
+	if (!qos)
+		return -ENOMEM;
+
+	qos_power_init(&qos->power);
+	qos_stat_init(&qos->stat);
+	/* kzalloc() zeroed the node mask; inherit orig's affinity if any. */
+	if (likely(orig->grid_qos))
+		qos->affinity = orig->grid_qos->affinity;
+	qos->affinity_set = qos_affinity_set;
+	p->grid_qos = qos;
+	return 0;
+}
+
+void sched_grid_qos_free(struct task_struct *p)
+{
+	kfree(p->grid_qos);	/* kfree(NULL) is a no-op, so no guard needed */
+	p->grid_qos = NULL;	/* never leave a dangling pointer in the task */
+}
+
+/* Pick the next interleave node among the task's preferred nodes, if any. */
+int sched_grid_preferred_interleave_nid(struct mempolicy *policy)
+{
+#ifndef CONFIG_NUMA
+	return NUMA_NO_NODE;
+#else
+	struct task_struct *tsk = current;
+	nodemask_t *pref;
+	nodemask_t both;
+	unsigned int nid;
+
+	/* No QOS data or no policy: let the caller use its default. */
+	if (!policy || unlikely(!tsk->grid_qos))
+		return NUMA_NO_NODE;
+
+	/* The policy already covers exactly the preferred nodes. */
+	pref = &tsk->grid_qos->affinity.mem_preferred_node_mask;
+	if (nodes_equal(policy->nodes, *pref))
+		return NUMA_NO_NODE;
+
+	/*
+	 * Interleave only over nodes that are both allowed by the policy
+	 * and preferred for this task; give up if there is no such node.
+	 */
+	nodes_and(both, policy->nodes, *pref);
+	if (nodes_empty(both))
+		return NUMA_NO_NODE;
+
+	/* Continue from the last interleave node recorded in the task. */
+	nid = next_node_in(tsk->il_prev, both);
+	if (nid < MAX_NUMNODES)
+		tsk->il_prev = nid;
+	return nid;
+#endif
+}
+
+/* Steer a preferred nid towards the task's preferred memory nodes. */
+int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
+{
+	int nd = preferred_nid;
+	nodemask_t nmask, ndmask;
+	nodemask_t *preferred_nmask = NULL;
+
+	if (likely(current->grid_qos))
+		preferred_nmask =
+			&current->grid_qos->affinity.mem_preferred_node_mask;
+
+	if (!preferred_nmask)
+		return preferred_nid;
+
+	/*
+	 * Narrow the task's preferred nodes to those the caller allows.
+	 * If both masks are identical there is nothing to refine, so
+	 * keep the caller's choice.
+	 */
+	nmask = *preferred_nmask;
+	if (nodemask) {
+		if (nodes_equal(*nodemask, nmask))
+			return preferred_nid;
+
+		nodes_and(nmask, nmask, *nodemask);
+	}
+
+	if (node_isset(preferred_nid, nmask))
+		return preferred_nid;
+
+	/*
+	 * Otherwise prefer the NUMA node we are running on if it is in
+	 * the candidate mask, else the first candidate node; if there
+	 * is no candidate at all, keep the caller's preferred nid.
+	 */
+	init_nodemask_of_node(&ndmask, numa_node_id());
+	nodes_and(ndmask, nmask, ndmask);
+	if (!nodes_empty(ndmask))
+		nd = first_node(ndmask);
+	else if (!nodes_empty(nmask))
+		nd = first_node(nmask);
+
+	return nd;
+}
diff --git a/kernel/sched/grid/stat.c b/kernel/sched/grid/stat.c
new file mode 100644
index 000000000000..b40c75145608
--- /dev/null
+++ b/kernel/sched/grid/stat.c
@@ -0,0 +1,32 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for QOS-aware smart grid Scheduling
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Wang Shaobo <bobo.shaobowang@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+#include <linux/sched/grid_qos.h>
+#include "internal.h"
+
+void qos_stat_init(struct sched_grid_qos_stat *stat)
+{
+	struct sched_grid_qos_sample *s = stat->sample;	/* one slot per index */
+
+	s[SCHED_GRID_QOS_IPS_INDEX].index = SCHED_GRID_QOS_IPS_INDEX;
+	s[SCHED_GRID_QOS_IPS_INDEX].name = "ips";
+	s[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].index = SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX;
+	s[SCHED_GRID_QOS_MEMBOUND_RATIO_INDEX].name = "membound_ratio";
+	s[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].index = SCHED_GRID_QOS_MEMBANDWIDTH_INDEX;
+	s[SCHED_GRID_QOS_MEMBANDWIDTH_INDEX].name = "memband_width";
+}
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b23a239de750..707dde78d753 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -79,6 +79,7 @@
 #include <linux/sched/mm.h>
 #include <linux/sched/numa_balancing.h>
 #include <linux/sched/task.h>
+#include <linux/sched/grid_qos.h>
 #include <linux/nodemask.h>
 #include <linux/cpuset.h>
 #include <linux/slab.h>
@@ -2200,9 +2201,13 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	if (pol->mode == MPOL_INTERLEAVE) {
 		struct page *page;
-		unsigned nid;
+		int nid = NUMA_NO_NODE;
+
+		if (smart_grid_used())
+			nid = sched_grid_preferred_interleave_nid(pol);
+		if (nid == NUMA_NO_NODE)
+			nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 
-		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
 		mpol_cond_put(pol);
 		gfp |= __GFP_COMP;
 		page = alloc_page_interleave(gfp, order, nid);
@@ -2267,6 +2272,8 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
 
 	nmask = policy_nodemask(gfp, pol);
 	preferred_nid = policy_node(gfp, pol, node);
+	if (smart_grid_used())
+		preferred_nid = sched_grid_preferred_nid(preferred_nid, nmask);
 	folio = __folio_alloc(gfp, order, preferred_nid, nmask);
 	mpol_cond_put(pol);
 out:
-- 
2.34.1