hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Introduce NUMA isolation and consolidation. If enabled, the scheduler
identifies relationships between tasks and tracks per-node NUMA resource
usage.

Use 'numa_icon=enable/disable' on the kernel command line to control the
feature.
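
As an illustration only, a sched BPF program could consult the per-node
statistics through the libbpf_sched.h wrappers added by this patch. The
helper node_has_spare_capacity(), the ~80% threshold and the include lines
below are example assumptions, not part of this patch:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>
  #include "libbpf_sched.h"

  /* Example policy: treat a node as a consolidation target while its
   * CFS utilization stays below ~80% of its compute capacity.
   */
  static __always_inline int node_has_spare_capacity(int nid)
  {
          unsigned long util = libbpf_node_cfs_util_of(nid);
          unsigned long cap = libbpf_node_cfs_capacity_of(nid);

          return util * 5 < cap * 4;
  }

  char _license[] SEC("license") = "GPL";
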
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 include/linux/sched.h          |  15 ++++
 include/uapi/linux/bpf.h       |   7 ++
 init/Kconfig                   |  13 +++
 kernel/sched/Makefile          |   1 +
 kernel/sched/bpf_sched.c       |  29 +++++++
 kernel/sched/fair.c            |  14 ++++
 kernel/sched/numa_icon.c       | 144 +++++++++++++++++++++++++++++++++
 kernel/sched/numa_icon.h       |  43 ++++++++++
 kernel/sched/sched.h           |   2 +
 scripts/bpf_helpers_doc.py     |   2 +
 tools/include/uapi/linux/bpf.h |   7 ++
 tools/lib/bpf/libbpf_sched.h   |  24 ++++++
 12 files changed, 301 insertions(+)
 create mode 100644 kernel/sched/numa_icon.c
 create mode 100644 kernel/sched/numa_icon.h
diff --git a/include/linux/sched.h b/include/linux/sched.h
index af43d8d55e1b..fa83018137ce 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2360,6 +2360,21 @@ struct bpf_sched_cpu_stats {
 	KABI_RESERVE(4)
 };
 
+struct bpf_node_stats {
+	unsigned long util;
+	unsigned long compute_capacity;
+	unsigned int weight;
+
+	KABI_RESERVE(1)
+	KABI_RESERVE(2)
+	KABI_RESERVE(3)
+	KABI_RESERVE(4)
+	KABI_RESERVE(5)
+	KABI_RESERVE(6)
+	KABI_RESERVE(7)
+	KABI_RESERVE(8)
+};
+
 struct cpumask_op_args {
 	unsigned int op_type;
 	void *arg1;
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8aba6670549c..b87934003c40 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3914,6 +3914,12 @@ union bpf_attr {
  *		set current task preferred node.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len)
+ *	Description
+ *		get resource statistics of *nid* and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4089,6 +4095,7 @@ union bpf_attr {
 	FN(nodemask_op),		\
 	FN(get_task_relationship_stats),\
 	FN(sched_set_curr_preferred_node),\
+	FN(get_node_stats),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/init/Kconfig b/init/Kconfig
index 7bcc7d5a8584..b722b7a887c1 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1090,6 +1090,19 @@ config SCHED_TASK_RELATIONSHIP
 
 	  If in doubt, say N.
 
+config QOS_SCHED_NUMA_ICON
+	bool "NUMA aware scheduling"
+	depends on BPF_SCHED
+	depends on SCHED_TASK_RELATIONSHIP
+	default n
+	help
+	  This feature provides the NUMA isolation and consolidation
+	  mechanism based on eBPF and task relationship. If enabled, the
+	  scheduler places related tasks on the same NUMA node when the
+	  node has spare resources.
+
+	  If in doubt, say N.
+
 config UCLAMP_TASK_GROUP
 	bool "Utilization clamping per group of tasks"
 	depends on CGROUP_SCHED
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 879c22e63c6c..ff9ff2c17f79 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -41,3 +41,4 @@ obj-$(CONFIG_BPF_SCHED) += bpf_sched.o
 obj-$(CONFIG_BPF_SCHED) += bpf_topology.o
 obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/
 obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o relationship_ioctl.o
+obj-$(CONFIG_QOS_SCHED_NUMA_ICON) += numa_icon.o
diff --git a/kernel/sched/bpf_sched.c b/kernel/sched/bpf_sched.c
index ac1b94ea6740..3e14d1fa911e 100644
--- a/kernel/sched/bpf_sched.c
+++ b/kernel/sched/bpf_sched.c
@@ -346,6 +346,31 @@ static const struct bpf_func_proto bpf_cpus_share_cache_proto = {
 	.arg2_type	= ARG_ANYTHING,
 };
 
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+BPF_CALL_3(bpf_get_node_stats, int, nid,
+	   struct bpf_node_stats *, ctx,
+	   int, len)
+{
+	if (len != sizeof(*ctx))
+		return -EINVAL;
+
+	if ((unsigned int)nid >= nr_node_ids)
+		return -EINVAL;
+
+	sched_get_node_load(nid, ctx);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_get_node_stats_proto = {
+	.func		= bpf_get_node_stats,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_PTR_TO_UNINIT_MEM,
+	.arg3_type	= ARG_CONST_SIZE,
+};
+#endif
+
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 BPF_CALL_3(bpf_get_task_relationship_stats, struct task_struct *, tsk,
 	   struct bpf_map *, map, struct bpf_relationship_get_args *, args)
@@ -413,6 +438,10 @@ bpf_sched_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_cpus_share_cache_proto;
 	case BPF_FUNC_nodemask_op:
 		return &bpf_nodemask_op_proto;
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+	case BPF_FUNC_get_node_stats:
+		return &bpf_get_node_stats_proto;
+#endif
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 	case BPF_FUNC_get_task_relationship_stats:
 		return &bpf_get_task_relationship_stats_proto;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c64055874a73..404358af80c7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3954,6 +3954,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	cfs_rq_util_change(cfs_rq, 0);
 
+	numa_load_change(cfs_rq);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }
 
@@ -3984,6 +3986,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 	cfs_rq_util_change(cfs_rq, 0);
 
+	numa_load_change(cfs_rq);
+
 	trace_pelt_cfs_tp(cfs_rq);
 }
 
@@ -4024,6 +4028,7 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	} else if (decayed) {
 		cfs_rq_util_change(cfs_rq, 0);
+		numa_load_change(cfs_rq);
 
 		if (flags & UPDATE_TG)
 			update_tg_load_avg(cfs_rq);
@@ -13286,6 +13291,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 	task_tick_core(rq, curr);
 
 	task_tick_relationship(rq, curr);
+
+	update_numa_capacity(rq);
 }
 
 /*
@@ -13868,6 +13875,7 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 #ifdef CONFIG_SCHED_TASK_RELATIONSHIP
 	struct net_group *net_grp;
 	struct numa_group *ng;
+	int node;
 
 	if (!task_relationship_used())
 		return;
@@ -13889,6 +13897,10 @@ void sched_show_relationship(struct task_struct *p, struct seq_file *m)
 	}
 
 	rcu_read_unlock();
+
+	for_each_online_node(node) {
+		print_node_load_info(m, node);
+	}
 #endif
 }
 #endif /* CONFIG_SCHED_DEBUG */
@@ -13959,6 +13971,8 @@ __init void init_sched_fair_class(void)
 		INIT_LIST_HEAD(&per_cpu(qos_throttled_cfs_rq, i));
 #endif
 
+	init_sched_numa_icon();
+
 #ifdef CONFIG_SMP
 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
 
diff --git a/kernel/sched/numa_icon.c b/kernel/sched/numa_icon.c
new file mode 100644
index 000000000000..e9825ac7f866
--- /dev/null
+++ b/kernel/sched/numa_icon.c
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * Common code for task numa isolation consolidation
+ *
+ * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd
+ *
+ * Author: Hui Tang <tanghui20@huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ */
+#include "sched.h"
+
+static bool __sched_numa_icon_switch __initdata;
+DEFINE_STATIC_KEY_FALSE(sched_numa_icon_switch);
+
+struct node_load_info *node_load_ptr;
+
+static void set_numa_icon_switch(bool enabled)
+{
+	if (enabled) {
+		static_branch_enable(&sched_numa_icon_switch);
+		task_relationship_enable();
+	} else {
+		static_branch_disable(&sched_numa_icon_switch);
+		task_relationship_disable();
+	}
+}
+
+static int __init numa_icon_switch_setup(char *str)
+{
+	int ret = 0;
+
+	if (!str)
+		goto out;
+
+	/*
+	 * This code is called before jump labels have been set up, so we can't
+	 * change the static branch directly just yet. Instead set a temporary
+	 * variable so init_sched_numa_icon() can do it later.
+	 */
+	if (!strcmp(str, "enable")) {
+		__sched_numa_icon_switch = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		__sched_numa_icon_switch = false;
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse numa_icon=\n");
+
+	return ret;
+}
+__setup("numa_icon=", numa_icon_switch_setup);
+
+__init void init_sched_numa_icon(void)
+{
+	int i;
+
+	set_numa_icon_switch(__sched_numa_icon_switch);
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	node_load_ptr = kcalloc(nr_node_ids, sizeof(struct node_load_info),
+				GFP_KERNEL);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		raw_spin_lock_init(&node_load_ptr[i].lock);
+		node_load_ptr[i].util_avg_last =
+			kcalloc(nr_cpu_ids, sizeof(struct sched_avg), GFP_KERNEL);
+	}
+
+	for_each_possible_cpu(i) {
+		node_load_ptr[cpu_to_node(i)].compute_capacity +=
+			SCHED_CAPACITY_SCALE;
+	}
+}
+
+void print_node_load_info(struct seq_file *m, int node)
+{
+	if (!sched_numa_icon_enabled())
+		return;
+
+	seq_printf(m, "node %d capacity=%lu util_avg=%lu\n", node,
+		   node_load_ptr[node].compute_capacity,
+		   atomic_long_read(&node_load_ptr[node].util_avg));
+}
+
+void numa_load_change(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	int cpu = cpu_of(rq);
+	int nid = cpu_to_node(cpu);
+	struct sched_avg *avg_old;
+	long delta;
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	avg_old = &node_load_ptr[nid].util_avg_last[cpu];
+
+	if (&rq->cfs != cfs_rq)
+		return;
+
+	delta = cfs_rq->avg.util_avg - avg_old->util_avg;
+	atomic_long_add(delta, &node_load_ptr[nid].util_avg);
+	avg_old->util_avg = cfs_rq->avg.util_avg;
+}
+
+void update_numa_capacity(struct rq *rq)
+{
+	int cpu = cpu_of(rq);
+	int nid = cpu_to_node(cpu);
+	unsigned long capacity = 0;
+
+	if (!sched_numa_icon_enabled())
+		return;
+
+	if (cpu != cpumask_first(cpumask_of_node(nid)))
+		return;
+
+	for_each_cpu(cpu, cpumask_of_node(nid)) {
+		capacity += cpu_rq(cpu)->cpu_capacity;
+	}
+	node_load_ptr[nid].compute_capacity = capacity;
+}
+
+#ifdef CONFIG_BPF_SCHED
+void sched_get_node_load(int nid, struct bpf_node_stats *ctx)
+{
+	ctx->util = atomic_long_read(&node_load_ptr[nid].util_avg);
+	ctx->compute_capacity = node_load_ptr[nid].compute_capacity;
+	ctx->weight = cpumask_weight(cpumask_of_node(nid));
+}
+#endif
diff --git a/kernel/sched/numa_icon.h b/kernel/sched/numa_icon.h
new file mode 100644
index 000000000000..adeed53e9f14
--- /dev/null
+++ b/kernel/sched/numa_icon.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SCHED_NUMA_ICON_H
+#include <linux/sched.h>
+
+struct node_load_info {
+	raw_spinlock_t		lock ____cacheline_aligned;
+	atomic_long_t		util_avg;
+	unsigned long		compute_capacity;
+	struct sched_avg	*util_avg_last;
+};
+
+#ifdef CONFIG_QOS_SCHED_NUMA_ICON
+extern struct static_key_false sched_numa_icon_switch;
+static __always_inline bool sched_numa_icon_enabled(void)
+{
+	return static_branch_unlikely(&sched_numa_icon_switch);
+}
+
+extern void print_node_load_info(struct seq_file *m, int node);
+extern __init void init_sched_numa_icon(void);
+extern void sched_get_node_load(int nid, struct bpf_node_stats *ctx);
+extern void init_node_load(struct rq *rq);
+extern void numa_load_change(struct cfs_rq *cfs_rq);
+extern void update_numa_capacity(struct rq *rq);
+
+#else /* !CONFIG_QOS_SCHED_NUMA_ICON */
+static inline void init_sched_numa_icon(void) {}
+
+static inline void init_node_load(struct rq *rq) {}
+
+static inline void numa_load_change(struct cfs_rq *cfs_rq) {}
+
+static inline void update_numa_capacity(struct rq *rq) {}
+
+static inline void print_node_load_info(struct seq_file *m, int node) {}
+
+static __always_inline bool sched_numa_icon_enabled(void)
+{
+	return false;
+}
+#endif /* CONFIG_QOS_SCHED_NUMA_ICON */
+
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6f934af7062..3b2fc472908a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -76,6 +76,8 @@
 
 #include "cpupri.h"
 #include "cpudeadline.h"
+#include "numa_icon.h"
+#include <uapi/linux/sched_ctrl.h>
 
 #include <trace/events/sched.h>
 
diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py
index 4f6fac621f65..3afc3e354844 100755
--- a/scripts/bpf_helpers_doc.py
+++ b/scripts/bpf_helpers_doc.py
@@ -449,6 +449,7 @@ class PrinterHelpers(Printer):
             'struct bpf_relationship_get_args',
             'struct bpf_relationship_set_args',
             'struct sched_preferred_node_ctx',
+            'struct bpf_node_stats',
     ]
     known_types = {
             '...',
@@ -506,6 +507,7 @@ class PrinterHelpers(Printer):
             'struct bpf_relationship_get_args',
             'struct bpf_relationship_set_args',
             'struct sched_preferred_node_ctx',
+            'struct bpf_node_stats',
     }
     mapped_types = {
             'u8': '__u8',
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5c04747f201c..5a153a1a8f18 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3914,6 +3914,12 @@ union bpf_attr {
  *		set current task preferred node.
  *	Return
  *		0 on success, or a negative error in case of failure.
+ *
+ * int bpf_get_node_stats(int nid, struct bpf_node_stats *ctx, int len)
+ *	Description
+ *		get resource statistics of *nid* and store in *ctx*.
+ *	Return
+ *		0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN)		\
 	FN(unspec),			\
@@ -4089,6 +4095,7 @@ union bpf_attr {
 	FN(nodemask_op),		\
 	FN(get_task_relationship_stats),\
 	FN(sched_set_curr_preferred_node),\
+	FN(get_node_stats),		\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
diff --git a/tools/lib/bpf/libbpf_sched.h b/tools/lib/bpf/libbpf_sched.h
index 04af73b92856..3e9b41788637 100644
--- a/tools/lib/bpf/libbpf_sched.h
+++ b/tools/lib/bpf/libbpf_sched.h
@@ -623,6 +623,30 @@ static __always_inline int libbpf_sched_se_tag_of(struct sched_entity *se)
 	return se_tag;
 }
 
+static __always_inline unsigned long libbpf_node_cfs_util_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.util);
+}
+
+static __always_inline unsigned long libbpf_node_cfs_capacity_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.compute_capacity);
+}
+
+static __always_inline unsigned int libbpf_node_weight_of(int nid)
+{
+	struct bpf_node_stats stats = {0};
+
+	bpf_get_node_stats(nid, &stats, sizeof(stats));
+	return getVal(stats.weight);
+}
+
 static __always_inline int libbpf_mem_preferred_nid(struct task_struct *tsk,
 						    nodemask_t *preferred_node)
 {