hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Threads may be related to one another, for example through network communication or memory sharing.
Related threads generally perform better when they are scheduled on the same SMT core, cluster or NUMA node, because they share certain resources.
This patch adds a mechanism to the scheduler that identifies and maintains such affinity relationships. The memory and network parts are implemented here; other relationship types can be added in the future.
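As a minimal usage sketch (not part of this patch), a caller such as a network-stack hook could report that two local tasks exchanged data by filling a struct net_relationship_req and submitting it; the helper name report_local_rxtx and its arguments are hypothetical, while the struct, the NET_RS_TYPE_LOCAL type and sched_net_relationship_submit() come from this patch:

  #include <linux/sched/relationship.h>

  /* Illustrative only: report a local rx/tx pairing between two tasks. */
  static void report_local_rxtx(pid_t rx_pid, pid_t tx_pid, unsigned long bytes)
  {
          struct net_relationship_req req = {
                  .net_rship_type = NET_RS_TYPE_LOCAL,
                  .rx_pid         = rx_pid,
                  .tx_pid         = tx_pid,
                  .rxtx_bytes     = bytes,
          };

          /* Queues task work on one of the two tasks to group them;
           * the return value (e.g. -ESRCH, -EBUSY) is ignored in this sketch.
           */
          sched_net_relationship_submit(&req);
  }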
Signed-off-by: Hui Tang <tanghui20@huawei.com>
---
 fs/exec.c                          |   2 +
 include/linux/sched.h              |   5 +
 include/linux/sched/relationship.h | 137 +++++++++++
 init/Kconfig                       |  10 +
 init/init_task.c                   |   3 +
 kernel/fork.c                      |  13 +
 kernel/sched/Makefile              |   1 +
 kernel/sched/debug.c               |   2 +
 kernel/sched/fair.c                |  80 +++++-
 kernel/sched/relationship.c        | 379 +++++++++++++++++++++++++++++
 10 files changed, 631 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/sched/relationship.h
 create mode 100644 kernel/sched/relationship.c
diff --git a/fs/exec.c b/fs/exec.c index 981b3ac90c44..792d62632e92 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -38,6 +38,7 @@ #include <linux/sched/coredump.h> #include <linux/sched/signal.h> #include <linux/sched/numa_balancing.h> +#include <linux/sched/relationship.h> #include <linux/sched/task.h> #include <linux/pagemap.h> #include <linux/perf_event.h> @@ -1822,6 +1823,7 @@ static int bprm_execve(struct linux_binprm *bprm, rseq_execve(current); acct_update_integrals(current); task_numa_free(current, false); + task_relationship_free(current, true); return retval;
out: diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d5553f70401..af43d8d55e1b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -26,6 +26,7 @@ #include <linux/resource.h> #include <linux/latencytop.h> #include <linux/sched/prio.h> +#include <linux/sched/relationship.h> #include <linux/sched/types.h> #include <linux/signal_types.h> #include <linux/mm_types_task.h> @@ -1468,7 +1469,11 @@ struct task_struct { #else KABI_RESERVE(13) #endif +#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && !defined(__GENKSYMS__) + KABI_USE(14, struct task_relationship *rship) +#else KABI_RESERVE(14) +#endif KABI_RESERVE(15) KABI_RESERVE(16) KABI_AUX_PTR(task_struct) diff --git a/include/linux/sched/relationship.h b/include/linux/sched/relationship.h new file mode 100644 index 000000000000..df3f3f7814cd --- /dev/null +++ b/include/linux/sched/relationship.h @@ -0,0 +1,137 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_RELATIONSHIP_H +#define _LINUX_SCHED_RELATIONSHIP_H + +#include <linux/nodemask.h> +#include <linux/jump_label.h> +#include <linux/refcount.h> + +#define FAULT_NODES_MAX 4 + +struct task_struct; +struct rq; + +#ifdef CONFIG_SCHED_DEBUG +struct seq_file; +#endif + +struct fault_array_info { + int nid; + unsigned long val; +}; + +struct relationship_hdr { + refcount_t refcount; + spinlock_t lock; + int nr_tasks; + int gid; + nodemask_t preferred_nid; +}; + +enum net_req_type { + NET_RS_TYPE_INVALID = 0, + NET_RS_TYPE_LOCAL, + NET_RS_TYPE_RX, + NET_RS_TYPE_TX, + NET_RS_TYPE_MAX +}; + +struct net_relationship_req { + enum net_req_type net_rship_type; + pid_t rx_pid; + pid_t tx_pid; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + u64 rx_dev_netns_cookie; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct net_relationship_callback { + struct callback_head twork; + atomic_t active; + pid_t src_pid; + struct net_relationship_req req; +}; + +struct net_group { + struct rcu_head rcu; + struct relationship_hdr hdr; + unsigned long rxtx_bytes; + + /* reserved */ + unsigned long rxtx_cnt; +}; + +struct numa_fault_ext { + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; +}; + +struct task_relationship { + /* network relationship */ + struct net_group __rcu *net_group; + spinlock_t net_lock; + int nic_nid; + int rx_dev_idx; + int rx_dev_queue_idx; + unsigned long rx_dev_netns_cookie; + unsigned long rxtx_remote_bytes; + unsigned long rxtx_remote_update_next; + unsigned long rxtx_remote_buffer; + unsigned long rxtx_bytes; + unsigned long rxtx_buffer; + unsigned long rxtx_update_next; + struct net_relationship_callback cb; + + /* extras numa fault data */ + struct numa_fault_ext faults; +}; + +extern void task_relationship_enable(void); +extern void task_relationship_disable(void); + +#ifdef CONFIG_SCHED_DEBUG +extern void sched_show_relationship(struct task_struct *p, struct seq_file *m); +#endif + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +extern int sched_relationship_fork(struct task_struct *p); +extern void sched_relationship_free(struct task_struct *p); +void task_relationship_free(struct task_struct *tsk, bool reset); +extern bool task_relationship_supported(struct task_struct *tsk); +extern int sched_net_relationship_submit(struct net_relationship_req *req); +extern void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats); + +DECLARE_STATIC_KEY_FALSE(__relationship_switch); +static inline bool task_relationship_used(void) +{ + return 
static_branch_unlikely(&__relationship_switch); +} +#else +static inline bool task_relationship_used(void) +{ + return false; +} + +static inline int sched_relationship_fork(struct task_struct *p) +{ + return 0; +} + +static inline void sched_relationship_free(struct task_struct *p) {} + +static inline void +task_relationship_free(struct task_struct *tsk, bool reset) {} + +static inline int +sched_net_relationship_submit(struct net_relationship_req *req) +{ + return 0; +} +#endif + +#endif diff --git a/init/Kconfig b/init/Kconfig index 325c02d4a4df..ea9a6e93155b 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1081,6 +1081,16 @@ config QOS_SCHED_DYNAMIC_AFFINITY of taskgroup is below threshold setted, otherwise make taskgroup to use cpus allowed.
+config SCHED_TASK_RELATIONSHIP + bool "task relationship" + depends on NUMA_BALANCING + default n + help + This feature enables the scheduler to identify tasks relationship by + page fault, SPE, socket and other IPC method. + + If in doubt, say N. + config UCLAMP_TASK_GROUP bool "Utilization clamping per group of tasks" depends on CGROUP_SCHED diff --git a/init/init_task.c b/init/init_task.c index fa8838c2c203..3b846f8223d9 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -217,6 +217,9 @@ struct task_struct init_task #ifdef CONFIG_QOS_SCHED_DYNAMIC_AFFINITY .prefer_cpus = NULL, #endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + .rship = NULL, +#endif #ifdef CONFIG_SECCOMP_FILTER .seccomp = { .filter_count = ATOMIC_INIT(0) }, #endif diff --git a/kernel/fork.c b/kernel/fork.c index 079b718131b0..12db99751381 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -476,6 +476,8 @@ void free_task(struct task_struct *tsk) #ifdef CONFIG_QOS_SCHED_SMART_GRID sched_grid_qos_free(tsk); #endif + if (task_relationship_used()) + sched_relationship_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); @@ -748,6 +750,7 @@ void __put_task_struct(struct task_struct *tsk) io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); + task_relationship_free(tsk, false); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); @@ -949,6 +952,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->prefer_cpus = NULL; #endif
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + tsk->rship = NULL; +#endif + setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); @@ -2102,6 +2109,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_count; #endif
+ if (task_relationship_used()) { + retval = sched_relationship_fork(p); + if (retval) + goto bad_fork_cleanup_count; + } + /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a6fe0ee09917..114dc36320c6 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -40,3 +40,4 @@ obj-$(CONFIG_SCHED_CORE) += core_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_sched.o obj-$(CONFIG_BPF_SCHED) += bpf_topology.o obj-$(CONFIG_QOS_SCHED_SMART_GRID) += grid/ +obj-$(CONFIG_SCHED_TASK_RELATIONSHIP) += relationship.o diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 00f01518bbdd..5233ba9fdc69 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -1040,6 +1040,8 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, }
sched_show_numa(p, m); + + sched_show_relationship(p, m); }
void proc_sched_set_task(struct task_struct *p) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20f971b7df19..f9aa00ec559e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1086,6 +1086,11 @@ struct numa_group { struct rcu_head rcu; unsigned long total_faults; unsigned long max_faults_cpu; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct fault_array_info score_ordered[FAULT_NODES_MAX]; + struct fault_array_info faults_ordered[FAULT_NODES_MAX]; + nodemask_t preferred_nid; +#endif /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -2279,6 +2284,9 @@ static int preferred_group_nid(struct task_struct *p, int nid) { nodemask_t nodes; int dist; +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct numa_group *ng; +#endif
/* Direct connections between all NUMA nodes. */ if (sched_numa_topology_type == NUMA_DIRECT) @@ -2301,7 +2309,19 @@ static int preferred_group_nid(struct task_struct *p, int nid) max_score = score; max_node = node; } +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + ng = deref_curr_numa_group(p); + if (ng) { + spin_lock_irq(&ng->lock); + numa_faults_update_and_sort(node, score, + ng->score_ordered); + spin_unlock_irq(&ng->lock); + } + } +#endif } + return max_node; }
@@ -2451,6 +2471,17 @@ static void task_numa_placement(struct task_struct *p) max_faults = group_faults; max_nid = nid; } + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + numa_faults_update_and_sort(nid, faults, + p->rship->faults.faults_ordered); + + if (ng) + numa_faults_update_and_sort(nid, group_faults, + ng->faults_ordered); + } +#endif }
if (ng) { @@ -2512,6 +2543,16 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
grp->nr_tasks++; rcu_assign_pointer(p->numa_group, grp); + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + grp->preferred_nid = NODE_MASK_NONE; + for (i = 0; i < FAULT_NODES_MAX; i++) { + grp->faults_ordered[i].nid = -1; + grp->score_ordered[i].nid = -1; + } + } +#endif }
rcu_read_lock(); @@ -2623,6 +2664,15 @@ void task_numa_free(struct task_struct *p, bool final) p->total_numa_faults = 0; for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) numa_faults[i] = 0; + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + if (task_relationship_used()) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + p->rship->faults.faults_ordered[i].nid = -1; + p->rship->faults.faults_ordered[i].val = 0; + } + } +#endif } }
@@ -13707,7 +13757,7 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) struct numa_group *ng;
rcu_read_lock(); - ng = rcu_dereference(p->numa_group); + for_each_online_node(node) { if (p->numa_faults) { tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)]; @@ -13722,6 +13772,34 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) rcu_read_unlock(); } #endif /* CONFIG_NUMA_BALANCING */ + +void sched_show_relationship(struct task_struct *p, struct seq_file *m) +{ +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + struct net_group *net_grp; + struct numa_group *ng; + + if (!task_relationship_used()) + return; + + rcu_read_lock(); + + ng = rcu_dereference(p->numa_group); + if (ng) { + seq_printf(m, "numa group preferred nid %*pbl\n", + nodemask_pr_args(&ng->preferred_nid)); + } + + net_grp = rcu_dereference(p->rship->net_group); + if (net_grp) { + seq_printf(m, "net group gid %d preferred nid %*pbl\n", + net_grp->hdr.gid, + nodemask_pr_args(&net_grp->hdr.preferred_nid)); + } + + rcu_read_unlock(); +#endif +} #endif /* CONFIG_SCHED_DEBUG */
__init void init_sched_fair_class(void) diff --git a/kernel/sched/relationship.c b/kernel/sched/relationship.c new file mode 100644 index 000000000000..01879e3272de --- /dev/null +++ b/kernel/sched/relationship.c @@ -0,0 +1,379 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Common code for task relationship aware + * + * Copyright (C) 2023-2024 Huawei Technologies Co., Ltd + * + * Author: Hui Tang tanghui20@huawei.com + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ +#include <linux/bpf_sched.h> +#include <linux/sort.h> + +#include "sched.h" + +#define RXTX_BYTES_PERIOD_MS (1000) +#define RXTX_BYTES_DECAY_RATIO (2) + +DEFINE_STATIC_KEY_FALSE(__relationship_switch); + +void task_relationship_enable(void) +{ + static_branch_enable(&__relationship_switch); +} + +void task_relationship_disable(void) +{ + static_branch_disable(&__relationship_switch); +} + +bool task_relationship_supported(struct task_struct *tsk) +{ + if (!task_relationship_used()) + return false; + + if (!tsk->rship || !tsk->mm || + !cpumask_subset(cpu_online_mask, tsk->cpus_ptr) || + !nodes_subset(node_online_map, tsk->mems_allowed) || + get_task_policy(tsk)->mode == MPOL_BIND || + get_task_policy(tsk)->mode == MPOL_INTERLEAVE) + return false; + + return true; +} + +static inline int get_net_group(struct net_group *grp) +{ + return refcount_inc_not_zero(&grp->hdr.refcount); +} + +static inline void put_net_group(struct net_group *grp) +{ + if (refcount_dec_and_test(&grp->hdr.refcount)) + kfree_rcu(grp, rcu); +} + +static inline void put_task_net_group(struct task_struct *tsk, bool reset) +{ + struct net_group *grp; + unsigned long flags; + + spin_lock_irqsave(&tsk->rship->net_lock, flags); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes -= tsk->rship->rxtx_bytes; + grp->hdr.nr_tasks--; + spin_unlock(&grp->hdr.lock); + put_net_group(grp); + RCU_INIT_POINTER(tsk->rship->net_group, NULL); + } + + if (reset) { + tsk->rship->rxtx_bytes = 0; + tsk->rship->rxtx_remote_bytes = 0; + tsk->rship->rx_dev_idx = -1; + tsk->rship->rx_dev_queue_idx = -1; + tsk->rship->nic_nid = -1; + tsk->rship->rx_dev_netns_cookie = 0; + } + + spin_unlock_irqrestore(&tsk->rship->net_lock, flags); +} + +static inline int remote_rxtx_process(struct net_relationship_req *req) +{ + struct task_relationship *rship; + struct task_struct *tsk; + unsigned long flags; + pid_t pid; + long diff; + + rcu_read_lock(); + + pid = req->net_rship_type == NET_RS_TYPE_RX ? 
req->rx_pid : req->tx_pid; + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + if (!tsk || !task_relationship_supported(tsk)) + goto out_unlock; + + rship = tsk->rship; + if (time_after(jiffies, rship->rxtx_remote_update_next)) { + diff = rship->rxtx_remote_buffer - rship->rxtx_remote_bytes / 2; + + spin_lock_irqsave(&rship->net_lock, flags); + rship->nic_nid = req->nic_nid; + if (req->net_rship_type == NET_RS_TYPE_RX) { + rship->rx_dev_idx = req->rx_dev_idx; + rship->rx_dev_queue_idx = req->rx_dev_queue_idx; + rship->rx_dev_netns_cookie = req->rx_dev_netns_cookie; + } + rship->rxtx_remote_bytes += diff; + rship->rxtx_remote_buffer = 0; + spin_unlock_irqrestore(&rship->net_lock, flags); + } + + rship->rxtx_remote_buffer += req->rxtx_bytes; + +out_unlock: + rcu_read_unlock(); + + return 0; +} + +int sched_net_relationship_submit(struct net_relationship_req *req) +{ + struct task_struct *rx_tsk, *tx_tsk, *dst_tsk; + struct net_group *rx_grp, *tx_grp; + int ret; + + if (req->net_rship_type == NET_RS_TYPE_RX || + req->net_rship_type == NET_RS_TYPE_TX) + return remote_rxtx_process(req); + + rcu_read_lock(); + + rx_tsk = find_task_by_pid_ns(req->rx_pid, &init_pid_ns); + tx_tsk = find_task_by_pid_ns(req->tx_pid, &init_pid_ns); + if (!rx_tsk || !tx_tsk) { + ret = -ESRCH; + goto out_unlock; + } + + if (!task_relationship_supported(rx_tsk) || + !task_relationship_supported(tx_tsk)) { + ret = -EPERM; + goto out_unlock; + } + + if (atomic_read(&rx_tsk->rship->cb.active) && + atomic_read(&tx_tsk->rship->cb.active)) { + ret = -EBUSY; + goto out_unlock; + } + + rx_grp = rcu_dereference(rx_tsk->rship->net_group); + tx_grp = rcu_dereference(tx_tsk->rship->net_group); + if (rx_grp && tx_grp) { + dst_tsk = rx_grp->hdr.nr_tasks >= tx_grp->hdr.nr_tasks ? + rx_tsk : tx_tsk; + } else if (rx_grp) { + dst_tsk = rx_tsk; + } else if (tx_grp) { + dst_tsk = tx_tsk; + } else { + dst_tsk = !atomic_read(&rx_tsk->rship->cb.active) ? + rx_tsk : tx_tsk; + } + + if (atomic_cmpxchg(&dst_tsk->rship->cb.active, 0, 1)) { + ret = -EBUSY; + goto out_unlock; + } + + memcpy(&dst_tsk->rship->cb.req, req, sizeof(*req)); + dst_tsk->rship->cb.src_pid = dst_tsk == rx_tsk ? 
+ req->tx_pid : req->rx_pid; + task_work_add(dst_tsk, &dst_tsk->rship->cb.twork, TWA_RESUME); + ret = 0; + +out_unlock: + rcu_read_unlock(); + return ret; +} + +static void task_net_group(struct task_struct *curr, struct task_struct *src) +{ + struct net_group *src_grp, *curr_grp, *grp; + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + src_grp = rcu_dereference_protected(src->rship->net_group, + lockdep_is_held(&src->rship->net_lock)); + + if (!curr_grp) { + grp = kzalloc(sizeof(*grp), GFP_ATOMIC | __GFP_NOWARN); + if (!grp) + goto out_unlock; + + refcount_set(&grp->hdr.refcount, 1); + spin_lock_init(&grp->hdr.lock); + grp->hdr.gid = curr->pid; + grp->hdr.preferred_nid = NODE_MASK_NONE; + node_set(task_node(curr), grp->hdr.preferred_nid); + grp->hdr.nr_tasks = 1; + rcu_assign_pointer(curr->rship->net_group, grp); + curr_grp = rcu_dereference_protected(curr->rship->net_group, + lockdep_is_held(&curr->rship->net_lock)); + } + + if (curr_grp == src_grp) + goto out_unlock; + + if (!get_net_group(curr_grp)) + goto out_unlock; + + spin_lock(&curr_grp->hdr.lock); + curr_grp->hdr.nr_tasks++; + curr_grp->rxtx_bytes += src->rship->rxtx_bytes; + spin_unlock(&curr_grp->hdr.lock); + + if (src_grp) { + spin_lock(&src_grp->hdr.lock); + src_grp->hdr.nr_tasks--; + src_grp->rxtx_bytes -= src->rship->rxtx_bytes; + spin_unlock(&src_grp->hdr.lock); + put_net_group(src_grp); + } + + rcu_assign_pointer(src->rship->net_group, curr_grp); +out_unlock: + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); +} + +static void task_rxtx_data_update(struct task_struct *tsk) +{ + struct net_group *grp; + long bytes_diff; + + spin_lock_irq(&tsk->rship->net_lock); + bytes_diff = tsk->rship->rxtx_buffer - + tsk->rship->rxtx_bytes / RXTX_BYTES_DECAY_RATIO; + tsk->rship->rxtx_bytes += bytes_diff; + tsk->rship->rxtx_buffer = 0; + tsk->rship->rxtx_update_next = jiffies + + msecs_to_jiffies(RXTX_BYTES_PERIOD_MS); + + grp = rcu_dereference_protected(tsk->rship->net_group, + lockdep_is_held(&tsk->rship->net_lock)); + if (grp) { + spin_lock(&grp->hdr.lock); + grp->rxtx_bytes += bytes_diff; + spin_unlock(&grp->hdr.lock); + } + + spin_unlock_irq(&tsk->rship->net_lock); +} + +static void task_net_relationship_work(struct callback_head *work) +{ + struct net_relationship_callback *ncb; + struct task_struct *curr = current; + struct net_relationship_req req; + struct task_struct *src; + + ncb = container_of(work, struct net_relationship_callback, twork); + req = ncb->req; + atomic_set(&ncb->active, 0); + + rcu_read_lock(); + src = find_task_by_pid_ns(ncb->src_pid, &init_pid_ns); + if (!src) { + rcu_read_unlock(); + return; + } + + if (!task_relationship_supported(src) || + !task_relationship_supported(curr)) { + rcu_read_unlock(); + return; + } + + /* prevent src going away */ + get_task_struct(src); + + rcu_read_unlock(); + + /* build net relationship */ + task_net_group(src, curr); + + if (time_after(jiffies, curr->rship->rxtx_update_next)) + task_rxtx_data_update(curr); + + if (time_after(jiffies, src->rship->rxtx_update_next)) + task_rxtx_data_update(src); + + double_lock_irq(&src->rship->net_lock, &curr->rship->net_lock); + curr->rship->rxtx_buffer += req.rxtx_bytes; + src->rship->rxtx_buffer += req.rxtx_bytes; + spin_unlock(&src->rship->net_lock); + spin_unlock_irq(&curr->rship->net_lock); + + put_task_struct(src); +} + +static int cmp_fault_stats(const void *a, const void *b) +{ 
+ return ((struct fault_array_info *)b)->val - + ((struct fault_array_info *)a)->val; +} + +void numa_faults_update_and_sort(int nid, int new, + struct fault_array_info *stats) +{ + int nodes, i; + + if (!task_relationship_used()) + return; + + if (nid == first_online_node) { + for (i = 0; i < FAULT_NODES_MAX; i++) { + stats[i].nid = -1; + stats[i].val = 0; + } + } + + nodes = min(FAULT_NODES_MAX, num_online_nodes()); + if (new <= stats[nodes - 1].val) + return; + + stats[nodes - 1].nid = nid; + stats[nodes - 1].val = new; + sort(stats, nodes, sizeof(stats[0]), cmp_fault_stats, NULL); +} + +void task_relationship_free(struct task_struct *tsk, bool reset) +{ + if (!task_relationship_used()) + return; + + put_task_net_group(tsk, reset); +} + +int sched_relationship_fork(struct task_struct *p) +{ + int i; + + p->rship = kzalloc(sizeof(struct task_relationship), GFP_KERNEL); + if (!p->rship) + return -ENOMEM; + + for (i = 0; i < FAULT_NODES_MAX; i++) + p->rship->faults.faults_ordered[i].nid = -1; + + p->rship->nic_nid = -1; + p->rship->rx_dev_idx = -1; + p->rship->rx_dev_queue_idx = -1; + + spin_lock_init(&p->rship->net_lock); + init_task_work(&p->rship->cb.twork, task_net_relationship_work); + return 0; +} + +void sched_relationship_free(struct task_struct *p) +{ + kfree(p->rship); + p->rship = NULL; +}