[PATCH OLK-5.10 v5 05/11] mm/mem_sampling.c: Drive NUMA balancing via mem_sampling access data

4 Jun 2024

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA

--------------------------------

Feed the page access data obtained from mem_sampling to NUMA
balancing as hint fault equivalents. The existing per-task
and per-group fault stats are now built from page access
information provided by mem_sampling. With this it will not be
necessary to scan the address space to introduce NUMA hinting
faults.

NUMA balancing registers a callback with mem_sampling to subscribe
to the page access information it provides. NUMA balancing then
uses the task_work framework to process the mem_sampling data and
implement the migration policy.

The sampling policy in NUMA balancing can be switched dynamically
between the original task_tick_numa() and mem_sampling records by
writing to a sysctl interface. A similar effect can be achieved by
switching CONFIG_NUMABALANCING_MEM_SAMPLING on and off.

Note that THP migration is not supported for now.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
 include/linux/mem_sampling.h |   2 +
 kernel/sched/fair.c          |  11 +++
 mm/Kconfig                   |  13 +++
 mm/mem_sampling.c            | 176 +++++++++++++++++++++++++++++++++++
 4 files changed, 202 insertions(+)

diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
index 282f3f7d415b..b27b5e1fd96e 100644
--- a/include/linux/mem_sampling.h
+++ b/include/linux/mem_sampling.h
@@ -12,6 +12,8 @@
 #ifndef __MEM_SAMPLING_H
 #define __MEM_SAMPLING_H
 
+DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);
+
 enum mem_sampling_sample_type {
 	MEM_SAMPLING_L1D_ACCESS	= 1 << 0,
 	MEM_SAMPLING_L1D_MISS		= 1 << 1,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7289ae80c936..25d348fa0658 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,6 +30,7 @@
 #endif
 #include <linux/sched/grid_qos.h>
 #include <linux/bpf_sched.h>
+#include <linux/mem_sampling.h>
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -2967,6 +2968,16 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	struct callback_head *work = &curr->numa_work;
 	u64 period, now;
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+	/*
+	 * If we are using access hints from hardware (like using
+	 * SPE), don't scan the address space.
+	 * Note that currently PMD-level page migration is not
+	 * supported.
+	 */
+	if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
+		return;
+#endif
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
diff --git a/mm/Kconfig b/mm/Kconfig
index 381d440f85eb..55e324ce1633 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1019,6 +1019,19 @@ config MEM_SAMPLING
 	  features.. It requires at least one hardware pmu (e.g. ARM_SPE) to
 	  be enabled.
 
+config NUMABALANCING_MEM_SAMPLING
+	bool "Use hardware memory samples for numa balancing"
+	depends on MEM_SAMPLING && NUMA_BALANCING
+	default n
+	help
+	  This feature relies on hardware sampling, and will use memory access
+	  information obtained from hardware sampling in the NUMA balancing
+	  policy instead of the native software PROT_NONE scheme. Turning on
+	  this feature may have a performance impact on some workloads, for
+	  example, lightweight memory access programs.
+
+	  If unsure, say N to disable NUMABALANCING_MEM_SAMPLING.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index c14d5214b747..ef943967210c 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -18,6 +18,10 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mem_sampling.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
+#include <linux/migrate.h>
+#include <linux/sched/numa_balancing.h>
 
 struct mem_sampling_ops_struct mem_sampling_ops;
 
@@ -31,6 +35,15 @@ struct mem_sampling_record_cb_list_entry {
 };
 LIST_HEAD(mem_sampling_record_cb_list);
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+struct mem_sampling_numa_access_work {
+	struct callback_head work;
+	u64 vaddr, paddr;
+	/* CPU that queued this work; the work is dropped if run on another CPU */
+	int cpu;
+};
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
 {
 	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
@@ -103,6 +116,167 @@ static void mem_sampling_process(struct mem_sampling_record *record_base, int nr
 		mem_sampling_ops.sampling_stop();
 }
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+
+/* Pin @page, account the hint fault and return mpol_misplaced()'s verdict. */
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+			     unsigned long addr, int page_nid, int *flags)
+{
+	const bool local = (page_nid == numa_node_id());
+
+	get_page(page);
+	count_vm_numa_event(NUMA_HINT_FAULTS);
+	if (local) {
+		count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+		*flags |= TNF_FAULT_LOCAL;
+	}
+	return mpol_misplaced(page, vma, addr);
+}
+
+/*
+ * Called from task_work context to act upon one sampled page access of
+ * task @p: @vaddr is the sampled virtual address, @paddr the physical one.
+ * The physical address (provided by SPE) is used directly instead of
+ * walking the page tables to get to the PTE/page. Hence PTE writability
+ * is not checked for the TNF_NO_GROUP optimization, which means RO pages
+ * are considered for grouping.
+ */
+static void do_numa_access(struct task_struct *p, u64 vaddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm || !mmap_read_trylock(mm))
+		return;
+
+	/* find_vma() may return a VMA that starts above vaddr (a hole) */
+	vma = find_vma(mm, vaddr);
+	if (!vma || vaddr < vma->vm_start)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm || !vma_is_accessible(vma) ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	/* The page may be freed concurrently; only go on if it can be pinned */
+	if (!get_page_unless_zero(page))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP or PMD-mapped THP */
+	if (unlikely(!PageLRU(page)) || PageCompound(page)) {
+		put_page(page);
+		goto out_unlock;
+	}
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	/* numa_migrate_prep() takes its own reference, so drop our pin */
+	target_nid = numa_migrate_prep(page, vma, vaddr, page_nid, &flags);
+	put_page(page);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+
+/* task_work callback: consume one queued sample and free its work item. */
+static void task_mem_sampling_access_work(struct callback_head *work)
+{
+	struct mem_sampling_numa_access_work *iwork =
+		container_of(work, struct mem_sampling_numa_access_work, work);
+	u64 vaddr = iwork->vaddr;
+	u64 paddr = iwork->paddr;
+	bool same_cpu = (iwork->cpu == smp_processor_id());
+
+	/* Fields were copied above, so the work item can be released now. */
+	kfree(iwork);
+
+	/* Drop the sample if we are no longer on the CPU that queued it. */
+	if (!same_cpu)
+		return;
+
+	do_numa_access(current, vaddr, paddr);
+}
+
+/* Queue task_work on current for one sample; free it if queueing fails. */
+static void numa_create_taskwork(u64 vaddr, u64 paddr, int cpu)
+{
+	struct mem_sampling_numa_access_work *iwork;
+
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		return;
+	iwork->vaddr = vaddr;
+	iwork->paddr = paddr;
+	iwork->cpu = cpu;	/* sampling CPU, as passed by the caller */
+	init_task_work(&iwork->work, task_mem_sampling_access_work);
+	if (task_work_add(current, &iwork->work, TWA_RESUME))
+		kfree(iwork);
+}
+
+/* mem_sampling record callback: queue user-space samples taken by current. */
+static void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
+{
+	u64 vaddr = record->virt_addr;
+
+	/* Skip kernel addresses (bit 63 set) and samples of other tasks. */
+	if (vaddr & (1UL << 63))
+		return;
+	if (record->context_id != current->pid)
+		return;
+
+	numa_create_taskwork(vaddr, record->phys_addr, smp_processor_id());
+}
+
+/* Subscribe the NUMA balancing callback to mem_sampling records. */
+static void numa_balancing_mem_sampling_cb_register(void)
+{
+	mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb);
+}
+
+/* Remove the NUMA balancing callback from mem_sampling. */
+static void numa_balancing_mem_sampling_cb_unregister(void)
+{
+	mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb);
+}
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 {
 #ifdef CONFIG_ARM_SPE
@@ -112,6 +286,8 @@ static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 #endif
 }
 
+DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);
+
 static int sysctl_mem_sampling_mode;
 
 static void __set_mem_sampling_state(bool enabled)
-- 
2.33.0