hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Feed the page access data obtained from mem_sampling to NUMA balancing as hint-fault equivalents. The existing per-task and per-group fault statistics are now built from the page access information provided by mem_sampling, so scanning the address space to introduce NUMA hinting faults is no longer necessary.
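In other words, each sampled access is resolved to its backing page and accounted through the same task_numa_fault() statistics that a hinting fault would feed. A minimal sketch of that idea (the helper name account_sampled_access() is illustrative; the real code added below in mm/mem_sampling.c also handles VMA checks, locking and migration):

/* Illustrative only: account one sampled access as one hint fault. */
static void account_sampled_access(u64 paddr)
{
        struct page *page = pfn_to_online_page(PHYS_PFN(paddr));

        if (!page)
                return;

        /* one sampled access counts as one fault on the node backing the page */
        task_numa_fault(page_cpupid_last(page), page_to_nid(page), 1, 0);
}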
NUMA balancing registers a callback with mem_sampling to subscribe to the page access information. The records are then processed through the task_work framework, where NUMA balancing applies its migration policy.
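A condensed sketch of that flow, with names shortened for illustration (the full version, including the CPU check and kernel-address filtering, is in mm/mem_sampling.c below): the callback runs in the record-decode path, so it only captures the addresses and defers the heavier work to task_work, which runs on return to user space where mmap_lock can be taken.

struct numa_access_work {
        struct callback_head work;
        u64 laddr, paddr;
};

static void numa_access_task_work(struct callback_head *work)
{
        struct numa_access_work *iwork =
                container_of(work, struct numa_access_work, work);

        do_numa_access(current, iwork->laddr, iwork->paddr);
        kfree(iwork);
}

static void numa_balancing_record_cb(struct mem_sampling_record *record)
{
        struct numa_access_work *iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);

        if (!iwork)
                return;

        iwork->laddr = record->virt_addr;
        iwork->paddr = record->phys_addr;
        init_task_work(&iwork->work, numa_access_task_work);
        task_work_add(current, &iwork->work, TWA_RESUME);
}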
The NUMA balancing sampling policy can be switched dynamically between the original task_tick_numa() scanning and mem_sampling records by writing to a sysctl interface. A similar effect can be achieved by enabling or disabling CONFIG_NUMABALANCING_MEM_SAMPLING.
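The runtime switch is gated by the sched_numabalancing_mem_sampling static key checked in task_tick_numa() below. The sysctl handler itself is not part of this hunk; as a rough sketch, it would flip the key along these lines (the helper name is assumed, not taken from the patch):

static void __set_numabalancing_mem_sampling_state(bool enabled)
{
        /* stop address-space scanning and use sampled accesses instead */
        if (enabled)
                static_branch_enable(&sched_numabalancing_mem_sampling);
        else
                static_branch_disable(&sched_numabalancing_mem_sampling);
}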
Note that THP migration is not supported for now.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
 include/linux/mem_sampling.h |   5 +
 kernel/sched/fair.c          |  11 +++
 kernel/sched/sched.h         |   1 +
 mm/Kconfig                   |  13 +++
 mm/mem_sampling.c            | 181 +++++++++++++++++++++++++++++++++++
 5 files changed, 211 insertions(+)
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
index 282f3f7d415b..c7f99c1305d8 100644
--- a/include/linux/mem_sampling.h
+++ b/include/linux/mem_sampling.h
@@ -95,4 +95,9 @@ static inline int arm_spe_enabled(void)
         return 0;
 }
 #endif /* CONFIG_ARM_SPE */
+
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+void numa_balancing_mem_sampling_cb_register(void);
+void numa_balancing_mem_sampling_cb_unregister(void);
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
 #endif /* __MEM_SAMPLING_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7289ae80c936..d3482a532bd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2967,6 +2967,17 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
         struct callback_head *work = &curr->numa_work;
         u64 period, now;
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+        /*
+         * If we are using access hints from hardware (like using
+         * SPE), don't scan the address space.
+         * Note that currently PMD-level page migration is not
+         * supported.
+         */
+        if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
+                return;
+#endif
+
         /*
          * We don't care about NUMA placement if we don't have memory.
          */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b2fc472908a..e8dbf7785437 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2121,6 +2121,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
+extern struct static_key_false sched_numabalancing_mem_sampling;
 
 static inline u64 global_rt_period(void)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index c339a8ec5d13..8d5eb35c146f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1018,6 +1018,19 @@ config MEM_SAMPLING
           features.. It requires at least one hardware pmu (e.g. ARM_SPE)
           to be enabled.
 
+config NUMABALANCING_MEM_SAMPLING
+        bool "Use hardware memory samples for NUMA balancing"
+        depends on MEM_SAMPLING && NUMA_BALANCING
+        default n
+        help
+          This feature relies on hardware sampling and uses the memory access
+          information obtained from it in the NUMA balancing policy, instead
+          of the native software PROT_NONE scheme. Turning on this feature
+          may have a performance impact on some workloads, for example
+          lightweight memory access programs.
+
+          If unsure, say N.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index ad2cd61768c6..8423b1c1fa2c 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -18,6 +18,10 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mem_sampling.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
+#include <linux/migrate.h>
+#include <linux/sched/numa_balancing.h>
 
 struct mem_sampling_ops_struct mem_sampling_ops;
 
@@ -33,6 +37,15 @@ struct mem_sampling_record_cb_list_entry {
 };
 LIST_HEAD(mem_sampling_record_cb_list);
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+struct mem_sampling_numa_access_work {
+        struct callback_head work;
+        u64 laddr, paddr;
+        /* Debug aid: detect decode on a CPU other than the one that handled the interrupt */
+        int cpu;
+};
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
 {
         struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
@@ -110,6 +123,172 @@ void mem_sampling_process(struct mem_sampling_record *record_base, int nr_record
         mem_sampling_ops.sampling_stop();
 }
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                             unsigned long addr, int page_nid,
+                             int *flags)
+{
+        get_page(page);
+
+        count_vm_numa_event(NUMA_HINT_FAULTS);
+        if (page_nid == numa_node_id()) {
+                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+                *flags |= TNF_FAULT_LOCAL;
+        }
+
+        return mpol_misplaced(page, vma, addr);
+}
+
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by SPE) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+        struct mm_struct *mm = p->mm;
+        struct vm_area_struct *vma;
+        struct page *page = NULL;
+        int page_nid = NUMA_NO_NODE;
+        int last_cpupid;
+        int target_nid;
+        int flags = 0;
+
+        if (!mm)
+                return;
+
+        if (!mmap_read_trylock(mm))
+                return;
+
+        vma = find_vma(mm, laddr);
+        if (!vma)
+                goto out_unlock;
+
+        if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+            is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+                goto out_unlock;
+
+        if (!vma->vm_mm ||
+            (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+                goto out_unlock;
+
+        if (!vma_is_accessible(vma))
+                goto out_unlock;
+
+        page = pfn_to_online_page(PHYS_PFN(paddr));
+        if (!page || is_zone_device_page(page))
+                goto out_unlock;
+
+        if (unlikely(!PageLRU(page)))
+                goto out_unlock;
+
+        /* TODO: handle PTE-mapped THP or PMD-mapped THP */
+        if (PageCompound(page))
+                goto out_unlock;
+
+        /*
+         * Flag if the page is shared between multiple address spaces. This
+         * is later used when determining whether to group tasks together
+         */
+        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+                flags |= TNF_SHARED;
+
+        last_cpupid = page_cpupid_last(page);
+        page_nid = page_to_nid(page);
+
+        target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+        if (target_nid == NUMA_NO_NODE) {
+                put_page(page);
+                goto out;
+        }
+
+        /* Migrate to the requested node */
+        if (migrate_misplaced_page(page, vma, target_nid)) {
+                page_nid = target_nid;
+                flags |= TNF_MIGRATED;
+        } else {
+                flags |= TNF_MIGRATE_FAIL;
+        }
+
+out:
+        if (page_nid != NUMA_NO_NODE)
+                task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+        mmap_read_unlock(mm);
+}
+
+static void task_mem_sampling_access_work(struct callback_head *work)
+{
+        struct mem_sampling_numa_access_work *iwork =
+                container_of(work, struct mem_sampling_numa_access_work, work);
+        struct task_struct *p = current;
+        int cpu = iwork->cpu;
+        u64 laddr = iwork->laddr;
+        u64 paddr = iwork->paddr;
+
+        kfree(iwork);
+        if (cpu != smp_processor_id())
+                return;
+
+        do_numa_access(p, laddr, paddr);
+}
+
+void numa_create_taskwork(u64 laddr, u64 paddr, int cpu)
+{
+        struct mem_sampling_numa_access_work *iwork = NULL;
+
+        iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+        if (!iwork)
+                return;
+
+        iwork->laddr = laddr;
+        iwork->paddr = paddr;
+        iwork->cpu = cpu;
+
+        init_task_work(&iwork->work, task_mem_sampling_access_work);
+        task_work_add(current, &iwork->work, TWA_RESUME);
+}
+
+void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
+{
+        struct task_struct *p = current;
+        u64 laddr = record->virt_addr;
+        u64 paddr = record->phys_addr;
+
+        /* Discard kernel address accesses */
+        if (laddr & (1UL << 63))
+                return;
+
+        if (p->pid != record->context_id)
+                return;
+
+        numa_create_taskwork(laddr, paddr, smp_processor_id());
+}
+
+void numa_balancing_mem_sampling_cb_register(void)
+{
+        mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb);
+}
+
+void numa_balancing_mem_sampling_cb_unregister(void)
+{
+        mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb);
+}
+#else
+static inline void numa_balancing_mem_sampling_cb_register(void)
+{
+}
+
+static inline void numa_balancing_mem_sampling_cb_unregister(void)
+{
+}
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 {
 #ifdef CONFIG_ARM_SPE
@@ -121,6 +300,8 @@ static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 
 DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints);
 
+DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);
+
 int sysctl_mem_sampling_mode;
 
 static void __set_mem_sampling_state(bool enabled)