hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Feed the page access data obtained from mem_sampling to NUMA balancing as hint-fault equivalents. The existing per-task and per-group fault statistics are now built from the page access information provided by mem_sampling, so scanning the address space to introduce NUMA hinting faults is no longer necessary.
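In other words, each sampled access is resolved to its backing page and accounted through the same task_numa_fault() statistics that a hinting fault would feed. A minimal sketch of that idea (the helper name account_sampled_access() is illustrative; the real code added below in mm/mem_sampling.c also handles VMA checks, locking and migration):

/* Illustrative only: account one sampled access as one hint fault. */
static void account_sampled_access(u64 paddr)
{
        struct page *page = pfn_to_online_page(PHYS_PFN(paddr));

        if (!page)
                return;

        /* one sampled access counts as one fault on the node backing the page */
        task_numa_fault(page_cpupid_last(page), page_to_nid(page), 1, 0);
}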
NUMA balancing registers a callback with mem_sampling to subscribe to the page access information. The records are then processed through the task_work framework, where NUMA balancing applies its migration policy.
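A condensed sketch of that flow, with names shortened for illustration (the full version, including the CPU check and kernel-address filtering, is in mm/mem_sampling.c below): the callback runs in the record-decode path, so it only captures the addresses and defers the heavier work to task_work, which runs on return to user space where mmap_lock can be taken.

struct numa_access_work {
        struct callback_head work;
        u64 laddr, paddr;
};

static void numa_access_task_work(struct callback_head *work)
{
        struct numa_access_work *iwork =
                container_of(work, struct numa_access_work, work);

        do_numa_access(current, iwork->laddr, iwork->paddr);
        kfree(iwork);
}

static void numa_balancing_record_cb(struct mem_sampling_record *record)
{
        struct numa_access_work *iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);

        if (!iwork)
                return;

        iwork->laddr = record->virt_addr;
        iwork->paddr = record->phys_addr;
        init_task_work(&iwork->work, numa_access_task_work);
        task_work_add(current, &iwork->work, TWA_RESUME);
}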
The NUMA balancing sampling policy can be switched dynamically between the original task_tick_numa() scanning and mem_sampling records by writing to a sysctl interface. A similar effect can be achieved by enabling or disabling CONFIG_NUMABALANCING_MEM_SAMPLING.
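The runtime switch is gated by the sched_numabalancing_mem_sampling static key checked in task_tick_numa() below. The sysctl handler itself is not part of this hunk; as a rough sketch, it would flip the key along these lines (the helper name is assumed, not taken from the patch):

static void __set_numabalancing_mem_sampling_state(bool enabled)
{
        /* stop address-space scanning and use sampled accesses instead */
        if (enabled)
                static_branch_enable(&sched_numabalancing_mem_sampling);
        else
                static_branch_disable(&sched_numabalancing_mem_sampling);
}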
Note that THP migration is not supported for now.
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
 include/linux/mem_sampling.h |   5 +
 kernel/sched/fair.c          |  11 +++
 kernel/sched/sched.h         |   1 +
 mm/Kconfig                   |  13 +++
 mm/mem_sampling.c            | 181 +++++++++++++++++++++++++++++++++++
 5 files changed, 211 insertions(+)
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
index 282f3f7d415b..c7f99c1305d8 100644
--- a/include/linux/mem_sampling.h
+++ b/include/linux/mem_sampling.h
@@ -95,4 +95,9 @@ static inline int arm_spe_enabled(void)
         return 0;
 }
 #endif /* CONFIG_ARM_SPE */
+
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+void numa_balancing_mem_sampling_cb_register(void);
+void numa_balancing_mem_sampling_cb_unregister(void);
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
 #endif /* __MEM_SAMPLING_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7289ae80c936..d3482a532bd9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2967,6 +2967,17 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
         struct callback_head *work = &curr->numa_work;
         u64 period, now;
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+        /*
+         * If we are using access hints from hardware (like using
+         * SPE), don't scan the address space.
+         * Note that currently PMD-level page migration is not
+         * supported.
+         */
+        if (static_branch_unlikely(&sched_numabalancing_mem_sampling))
+                return;
+#endif
+
         /*
          * We don't care about NUMA placement if we don't have memory.
          */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3b2fc472908a..e8dbf7785437 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2121,6 +2121,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
 
 extern struct static_key_false sched_numa_balancing;
 extern struct static_key_false sched_schedstats;
+extern struct static_key_false sched_numabalancing_mem_sampling;
 
 static inline u64 global_rt_period(void)
 {
diff --git a/mm/Kconfig b/mm/Kconfig
index c339a8ec5d13..8d5eb35c146f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1018,6 +1018,19 @@ config MEM_SAMPLING
           features.. It requires at least one hardware pmu (e.g. ARM_SPE)
           to be enabled.
 
+config NUMABALANCING_MEM_SAMPLING
+        bool "Use hardware memory samples for NUMA balancing"
+        depends on MEM_SAMPLING && NUMA_BALANCING
+        default n
+        help
+          This feature relies on hardware sampling and uses the memory access
+          information obtained from it in the NUMA balancing policy, instead
+          of the native software PROT_NONE scheme. Turning on this feature
+          may have a performance impact on some workloads, for example
+          lightweight memory access programs.
+
+          If unsure, say N.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
index ad2cd61768c6..8423b1c1fa2c 100644
--- a/mm/mem_sampling.c
+++ b/mm/mem_sampling.c
@@ -18,6 +18,10 @@
 #include <linux/list.h>
 #include <linux/mm.h>
 #include <linux/mem_sampling.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
+#include <linux/migrate.h>
+#include <linux/sched/numa_balancing.h>
 
 struct mem_sampling_ops_struct mem_sampling_ops;
 
@@ -33,6 +37,15 @@ struct mem_sampling_record_cb_list_entry {
 };
 LIST_HEAD(mem_sampling_record_cb_list);
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+struct mem_sampling_numa_access_work {
+        struct callback_head work;
+        u64 laddr, paddr;
+        /* Debug aid: detect decode on a CPU other than the one that handled the interrupt */
+        int cpu;
+};
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
 {
         struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
@@ -110,6 +123,172 @@ void mem_sampling_process(struct mem_sampling_record *record_base, int nr_record
         mem_sampling_ops.sampling_stop();
 }
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+
+static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                             unsigned long addr, int page_nid,
+                             int *flags)
+{
+        get_page(page);
+
+        count_vm_numa_event(NUMA_HINT_FAULTS);
+        if (page_nid == numa_node_id()) {
+                count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+                *flags |= TNF_FAULT_LOCAL;
+        }
+
+        return mpol_misplaced(page, vma, addr);
+}
+
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by SPE) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+        struct mm_struct *mm = p->mm;
+        struct vm_area_struct *vma;
+        struct page *page = NULL;
+        int page_nid = NUMA_NO_NODE;
+        int last_cpupid;
+        int target_nid;
+        int flags = 0;
+
+        if (!mm)
+                return;
+
+        if (!mmap_read_trylock(mm))
+                return;
+
+        vma = find_vma(mm, laddr);
+        if (!vma)
+                goto out_unlock;
+
+        if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+            is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+                goto out_unlock;
+
+        if (!vma->vm_mm ||
+            (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+                goto out_unlock;
+
+        if (!vma_is_accessible(vma))
+                goto out_unlock;
+
+        page = pfn_to_online_page(PHYS_PFN(paddr));
+        if (!page || is_zone_device_page(page))
+                goto out_unlock;
+
+        if (unlikely(!PageLRU(page)))
+                goto out_unlock;
+
+        /* TODO: handle PTE-mapped THP or PMD-mapped THP */
+        if (PageCompound(page))
+                goto out_unlock;
+
+        /*
+         * Flag if the page is shared between multiple address spaces. This
+         * is later used when determining whether to group tasks together
+         */
+        if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+                flags |= TNF_SHARED;
+
+        last_cpupid = page_cpupid_last(page);
+        page_nid = page_to_nid(page);
+
+        target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+        if (target_nid == NUMA_NO_NODE) {
+                put_page(page);
+                goto out;
+        }
+
+        /* Migrate to the requested node */
+        if (migrate_misplaced_page(page, vma, target_nid)) {
+                page_nid = target_nid;
+                flags |= TNF_MIGRATED;
+        } else {
+                flags |= TNF_MIGRATE_FAIL;
+        }
+
+out:
+        if (page_nid != NUMA_NO_NODE)
+                task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+        mmap_read_unlock(mm);
+}
+
+static void task_mem_sampling_access_work(struct callback_head *work)
+{
+        struct mem_sampling_numa_access_work *iwork =
+                container_of(work, struct mem_sampling_numa_access_work, work);
+        struct task_struct *p = current;
+        int cpu = iwork->cpu;
+        u64 laddr = iwork->laddr;
+        u64 paddr = iwork->paddr;
+
+        kfree(iwork);
+        if (cpu != smp_processor_id())
+                return;
+
+        do_numa_access(p, laddr, paddr);
+}
+
+void numa_create_taskwork(u64 laddr, u64 paddr, int cpu)
+{
+        struct mem_sampling_numa_access_work *iwork = NULL;
+
+        iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+        if (!iwork)
+                return;
+
+        iwork->laddr = laddr;
+        iwork->paddr = paddr;
+        iwork->cpu = cpu;
+
+        init_task_work(&iwork->work, task_mem_sampling_access_work);
+        task_work_add(current, &iwork->work, TWA_RESUME);
+}
+
+void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
+{
+        struct task_struct *p = current;
+        u64 laddr = record->virt_addr;
+        u64 paddr = record->phys_addr;
+
+        /* Discard kernel address accesses */
+        if (laddr & (1UL << 63))
+                return;
+
+        if (p->pid != record->context_id)
+                return;
+
+        numa_create_taskwork(laddr, paddr, smp_processor_id());
+}
+
+void numa_balancing_mem_sampling_cb_register(void)
+{
+        mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb);
+}
+
+void numa_balancing_mem_sampling_cb_unregister(void)
+{
+        mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb);
+}
+#else
+static inline void numa_balancing_mem_sampling_cb_register(void)
+{
+}
+
+static inline void numa_balancing_mem_sampling_cb_unregister(void)
+{
+}
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 {
 #ifdef CONFIG_ARM_SPE
@@ -121,6 +300,8 @@ static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
 
 DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints);
 
+DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling);
+
 int sysctl_mem_sampling_mode;
 
 static void __set_mem_sampling_state(bool enabled)