hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add a mem_sampling abstract layer to provide hardware memory-access
sampling for kernel features such as NUMA balancing and DAMON.

The mem_sampling layer provides an interface to start hardware PMU
sampling on the current CPU, and callback registration so that
subscribers can receive the sampled access information (e.g. NUMA
balancing in subsequent patches). Internally, mem_sampling registers a
callback with the specific PMU driver, which forwards captured records
to the higher levels through the registered callbacks. Sampling actions
themselves remain managed by the hardware PMU layer.
CONFIG_MEM_SAMPLING is added to enable the mem_sampling layer.

For now, mem_sampling supports only the SPE driver. Support for new
hardware PMUs can be added to mem_sampling without any change to
higher-level kernel feature code.
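As an illustration, a consumer of the abstract layer would subscribe to
sampled records roughly as follows (a minimal sketch; the demo_* names
are hypothetical and not part of this patch):

	#include <linux/init.h>
	#include <linux/printk.h>
	#include <linux/mem_sampling.h>

	/* Invoked by the mem_sampling layer once per captured record. */
	static void demo_record_cb(struct mem_sampling_record *record)
	{
		if (record->op & MEM_SAMPLING_LD)
			pr_debug("load: va=0x%llx latency=%u\n",
				 record->virt_addr, record->latency);
	}

	static int __init demo_init(void)
	{
		/* Duplicate registrations of the same callback are ignored. */
		mem_sampling_record_cb_register(demo_record_cb);
		return 0;
	}
	late_initcall(demo_init);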
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
 drivers/arm/spe/spe.c        |  21 +++++-
 drivers/arm/spe/spe.h        |   2 +-
 include/linux/mem_sampling.h |  98 +++++++++++++++++++++++++++
 mm/Kconfig                   |  11 +++
 mm/Makefile                  |   1 +
 mm/mem_sampling.c            | 126 +++++++++++++++++++++++++++++++++++
 6 files changed, 256 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mem_sampling.h
 create mode 100644 mm/mem_sampling.c
diff --git a/drivers/arm/spe/spe.c b/drivers/arm/spe/spe.c
index 26abfabf90ba..88d7cfbb6633 100644
--- a/drivers/arm/spe/spe.c
+++ b/drivers/arm/spe/spe.c
@@ -13,6 +13,7 @@
 #include <linux/of_irq.h>
 #include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
+#include <linux/mem_sampling.h>
 
 #include "spe-decoder/arm-spe-decoder.h"
 #include "spe-decoder/arm-spe-pkt-decoder.h"
@@ -31,6 +32,12 @@ static enum cpuhp_state arm_spe_online;
 
 DEFINE_PER_CPU(struct arm_spe_buf, per_cpu_spe_buf);
 
+mem_sampling_cb_type arm_spe_sampling_cb;
+void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb)
+{
+	arm_spe_sampling_cb = cb;
+}
+
 static inline int arm_spe_per_buffer_alloc(int cpu)
 {
 	struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu);
@@ -376,6 +383,16 @@ static irqreturn_t arm_spe_irq_handler(int irq, void *dev)
 	case SPE_PMU_BUF_FAULT_ACT_OK:
 		spe_buf->nr_records = 0;
 		arm_spe_decode_buf(spe_buf->cur, spe_buf->size);
+
+		/*
+		 * Callback function processing record data.
+		 * Call one: arm_spe_sampling_cb - mem_sampling layer.
+		 * TODO: use per CPU workqueue to process data and reduce
+		 * interrupt processing time
+		 */
+		if (arm_spe_sampling_cb)
+			arm_spe_sampling_cb((struct mem_sampling_record *)spe_buf->record_base,
+					    spe_buf->nr_records);
 		break;
 
 	case SPE_PMU_BUF_FAULT_ACT_SPURIOUS:
@@ -663,7 +680,7 @@ static void arm_spe_sample_para_init(void)
 void arm_spe_record_enqueue(struct arm_spe_record *record)
 {
 	struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf);
-	struct arm_spe_record *record_tail;
+	struct mem_sampling_record *record_tail;
 
 	if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) {
 		pr_err("nr_records exceeded!\n");
@@ -672,7 +689,7 @@ void arm_spe_record_enqueue(struct arm_spe_record *record)
 
 	record_tail = spe_buf->record_base +
 		      spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE;
-	*record_tail = *(struct arm_spe_record *)record;
+	*record_tail = *(struct mem_sampling_record *)record;
 	spe_buf->nr_records++;
 }
 
diff --git a/drivers/arm/spe/spe.h b/drivers/arm/spe/spe.h
index 4348d15581dd..865888b8260f 100644
--- a/drivers/arm/spe/spe.h
+++ b/drivers/arm/spe/spe.h
@@ -14,7 +14,7 @@
 #define SPE_SAMPLE_PERIOD 1024
 
 #define SPE_RECORD_BUFFER_MAX_RECORDS (100)
-#define SPE_RECORD_ENTRY_SIZE sizeof(struct arm_spe_record)
+#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record)
 
 #define SPE_PMU_FEAT_FILT_EVT (1UL << 0)
 #define SPE_PMU_FEAT_FILT_TYP (1UL << 1)
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 000000000000..282f3f7d415b
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to mem_sampling_record, and then
+ * mem_sampling_record_captured_cb_type invokes the callbacks to
+ * pass the record.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS = 1 << 0,
+	MEM_SAMPLING_L1D_MISS = 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS = 1 << 2,
+	MEM_SAMPLING_LLC_MISS = 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS = 1 << 4,
+	MEM_SAMPLING_TLB_MISS = 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS = 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS = 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD = 1 << 0,
+	MEM_SAMPLING_ST = 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type type;
+	int err;
+	u32 op;
+	u32 latency;
+	u64 from_ip;
+	u64 to_ip;
+	u64 timestamp;
+	u64 virt_addr;
+	u64 phys_addr;
+	u64 context_id;
+	u16 source;
+};
+
+/*
+ * Callbacks should be registered using mem_sampling_record_cb_register()
+ * by NUMA, DAMON, etc. during their initialisation.
+ * Callbacks will be invoked when new hardware pmu records are captured.
+ */
+typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record);
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb);
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb);
+
+#ifdef CONFIG_MEM_SAMPLING
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr);
+#else
+static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { };
+#endif
+
+/* invoked by specific mem_sampling */
+typedef void (*mem_sampling_cb_type)(struct mem_sampling_record *record_base,
+				     int n_records);
+
+
+struct mem_sampling_ops_struct {
+	int (*sampling_start)(void);
+	void (*sampling_stop)(void);
+	void (*sampling_continue)(void);
+};
+extern struct mem_sampling_ops_struct mem_sampling_ops;
+
+enum mem_sampling_type_enum {
+	MEM_SAMPLING_ARM_SPE,
+	MEM_SAMPLING_UNSUPPORTED
+};
+
+#ifdef CONFIG_ARM_SPE
+int arm_spe_start(void);
+void arm_spe_stop(void);
+void arm_spe_continue(void);
+int arm_spe_enabled(void);
+void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb);
+#else
+static inline void arm_spe_stop(void) { };
+static inline void arm_spe_continue(void) { };
+static inline void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { };
+
+static inline int arm_spe_start(void)
+{
+	return 0;
+}
+
+static inline int arm_spe_enabled(void)
+{
+	return 0;
+}
+#endif /* CONFIG_ARM_SPE */
+#endif /* __MEM_SAMPLING_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 70c85533aada..381d440f85eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1008,6 +1008,17 @@ config EXTEND_HUGEPAGE_MAPPING
 	help
 	  Introduce vmalloc/vmap/remap interfaces that handle only hugepages.
 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	depends on ARM64
+	select ARM_SPE if ARM64
+	help
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory access for use by kernel
+	  features. It requires at least one hardware pmu (e.g. ARM_SPE) to
+	  be enabled.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index a014a5e08f7b..112966190c1d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
new file mode 100644
index 000000000000..df8e8823e211
--- /dev/null
+++ b/mm/mem_sampling.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mem_sampling.c: implement the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to mem_sampling_record, and then
+ * mem_sampling_record_captured_cb_type invokes the callbacks to
+ * pass the record.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+
+#define pr_fmt(fmt) "mem_sampling: " fmt
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mem_sampling.h>
+
+struct mem_sampling_ops_struct mem_sampling_ops;
+
+struct mem_sampling_record_cb_list_entry {
+	struct list_head list;
+	mem_sampling_record_cb_type cb;
+};
+LIST_HEAD(mem_sampling_record_cb_list);
+
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			pr_info("mem_sampling record cb already registered\n");
+			return;
+		}
+	}
+
+	cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL);
+	if (!cb_entry) {
+		pr_info("mem_sampling record cb entry alloc memory failed\n");
+		return;
+	}
+
+	cb_entry->cb = cb;
+	list_add(&(cb_entry->list), &mem_sampling_record_cb_list);
+}
+
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			list_del(&cb_entry->list);
+			kfree(cb_entry);
+			return;
+		}
+	}
+}
+
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	if (!mem_sampling_ops.sampling_start)
+		return;
+
+	if (curr->mm)
+		mem_sampling_ops.sampling_start();
+	else
+		mem_sampling_ops.sampling_stop();
+}
+
+static void mem_sampling_process(struct mem_sampling_record *record_base, int nr_records)
+{
+	int i;
+	struct mem_sampling_record *record;
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	if (list_empty(&mem_sampling_record_cb_list))
+		goto out;
+
+	for (i = 0; i < nr_records; i++) {
+		record = record_base + i;
+		list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+			cb_entry->cb(record);
+		}
+	}
+out:
+	mem_sampling_ops.sampling_continue();
+}
+
+static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
+{
+#ifdef CONFIG_ARM_SPE
+	return MEM_SAMPLING_ARM_SPE;
+#else
+	return MEM_SAMPLING_UNSUPPORTED;
+#endif
+}
+
+static int __init mem_sampling_init(void)
+{
+	enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type();
+
+	switch (mem_sampling_type) {
+	case MEM_SAMPLING_ARM_SPE:
+		if (!arm_spe_enabled())
+			return -ENODEV;
+		mem_sampling_ops.sampling_start = arm_spe_start;
+		mem_sampling_ops.sampling_stop = arm_spe_stop;
+		mem_sampling_ops.sampling_continue = arm_spe_continue;
+
+		arm_spe_record_capture_callback_register(mem_sampling_process);
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type(%d), disable access hint!\n",
+			mem_sampling_type);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+late_initcall(mem_sampling_init);