
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8KS8
CVE: NA

--------------------------------

Add a mem_sampling abstract layer to provide hardware memory access
information to kernel features, e.g. NUMA balancing or DAMON.

The mem_sampling layer provides an interface to start hardware pmu
sampling on the current cpu, and callback registration so that
subscribers can receive the sampled access records (e.g. NUMA
balancing in subsequent patches). Internally, mem_sampling registers
a callback in the specific pmu driver, which forwards the captured
records to the higher level through the registered callbacks.
Sampling actions (start, stop, continue, decoding) are likewise
managed through this layer.

CONFIG_MEM_SAMPLING is added to enable the mem_sampling layer. For
now, mem_sampling supports only the SPE driver. Support for new
hardware pmus can be added to mem_sampling without any change to the
higher-level kernel feature code.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
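As a usage illustration only (not part of this patch): a consumer such
as NUMA balancing would subscribe roughly as sketched below. The
callback and init-hook names are hypothetical; the registration
helpers are the ones added in mm/mem_sampling.c, whose declarations
are presumably exposed to users when the first consumer lands.

  #include <linux/init.h>
  #include <linux/printk.h>
  #include <linux/mem_sampling.h>

  /* Hypothetical consumer: handle one decoded hardware record. */
  static void numa_access_record_cb(struct mem_sampling_record *record)
  {
  	if (record->op & MEM_SAMPLING_LD)
  		pr_debug("load: va=0x%llx pa=0x%llx lat=%u\n",
  			 record->virt_addr, record->phys_addr,
  			 record->latency);
  }

  static int __init numa_access_sampling_init(void)
  {
  	/* Re-registering the same callback is a no-op in the layer. */
  	mem_sampling_record_cb_register(numa_access_record_cb);
  	return 0;
  }
  late_initcall(numa_access_sampling_init);

At runtime, the pmu driver is expected to call mem_sampling_process()
when its buffer fills; that path decodes the raw records via
sampling_decoding(), fans each record out to every registered
callback, and re-arms the hardware via sampling_continue().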
 drivers/arm/mm_monitor/mm_spe.c |   5 +-
 drivers/arm/mm_monitor/mm_spe.h |   2 +-
 include/linux/mem_sampling.h    |  82 +++++++++++++++++++++
 mm/Kconfig                      |  15 ++++
 mm/Makefile                     |   1 +
 mm/mem_sampling.c               | 126 ++++++++++++++++++++++++++++++++
 6 files changed, 228 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mem_sampling.h
 create mode 100644 mm/mem_sampling.c

diff --git a/drivers/arm/mm_monitor/mm_spe.c b/drivers/arm/mm_monitor/mm_spe.c
index f2f2b3320357..cbde84c228a0 100644
--- a/drivers/arm/mm_monitor/mm_spe.c
+++ b/drivers/arm/mm_monitor/mm_spe.c
@@ -11,6 +11,7 @@
 
 #include <linux/of_device.h>
 #include <linux/perf/arm_pmu.h>
+#include <linux/mem_sampling.h>
 
 #include "spe-decoder/arm-spe-decoder.h"
 #include "spe-decoder/arm-spe-pkt-decoder.h"
@@ -377,7 +378,7 @@ static void mm_spe_sample_para_init(void)
 void mm_spe_record_enqueue(struct arm_spe_record *record)
 {
 	struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf);
-	struct arm_spe_record *record_tail;
+	struct mem_sampling_record *record_tail;
 
 	if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) {
 		pr_err("nr_records exceeded!\n");
@@ -386,7 +387,7 @@ void mm_spe_record_enqueue(struct arm_spe_record *record)
 
 	record_tail = spe_buf->record_base +
 		      spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE;
-	*record_tail = *(struct arm_spe_record *)record;
+	*record_tail = *(struct mem_sampling_record *)record;
 
 	spe_buf->nr_records++;
 }
diff --git a/drivers/arm/mm_monitor/mm_spe.h b/drivers/arm/mm_monitor/mm_spe.h
index da134e8794c8..bd0a1574a1b0 100644
--- a/drivers/arm/mm_monitor/mm_spe.h
+++ b/drivers/arm/mm_monitor/mm_spe.h
@@ -9,7 +9,7 @@
 #define SPE_SAMPLE_PERIOD		1024
 
 #define SPE_RECORD_BUFFER_MAX_RECORDS	(100)
-#define SPE_RECORD_ENTRY_SIZE		sizeof(struct arm_spe_record)
+#define SPE_RECORD_ENTRY_SIZE		sizeof(struct mem_sampling_record)
 
 #define ARMV8_SPE_MEM_SAMPLING_PDEV_NAME "arm,mm_spe,spe-v1"
 struct mm_spe {
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 000000000000..3e000a0deced
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to struct mem_sampling_record, and
+ * the registered callbacks (mem_sampling_record_cb_type) are then
+ * invoked to pass each record to its consumers.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS		= 1 << 0,
+	MEM_SAMPLING_L1D_MISS		= 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS		= 1 << 2,
+	MEM_SAMPLING_LLC_MISS		= 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS		= 1 << 4,
+	MEM_SAMPLING_TLB_MISS		= 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS	= 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS	= 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD	= 1 << 0,
+	MEM_SAMPLING_ST	= 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type	type;
+	int				err;
+	u32				op;
+	u32				latency;
+	u64				from_ip;
+	u64				to_ip;
+	u64				timestamp;
+	u64				virt_addr;
+	u64				phys_addr;
+	u64				context_id;
+	u64				boost_spe_addr[8];
+	u64				rem_addr;
+	u16				source;
+};
+
+struct mem_sampling_ops_struct {
+	int (*sampling_start)(void);
+	void (*sampling_stop)(void);
+	void (*sampling_continue)(void);
+	void (*sampling_decoding)(void);
+	struct mm_spe_buf* (*mm_spe_getbuf_addr)(void);
+	int (*mm_spe_getnum_record)(void);
+
+};
+extern struct mem_sampling_ops_struct mem_sampling_ops;
+
+enum mem_sampling_type_enum {
+	MEM_SAMPLING_ARM_SPE,
+	MEM_SAMPLING_UNSUPPORTED
+};
+
+#ifdef CONFIG_ARM_SPE_MEM_SAMPLING
+int mm_spe_start(void);
+void mm_spe_stop(void);
+void mm_spe_continue(void);
+void mm_spe_decoding(void);
+int mm_spe_getnum_record(void);
+struct mm_spe_buf *mm_spe_getbuf_addr(void);
+int mm_spe_enabled(void);
+void arm_spe_set_probe_status(int status);
+#else
+static inline void mm_spe_stop(void) { }
+static inline void mm_spe_continue(void) { }
+static inline void mm_spe_decoding(void) { }
+static inline void arm_spe_set_probe_status(int status) { }
+static inline int mm_spe_start(void) { return 0; }
+static inline int mm_spe_getnum_record(void) { return 0; }
+static inline struct mm_spe_buf *mm_spe_getbuf_addr(void) { return NULL; }
+static inline int mm_spe_enabled(void) { return 0; }
+#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */
+#endif /* __MEM_SAMPLING_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 56171b9dd873..c2b45a71a992 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1452,6 +1452,21 @@ config BPF_READAHEAD
 	  of the kernel is adjusted based on the application read mode to
 	  optimize the read performance in the Spark SQL scenario,
 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	depends on ARM64
+	select ARM_SPE_MEM_SAMPLING if ARM64
+	help
+	  This option enables hardware-based memory sampling for kernel features
+	  such as NUMA balancing and DAMON. If disabled, software-based memory
+	  sampling will be used instead.
+
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory access for use by kernel
+	  features. It requires at least one hardware PMU (e.g. ARM_SPE_MEM_SAMPLING)
+	  to be enabled.
+ source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe..674777b7c99f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_DYNAMIC_POOL) += dynamic_pool.o +obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c new file mode 100644 index 000000000000..551c18452b2e --- /dev/null +++ b/mm/mem_sampling.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mem_sampling.c: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. + */ + +#define pr_fmt(fmt) "mem_sampling: " fmt + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mem_sampling.h> + +struct mem_sampling_ops_struct mem_sampling_ops; + +/* + * Callbacks should be registered using mem_sampling_record_cb_register() + * by NUMA, DAMON and etc during their initialisation. + * Callbacks will be invoked on new hardware pmu records caputured. + */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); + +struct mem_sampling_record_cb_list_entry { + struct list_head list; + mem_sampling_record_cb_type cb; +}; +LIST_HEAD(mem_sampling_record_cb_list); + +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) + return; + } + + cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL); + if (!cb_entry) + return; + + cb_entry->cb = cb; + list_add(&(cb_entry->list), &mem_sampling_record_cb_list); +} + +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) { + list_del(&cb_entry->list); + kfree(cb_entry); + return; + } + } +} + +void mem_sampling_process(void) +{ + int i, nr_records; + struct mem_sampling_record *record; + struct mem_sampling_record *record_base; + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + mem_sampling_ops.sampling_decoding(); + + record_base = (struct mem_sampling_record *)mem_sampling_ops.mm_spe_getbuf_addr(); + nr_records = mem_sampling_ops.mm_spe_getnum_record(); + + if (list_empty(&mem_sampling_record_cb_list)) + goto out; + + for (i = 0; i < nr_records; i++) { + record = record_base + i; + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + cb_entry->cb(record); + } + } +out: + mem_sampling_ops.sampling_continue(); + +} +EXPORT_SYMBOL_GPL(mem_sampling_process); + +static inline enum mem_sampling_type_enum mem_sampling_get_type(void) +{ +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING + return MEM_SAMPLING_ARM_SPE; +#else + return MEM_SAMPLING_UNSUPPORTED; +#endif +} + +static int __init mem_sampling_init(void) +{ + enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type(); + + switch (mem_sampling_type) { + case MEM_SAMPLING_ARM_SPE: + 
+		mem_sampling_ops.sampling_start = mm_spe_start;
+		mem_sampling_ops.sampling_stop = mm_spe_stop;
+		mem_sampling_ops.sampling_continue = mm_spe_continue;
+		mem_sampling_ops.sampling_decoding = mm_spe_decoding;
+		mem_sampling_ops.mm_spe_getbuf_addr = mm_spe_getbuf_addr;
+		mem_sampling_ops.mm_spe_getnum_record = mm_spe_getnum_record;
+
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type(%d), disable access hint!\n",
+			mem_sampling_type);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+late_initcall(mem_sampling_init);
-- 
2.25.1