
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/IC8KS8
CVE: NA

--------------------------------

Add a mem_sampling abstract layer to provide hardware memory access
information to kernel features, e.g. NUMA balancing or DAMON.

The mem_sampling layer provides an interface to start hardware pmu
sampling on the current cpu, and callback registration so that
subscribers can receive the sampled access records (e.g. NUMA
balancing in subsequent patches). Internally, mem_sampling registers
a callback in the specific pmu driver, which forwards the captured
records to the higher level through the registered callbacks.
Sampling actions (start, stop, continue, decoding) are likewise
managed through this layer.

CONFIG_MEM_SAMPLING is added to enable the mem_sampling layer. For
now, mem_sampling supports only the SPE driver. Support for new
hardware pmus can be added to mem_sampling without any change to the
higher-level kernel feature code.

Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
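As a usage illustration only (not part of this patch): a consumer such
as NUMA balancing would subscribe roughly as sketched below. The
callback and init-hook names are hypothetical; the registration
helpers are the ones added in mm/mem_sampling.c, whose declarations
are presumably exposed to users when the first consumer lands.

  #include <linux/init.h>
  #include <linux/printk.h>
  #include <linux/mem_sampling.h>

  /* Hypothetical consumer: handle one decoded hardware record. */
  static void numa_access_record_cb(struct mem_sampling_record *record)
  {
  	if (record->op & MEM_SAMPLING_LD)
  		pr_debug("load: va=0x%llx pa=0x%llx lat=%u\n",
  			 record->virt_addr, record->phys_addr,
  			 record->latency);
  }

  static int __init numa_access_sampling_init(void)
  {
  	/* Re-registering the same callback is a no-op in the layer. */
  	mem_sampling_record_cb_register(numa_access_record_cb);
  	return 0;
  }
  late_initcall(numa_access_sampling_init);

At runtime, the pmu driver is expected to call mem_sampling_process()
when its buffer fills; that path decodes the raw records via
sampling_decoding(), fans each record out to every registered
callback, and re-arms the hardware via sampling_continue().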
 drivers/arm/mm_monitor/mm_spe.c |   5 +-
 drivers/arm/mm_monitor/mm_spe.h |   2 +-
 include/linux/mem_sampling.h    |  82 +++++++++++++++++++++
 mm/Kconfig                      |  15 ++++
 mm/Makefile                     |   1 +
 mm/mem_sampling.c               | 126 ++++++++++++++++++++++++++++++++
 6 files changed, 228 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mem_sampling.h
 create mode 100644 mm/mem_sampling.c

diff --git a/drivers/arm/mm_monitor/mm_spe.c b/drivers/arm/mm_monitor/mm_spe.c
index f2f2b3320357..cbde84c228a0 100644
--- a/drivers/arm/mm_monitor/mm_spe.c
+++ b/drivers/arm/mm_monitor/mm_spe.c
@@ -11,6 +11,7 @@
 
 #include <linux/of_device.h>
 #include <linux/perf/arm_pmu.h>
+#include <linux/mem_sampling.h>
 
 #include "spe-decoder/arm-spe-decoder.h"
 #include "spe-decoder/arm-spe-pkt-decoder.h"
@@ -377,7 +378,7 @@ static void mm_spe_sample_para_init(void)
 void mm_spe_record_enqueue(struct arm_spe_record *record)
 {
 	struct mm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf);
-	struct arm_spe_record *record_tail;
+	struct mem_sampling_record *record_tail;
 
 	if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) {
 		pr_err("nr_records exceeded!\n");
@@ -386,7 +387,7 @@ void mm_spe_record_enqueue(struct arm_spe_record *record)
 
 	record_tail = spe_buf->record_base +
 		      spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE;
-	*record_tail = *(struct arm_spe_record *)record;
+	*record_tail = *(struct mem_sampling_record *)record;
 
 	spe_buf->nr_records++;
 }
diff --git a/drivers/arm/mm_monitor/mm_spe.h b/drivers/arm/mm_monitor/mm_spe.h
index da134e8794c8..bd0a1574a1b0 100644
--- a/drivers/arm/mm_monitor/mm_spe.h
+++ b/drivers/arm/mm_monitor/mm_spe.h
@@ -9,7 +9,7 @@
 #define SPE_SAMPLE_PERIOD		1024
 
 #define SPE_RECORD_BUFFER_MAX_RECORDS	(100)
-#define SPE_RECORD_ENTRY_SIZE		sizeof(struct arm_spe_record)
+#define SPE_RECORD_ENTRY_SIZE		sizeof(struct mem_sampling_record)
 
 #define ARMV8_SPE_MEM_SAMPLING_PDEV_NAME "arm,mm_spe,spe-v1"
 struct mm_spe {
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 000000000000..3e000a0deced
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,82 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to struct mem_sampling_record, and
+ * the registered callbacks (mem_sampling_record_cb_type) are then
+ * invoked to pass each record to its consumers.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS		= 1 << 0,
+	MEM_SAMPLING_L1D_MISS		= 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS		= 1 << 2,
+	MEM_SAMPLING_LLC_MISS		= 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS		= 1 << 4,
+	MEM_SAMPLING_TLB_MISS		= 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS	= 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS	= 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD	= 1 << 0,
+	MEM_SAMPLING_ST	= 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type	type;
+	int				err;
+	u32				op;
+	u32				latency;
+	u64				from_ip;
+	u64				to_ip;
+	u64				timestamp;
+	u64				virt_addr;
+	u64				phys_addr;
+	u64				context_id;
+	u64				boost_spe_addr[8];
+	u64				rem_addr;
+	u16				source;
+};
+
+struct mem_sampling_ops_struct {
+	int (*sampling_start)(void);
+	void (*sampling_stop)(void);
+	void (*sampling_continue)(void);
+	void (*sampling_decoding)(void);
+	struct mm_spe_buf* (*mm_spe_getbuf_addr)(void);
+	int (*mm_spe_getnum_record)(void);
+
+};
+extern struct mem_sampling_ops_struct mem_sampling_ops;
+
+enum mem_sampling_type_enum {
+	MEM_SAMPLING_ARM_SPE,
+	MEM_SAMPLING_UNSUPPORTED
+};
+
+#ifdef CONFIG_ARM_SPE_MEM_SAMPLING
+int mm_spe_start(void);
+void mm_spe_stop(void);
+void mm_spe_continue(void);
+void mm_spe_decoding(void);
+int mm_spe_getnum_record(void);
+struct mm_spe_buf *mm_spe_getbuf_addr(void);
+int mm_spe_enabled(void);
+void arm_spe_set_probe_status(int status);
+#else
+static inline void mm_spe_stop(void) { }
+static inline void mm_spe_continue(void) { }
+static inline void mm_spe_decoding(void) { }
+static inline void arm_spe_set_probe_status(int status) { }
+static inline int mm_spe_start(void) { return 0; }
+static inline int mm_spe_getnum_record(void) { return 0; }
+static inline struct mm_spe_buf *mm_spe_getbuf_addr(void) { return NULL; }
+static inline int mm_spe_enabled(void) { return 0; }
+#endif /* CONFIG_ARM_SPE_MEM_SAMPLING */
+#endif /* __MEM_SAMPLING_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 56171b9dd873..c2b45a71a992 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1452,6 +1452,21 @@ config BPF_READAHEAD
 	  of the kernel is adjusted based on the application read mode to
 	  optimize the read performance in the Spark SQL scenario,
 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	depends on ARM64
+	select ARM_SPE_MEM_SAMPLING if ARM64
+	help
+	  This option enables hardware-based memory sampling for kernel features
+	  such as NUMA balancing and DAMON. If disabled, software-based memory
+	  sampling will be used instead.
+
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory access for use by kernel
+	  features. It requires at least one hardware PMU (e.g. ARM_SPE_MEM_SAMPLING)
+	  to be enabled.
+ source "mm/damon/Kconfig" endmenu diff --git a/mm/Makefile b/mm/Makefile index 11df2de8fdbe..674777b7c99f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -147,3 +147,4 @@ obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_DYNAMIC_POOL) += dynamic_pool.o +obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c new file mode 100644 index 000000000000..551c18452b2e --- /dev/null +++ b/mm/mem_sampling.c @@ -0,0 +1,126 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * mem_sampling.c: declare the mem_sampling abstract layer and provide + * unified pmu sampling for NUMA, DAMON, etc. + * + * Sample records are converted to mem_sampling_record, and then + * mem_sampling_record_captured_cb_type invoke the callbacks to + * pass the record. + * + * Copyright (c) 2024-2025, Huawei Technologies Ltd. + */ + +#define pr_fmt(fmt) "mem_sampling: " fmt + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/mem_sampling.h> + +struct mem_sampling_ops_struct mem_sampling_ops; + +/* + * Callbacks should be registered using mem_sampling_record_cb_register() + * by NUMA, DAMON and etc during their initialisation. + * Callbacks will be invoked on new hardware pmu records caputured. + */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); + +struct mem_sampling_record_cb_list_entry { + struct list_head list; + mem_sampling_record_cb_type cb; +}; +LIST_HEAD(mem_sampling_record_cb_list); + +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) + return; + } + + cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL); + if (!cb_entry) + return; + + cb_entry->cb = cb; + list_add(&(cb_entry->list), &mem_sampling_record_cb_list); +} + +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb) +{ + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + if (cb_entry->cb == cb) { + list_del(&cb_entry->list); + kfree(cb_entry); + return; + } + } +} + +void mem_sampling_process(void) +{ + int i, nr_records; + struct mem_sampling_record *record; + struct mem_sampling_record *record_base; + struct mem_sampling_record_cb_list_entry *cb_entry, *tmp; + + mem_sampling_ops.sampling_decoding(); + + record_base = (struct mem_sampling_record *)mem_sampling_ops.mm_spe_getbuf_addr(); + nr_records = mem_sampling_ops.mm_spe_getnum_record(); + + if (list_empty(&mem_sampling_record_cb_list)) + goto out; + + for (i = 0; i < nr_records; i++) { + record = record_base + i; + list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) { + cb_entry->cb(record); + } + } +out: + mem_sampling_ops.sampling_continue(); + +} +EXPORT_SYMBOL_GPL(mem_sampling_process); + +static inline enum mem_sampling_type_enum mem_sampling_get_type(void) +{ +#ifdef CONFIG_ARM_SPE_MEM_SAMPLING + return MEM_SAMPLING_ARM_SPE; +#else + return MEM_SAMPLING_UNSUPPORTED; +#endif +} + +static int __init mem_sampling_init(void) +{ + enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type(); + + switch (mem_sampling_type) { + case MEM_SAMPLING_ARM_SPE: + 
+		mem_sampling_ops.sampling_start = mm_spe_start;
+		mem_sampling_ops.sampling_stop = mm_spe_stop;
+		mem_sampling_ops.sampling_continue = mm_spe_continue;
+		mem_sampling_ops.sampling_decoding = mm_spe_decoding;
+		mem_sampling_ops.mm_spe_getbuf_addr = mm_spe_getbuf_addr;
+		mem_sampling_ops.mm_spe_getnum_record = mm_spe_getnum_record;
+
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type(%d), disable access hint!\n",
+			mem_sampling_type);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+late_initcall(mem_sampling_init);
-- 
2.25.1