hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add a mem_sampling abstract layer to provide hardware memory-access
sampling for kernel features such as NUMA balancing and DAMON.

The mem_sampling layer provides an interface to start hardware PMU
sampling on the current CPU, and callback registration so that
subscribers can receive the sampled access information (e.g. NUMA
balancing in subsequent patches). Internally, mem_sampling registers a
callback with the specific PMU driver, which forwards captured records
to the higher levels through the registered callbacks. Sampling actions
themselves remain managed by the hardware PMU layer.
CONFIG_MEM_SAMPLING is added to enable the mem_sampling layer.

For now, mem_sampling supports only the SPE driver. Support for new
hardware PMUs can be added to mem_sampling without any change to
higher-level kernel feature code.
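As an illustration, a consumer of the abstract layer would subscribe to
sampled records roughly as follows (a minimal sketch; the demo_* names
are hypothetical and not part of this patch):

	#include <linux/init.h>
	#include <linux/printk.h>
	#include <linux/mem_sampling.h>

	/* Invoked by the mem_sampling layer once per captured record. */
	static void demo_record_cb(struct mem_sampling_record *record)
	{
		if (record->op & MEM_SAMPLING_LD)
			pr_debug("load: va=0x%llx latency=%u\n",
				 record->virt_addr, record->latency);
	}

	static int __init demo_init(void)
	{
		/* Duplicate registrations of the same callback are ignored. */
		mem_sampling_record_cb_register(demo_record_cb);
		return 0;
	}
	late_initcall(demo_init);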
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
Signed-off-by: Shuang Yan <yanshuang7@huawei.com>
---
 drivers/arm/spe/spe.c        |  21 +++++-
 drivers/arm/spe/spe.h        |   2 +-
 include/linux/mem_sampling.h |  98 +++++++++++++++++++++++++++
 mm/Kconfig                   |  11 +++
 mm/Makefile                  |   1 +
 mm/mem_sampling.c            | 126 +++++++++++++++++++++++++++++++++++
 6 files changed, 256 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/mem_sampling.h
 create mode 100644 mm/mem_sampling.c
diff --git a/drivers/arm/spe/spe.c b/drivers/arm/spe/spe.c
index 26abfabf90ba..88d7cfbb6633 100644
--- a/drivers/arm/spe/spe.c
+++ b/drivers/arm/spe/spe.c
@@ -13,6 +13,7 @@
 #include <linux/of_irq.h>
 #include <linux/perf/arm_pmu.h>
 #include <linux/platform_device.h>
+#include <linux/mem_sampling.h>
 
 #include "spe-decoder/arm-spe-decoder.h"
 #include "spe-decoder/arm-spe-pkt-decoder.h"
@@ -31,6 +32,12 @@ static enum cpuhp_state arm_spe_online;
 
 DEFINE_PER_CPU(struct arm_spe_buf, per_cpu_spe_buf);
 
+mem_sampling_cb_type arm_spe_sampling_cb;
+void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb)
+{
+	arm_spe_sampling_cb = cb;
+}
+
 static inline int arm_spe_per_buffer_alloc(int cpu)
 {
 	struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu);
@@ -376,6 +383,16 @@ static irqreturn_t arm_spe_irq_handler(int irq, void *dev)
 	case SPE_PMU_BUF_FAULT_ACT_OK:
 		spe_buf->nr_records = 0;
 		arm_spe_decode_buf(spe_buf->cur, spe_buf->size);
+
+		/*
+		 * Callback function processing record data.
+		 * Call one: arm_spe_sampling_cb - mem_sampling layer.
+		 * TODO: use per CPU workqueue to process data and reduce
+		 * interrupt processing time
+		 */
+		if (arm_spe_sampling_cb)
+			arm_spe_sampling_cb((struct mem_sampling_record *)spe_buf->record_base,
+					    spe_buf->nr_records);
 		break;
 
 	case SPE_PMU_BUF_FAULT_ACT_SPURIOUS:
@@ -663,7 +680,7 @@ static void arm_spe_sample_para_init(void)
 void arm_spe_record_enqueue(struct arm_spe_record *record)
 {
 	struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf);
-	struct arm_spe_record *record_tail;
+	struct mem_sampling_record *record_tail;
 
 	if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) {
 		pr_err("nr_records exceeded!\n");
@@ -672,7 +689,7 @@ void arm_spe_record_enqueue(struct arm_spe_record *record)
 
 	record_tail = spe_buf->record_base +
 		      spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE;
-	*record_tail = *(struct arm_spe_record *)record;
+	*record_tail = *(struct mem_sampling_record *)record;
 	spe_buf->nr_records++;
 }
 
diff --git a/drivers/arm/spe/spe.h b/drivers/arm/spe/spe.h
index 4348d15581dd..865888b8260f 100644
--- a/drivers/arm/spe/spe.h
+++ b/drivers/arm/spe/spe.h
@@ -14,7 +14,7 @@
 #define SPE_SAMPLE_PERIOD 1024
 
 #define SPE_RECORD_BUFFER_MAX_RECORDS (100)
-#define SPE_RECORD_ENTRY_SIZE sizeof(struct arm_spe_record)
+#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record)
 
 #define SPE_PMU_FEAT_FILT_EVT (1UL << 0)
 #define SPE_PMU_FEAT_FILT_TYP (1UL << 1)
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 000000000000..282f3f7d415b
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to mem_sampling_record, and then
+ * mem_sampling_record_captured_cb_type invokes the callbacks to
+ * pass the record.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS = 1 << 0,
+	MEM_SAMPLING_L1D_MISS = 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS = 1 << 2,
+	MEM_SAMPLING_LLC_MISS = 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS = 1 << 4,
+	MEM_SAMPLING_TLB_MISS = 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS = 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS = 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD = 1 << 0,
+	MEM_SAMPLING_ST = 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type type;
+	int err;
+	u32 op;
+	u32 latency;
+	u64 from_ip;
+	u64 to_ip;
+	u64 timestamp;
+	u64 virt_addr;
+	u64 phys_addr;
+	u64 context_id;
+	u16 source;
+};
+
+/*
+ * Callbacks should be registered using mem_sampling_record_cb_register()
+ * by NUMA, DAMON, etc. during their initialisation.
+ * Callbacks will be invoked when new hardware pmu records are captured.
+ */
+typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record);
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb);
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb);
+
+#ifdef CONFIG_MEM_SAMPLING
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr);
+#else
+static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { };
+#endif
+
+/* invoked by specific mem_sampling */
+typedef void (*mem_sampling_cb_type)(struct mem_sampling_record *record_base,
+				     int n_records);
+
+
+struct mem_sampling_ops_struct {
+	int (*sampling_start)(void);
+	void (*sampling_stop)(void);
+	void (*sampling_continue)(void);
+};
+extern struct mem_sampling_ops_struct mem_sampling_ops;
+
+enum mem_sampling_type_enum {
+	MEM_SAMPLING_ARM_SPE,
+	MEM_SAMPLING_UNSUPPORTED
+};
+
+#ifdef CONFIG_ARM_SPE
+int arm_spe_start(void);
+void arm_spe_stop(void);
+void arm_spe_continue(void);
+int arm_spe_enabled(void);
+void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb);
+#else
+static inline void arm_spe_stop(void) { };
+static inline void arm_spe_continue(void) { };
+static inline void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { };
+
+static inline int arm_spe_start(void)
+{
+	return 0;
+}
+
+static inline int arm_spe_enabled(void)
+{
+	return 0;
+}
+#endif /* CONFIG_ARM_SPE */
+#endif /* __MEM_SAMPLING_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 70c85533aada..381d440f85eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1008,6 +1008,17 @@ config EXTEND_HUGEPAGE_MAPPING
 	help
 	  Introduce vmalloc/vmap/remap interfaces that handle only hugepages.
 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	depends on ARM64
+	select ARM_SPE if ARM64
+	help
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory access for use by kernel
+	  features. It requires at least one hardware pmu (e.g. ARM_SPE) to
+	  be enabled.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index a014a5e08f7b..112966190c1d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
new file mode 100644
index 000000000000..df8e8823e211
--- /dev/null
+++ b/mm/mem_sampling.c
@@ -0,0 +1,126 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * mem_sampling.c: implement the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to mem_sampling_record, and then
+ * mem_sampling_record_captured_cb_type invokes the callbacks to
+ * pass the record.
+ *
+ * Copyright (c) 2024-2025, Huawei Technologies Ltd.
+ */
+
+#define pr_fmt(fmt) "mem_sampling: " fmt
+
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/mem_sampling.h>
+
+struct mem_sampling_ops_struct mem_sampling_ops;
+
+struct mem_sampling_record_cb_list_entry {
+	struct list_head list;
+	mem_sampling_record_cb_type cb;
+};
+LIST_HEAD(mem_sampling_record_cb_list);
+
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			pr_info("mem_sampling record cb already registered\n");
+			return;
+		}
+	}
+
+	cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL);
+	if (!cb_entry) {
+		pr_info("mem_sampling record cb entry alloc memory failed\n");
+		return;
+	}
+
+	cb_entry->cb = cb;
+	list_add(&(cb_entry->list), &mem_sampling_record_cb_list);
+}
+
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			list_del(&cb_entry->list);
+			kfree(cb_entry);
+			return;
+		}
+	}
+}
+
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	if (!mem_sampling_ops.sampling_start)
+		return;
+
+	if (curr->mm)
+		mem_sampling_ops.sampling_start();
+	else
+		mem_sampling_ops.sampling_stop();
+}
+
+static void mem_sampling_process(struct mem_sampling_record *record_base, int nr_records)
+{
+	int i;
+	struct mem_sampling_record *record;
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	if (list_empty(&mem_sampling_record_cb_list))
+		goto out;
+
+	for (i = 0; i < nr_records; i++) {
+		record = record_base + i;
+		list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+			cb_entry->cb(record);
+		}
+	}
+out:
+	mem_sampling_ops.sampling_continue();
+}
+
+static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
+{
+#ifdef CONFIG_ARM_SPE
+	return MEM_SAMPLING_ARM_SPE;
+#else
+	return MEM_SAMPLING_UNSUPPORTED;
+#endif
+}
+
+static int __init mem_sampling_init(void)
+{
+	enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type();
+
+	switch (mem_sampling_type) {
+	case MEM_SAMPLING_ARM_SPE:
+		if (!arm_spe_enabled())
+			return -ENODEV;
+		mem_sampling_ops.sampling_start = arm_spe_start;
+		mem_sampling_ops.sampling_stop = arm_spe_stop;
+		mem_sampling_ops.sampling_continue = arm_spe_continue;
+
+		arm_spe_record_capture_callback_register(mem_sampling_process);
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type(%d), disable access hint!\n",
+			mem_sampling_type);
+		return -ENODEV;
+	}
+
+	return 0;
+}
+late_initcall(mem_sampling_init);