hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
The preceding patches in this series split SPE out into an independent driver and separated the basic SPE operations (interrupt handling, enable/disable, and so on) from the perf driver. SPE users now fall into two categories: first, in-kernel modules such as NUMA balancing and DAMON, which start and stop SPE through the mem_sampling abstraction layer; second, user space, which samples through the perf driver. In addition, to avoid intrusive modification of the perf code as far as possible, perf controls the SPE driver separately (enabling and stopping it) through its own callbacks, as sketched below.
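To make the intended handover concrete, the following is a minimal sketch of how the two users pass the SPE hardware back and forth. The registration helpers, the enums and arm_spe_set_user() are the interfaces added by this series; the example_* functions, the init wrapper and the call sites are hypothetical and shown only for illustration (in the real code, arm_spe_set_user() is called from the perf start/stop path and the user-switch callback is registered by mem_sampling at init):

  #include <linux/init.h>
  #include <linux/mem_sampling.h>
  #include <linux/perf/arm_pmu.h>

  /* perf side: invoked from the SPE IRQ handler while perf owns SPE */
  static void example_perf_cb(enum arm_spe_buf_fault_action act)
  {
          /* drain the AUX buffer and, for SPE_PMU_BUF_FAULT_ACT_OK, restart profiling */
  }

  /* mem_sampling side: notified when perf takes over or releases SPE */
  static void example_user_switch_cb(enum user_switch_type type)
  {
          if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) {
                  /* save the current state and disable in-kernel sampling */
          } else {
                  /* restore the saved state */
          }
  }

  static int __init example_init(void)
  {
          /* perf registers its record-processing callback ... */
          arm_spe_sampling_for_perf_callback_register(example_perf_cb);
          /* ... and mem_sampling registers the user-switch notifier */
          arm_spe_user_switch_callback_register(example_user_switch_cb);

          /* perf event start path: SPE ownership moves to perf */
          arm_spe_set_user(ARM_SPE_USER_PERF);
          /* perf event stop path: ownership returns to mem_sampling */
          arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING);
          return 0;
  }
  late_initcall(example_init);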
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
Signed-off-by: Tong Tiangen <tongtiangen@huawei.com>
---
 drivers/arm/spe/Kconfig      |   3 +-
 drivers/arm/spe/spe.c        |  73 ++++++-
 drivers/arm/spe/spe.h        |  10 +
 drivers/perf/Kconfig         |   2 +-
 drivers/perf/arm_pmu_acpi.c  |  32 ++-
 drivers/perf/arm_spe_pmu.c   | 371 ++++++-----------------------------
 include/linux/mem_sampling.h |   9 +
 include/linux/perf/arm_pmu.h |   8 +
 kernel/sched/fair.c          |   3 +-
 mm/mem_sampling.c            |  57 +++++-
 10 files changed, 242 insertions(+), 326 deletions(-)
diff --git a/drivers/arm/spe/Kconfig b/drivers/arm/spe/Kconfig index 5ede60666349..2d81364d0e0a 100644 --- a/drivers/arm/spe/Kconfig +++ b/drivers/arm/spe/Kconfig @@ -4,7 +4,8 @@ # config ARM_SPE bool "In-kernel SPE for driver for page access profiling" - depends on ARM64 && !ARM_SPE_PMU + depends on ARM64 + default n help Enable support for the ARMv8.2 Statistical Profiling Extension, which provides periodic sampling of operations in the CPU pipeline. diff --git a/drivers/arm/spe/spe.c b/drivers/arm/spe/spe.c index 88d7cfbb6633..91984b82c76b 100644 --- a/drivers/arm/spe/spe.c +++ b/drivers/arm/spe/spe.c @@ -14,6 +14,7 @@ #include <linux/perf/arm_pmu.h> #include <linux/platform_device.h> #include <linux/mem_sampling.h> +#include <linux/perf/arm_pmu.h>
#include "spe-decoder/arm-spe-decoder.h" #include "spe-decoder/arm-spe-pkt-decoder.h" @@ -30,6 +31,9 @@ static int spe_probe_status = SPE_INIT_FAIL; /* Keep track of our dynamic hotplug state */ static enum cpuhp_state arm_spe_online;
+/* keep track of who uses the SPE */ +static enum arm_spe_user_e arm_spe_user = ARM_SPE_USER_MEM_SAMPLING; + DEFINE_PER_CPU(struct arm_spe_buf, per_cpu_spe_buf);
mem_sampling_cb_type arm_spe_sampling_cb; @@ -37,6 +41,31 @@ void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { arm_spe_sampling_cb = cb; } +EXPORT_SYMBOL_GPL(arm_spe_record_capture_callback_register); + +/* SPE sampling callback for perf */ +perf_sampling_cb_type arm_spe_sampling_perf_cb; +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb) +{ + arm_spe_sampling_perf_cb = cb; +} +EXPORT_SYMBOL_GPL(arm_spe_sampling_for_perf_callback_register); + +/* + * SPE can be used by mem_sampling/perf, perf takes precedence. + * When perf is used, this callback is used to disable mem_sampling. + */ +mem_sampling_user_switch_cb_type arm_spe_user_switch_cb; +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) +{ + arm_spe_user_switch_cb = cb; +} + +struct arm_spe *arm_spe_get_desc(void) +{ + return spe; +} +EXPORT_SYMBOL_GPL(arm_spe_get_desc);
static inline int arm_spe_per_buffer_alloc(int cpu) { @@ -371,6 +400,10 @@ static irqreturn_t arm_spe_irq_handler(int irq, void *dev)
switch (act) { case SPE_PMU_BUF_FAULT_ACT_FATAL: + if (unlikely(arm_spe_user == ARM_SPE_USER_PERF)) { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } /* * If a fatal exception occurred then leaving the profiling * buffer enabled is a recipe waiting to happen. Since @@ -381,18 +414,27 @@ static irqreturn_t arm_spe_irq_handler(int irq, void *dev) arm_spe_disable_and_drain_local(); break; case SPE_PMU_BUF_FAULT_ACT_OK: - spe_buf->nr_records = 0; - arm_spe_decode_buf(spe_buf->cur, spe_buf->size); - /* * Callback function processing record data. - * Call one: arm_spe_sampling_cb - mem_sampling layer. - * TODO: use per CPU workqueue to process data and reduce - * interrupt processing time + * ARM_SPE_USER_MEM_SAMPLING: arm_spe_record_captured_cb - mem_sampling layer. + * ARM_SPE_USER_PERF: arm_spe_sampling_perf_cb - perf. + * TODO: 1) use per CPU workqueue to process data and reduce + * interrupt processing time. 2) The "register" function can be + * registered in a callback structure. */ - if (arm_spe_sampling_cb) - arm_spe_sampling_cb((struct mem_sampling_record *)spe_buf->record_base, - spe_buf->nr_records); + if (likely(arm_spe_user == ARM_SPE_USER_MEM_SAMPLING)) { + spe_buf->nr_records = 0; + arm_spe_decode_buf(spe_buf->cur, spe_buf->size); + + if (arm_spe_sampling_cb) + arm_spe_sampling_cb( + (struct mem_sampling_record *)spe_buf->record_base, + spe_buf->nr_records); + } else { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } + break;
case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: @@ -550,6 +592,19 @@ static void __arm_spe_stop_one(void) __arm_spe_reset_local(); }
+void arm_spe_set_user(enum arm_spe_user_e user) +{ + if (user == ARM_SPE_USER_PERF) + arm_spe_user_switch_cb(USER_SWITCH_AWAY_FROM_MEM_SAMPLING); + else + arm_spe_user_switch_cb(USER_SWITCH_BACK_TO_MEM_SAMPLING); + + __arm_spe_reset_local(); + + arm_spe_user = user; +} +EXPORT_SYMBOL_GPL(arm_spe_set_user); + static int arm_spe_cpu_startup(unsigned int cpu, struct hlist_node *node) { struct arm_spe *spe; diff --git a/drivers/arm/spe/spe.h b/drivers/arm/spe/spe.h index 865888b8260f..0e109af35aa2 100644 --- a/drivers/arm/spe/spe.h +++ b/drivers/arm/spe/spe.h @@ -24,6 +24,7 @@ #define SPE_PMU_FEAT_ERND (1UL << 5) #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) #define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) +#define ARM_SPE_BUF_PAD_BYTE (0) #define PMBLIMITR_EL1_E GENMASK(0, 0) #define PMBSR_EL1_S GENMASK(17, 17) #define PMBSR_EL1_EC GENMASK(31, 26) @@ -71,6 +72,10 @@ #define PMSIDR_EL1_FL GENMASK(2, 2) #define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1) #define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define PMBSR_EL1_COLL_MASK GENMASK(16, 16) +#define PMBSR_EL1_COLL PMBSR_EL1_COLL_MASK +#define PMBSR_EL1_DL_MASK GENMASK(19, 19) +#define PMBSR_EL1_DL PMBSR_EL1_DL_MASK
enum arm_spe_buf_fault_action { SPE_PMU_BUF_FAULT_ACT_SPURIOUS, @@ -78,6 +83,11 @@ enum arm_spe_buf_fault_action { SPE_PMU_BUF_FAULT_ACT_OK, };
+enum arm_spe_user_e { + ARM_SPE_USER_PERF, + ARM_SPE_USER_MEM_SAMPLING, +}; + struct arm_spe { struct pmu pmu; struct platform_device *pdev; diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 67ad53cde11f..e6eee6f3d33c 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -124,7 +124,7 @@ config XGENE_PMU
config ARM_SPE_PMU tristate "Enable support for the ARMv8.2 Statistical Profiling Extension" - depends on ARM64 + depends on ARM_SPE && MEM_SAMPLING help Enable perf support for the ARMv8.2 Statistical Profiling Extension, which provides periodic sampling of operations in diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index b6b7998a5f9f..4e716b700c0f 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -124,7 +124,34 @@ arm_acpi_register_pmu_device(struct platform_device *pdev, u8 len, return ret; }
-#if IS_ENABLED(CONFIG_ARM_SPE_PMU) || IS_ENABLED(CONFIG_ARM_SPE) +#if IS_ENABLED(CONFIG_ARM_SPE_PMU) +static struct resource spe_pmu_resources[] = { + { + } +}; + +static struct platform_device spe_pmu_dev = { + .name = ARMV8_SPE_PMU_PDEV_NAME, + .id = -1, + .resource = spe_pmu_resources, + .num_resources = ARRAY_SIZE(spe_pmu_resources) +}; + +static void arm_spe_pmu_acpi_register_device(void) +{ + int ret; + + ret = platform_device_register(&spe_pmu_dev); + if (ret < 0) + pr_warn("ACPI: SPE_PMU: Unable to register device\n"); +} +#else +static inline void arm_spe_pmu_acpi_register_device(void) +{ +} +#endif + +#if IS_ENABLED(CONFIG_ARM_SPE) static struct resource spe_resources[] = { { /* irq */ @@ -160,7 +187,7 @@ static void arm_spe_acpi_register_device(void) static inline void arm_spe_acpi_register_device(void) { } -#endif /* CONFIG_ARM_SPE_PMU */ +#endif /* CONFIG_ARM_SPE */
#if IS_ENABLED(CONFIG_CORESIGHT_TRBE) static struct resource trbe_resources[] = { @@ -402,6 +429,7 @@ static int arm_pmu_acpi_init(void) return 0;
arm_spe_acpi_register_device(); + arm_spe_pmu_acpi_register_device(); arm_trbe_acpi_register_device();
ret = arm_pmu_acpi_parse_irqs(); diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 2a4ebdd1ee78..970bc2f3c4bf 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -39,6 +39,8 @@ #include <asm/mmu.h> #include <asm/sysreg.h>
+#include "../arm/spe/spe.h" + /* * Cache if the event is allowed to trace Context information. * This allows us to perform the check, i.e, perfmon_capable(), @@ -57,8 +59,6 @@ static bool get_spe_event_has_cx(struct perf_event *event) return !!(event->hw.flags & SPE_PMU_HW_FLAGS_CX); }
-#define ARM_SPE_BUF_PAD_BYTE 0 - struct arm_spe_pmu_buf { int nr_pages; bool snapshot; @@ -76,13 +76,6 @@ struct arm_spe_pmu { u16 min_period; u16 counter_sz;
-#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) -#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) -#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) -#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) -#define SPE_PMU_FEAT_LDS (1UL << 4) -#define SPE_PMU_FEAT_ERND (1UL << 5) -#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features;
u16 max_record_sz; @@ -95,15 +88,6 @@ struct arm_spe_pmu { /* Convert a free-running index from perf into an SPE buffer offset */ #define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT))
-/* Keep track of our dynamic hotplug state */ -static enum cpuhp_state arm_spe_pmu_online; - -enum arm_spe_pmu_buf_fault_action { - SPE_PMU_BUF_FAULT_ACT_SPURIOUS, - SPE_PMU_BUF_FAULT_ACT_FATAL, - SPE_PMU_BUF_FAULT_ACT_OK, -}; - /* This sysfs gunk was really good fun to write. */ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_ARCH_INST = 0, @@ -276,6 +260,8 @@ static const struct attribute_group *arm_spe_pmu_attr_groups[] = { NULL, };
+struct arm_spe_pmu *spe_pmu_local; + /* Convert between user ABI and register values */ static u64 arm_spe_event_to_pmscr(struct perf_event *event) { @@ -551,12 +537,12 @@ static void arm_spe_pmu_disable_and_drain_local(void) }
/* IRQ handling */ -static enum arm_spe_pmu_buf_fault_action +static enum arm_spe_buf_fault_action arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) { const char *err_str; u64 pmbsr; - enum arm_spe_pmu_buf_fault_action ret; + enum arm_spe_buf_fault_action ret;
/* * Ensure new profiling data is visible to the CPU and any external @@ -621,57 +607,6 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) return ret; }
-static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) -{ - struct perf_output_handle *handle = dev; - struct perf_event *event = handle->event; - enum arm_spe_pmu_buf_fault_action act; - - if (!perf_get_aux(handle)) - return IRQ_NONE; - - act = arm_spe_pmu_buf_get_fault_act(handle); - if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) - return IRQ_NONE; - - /* - * Ensure perf callbacks have completed, which may disable the - * profiling buffer in response to a TRUNCATION flag. - */ - irq_work_run(); - - switch (act) { - case SPE_PMU_BUF_FAULT_ACT_FATAL: - /* - * If a fatal exception occurred then leaving the profiling - * buffer enabled is a recipe waiting to happen. Since - * fatal faults don't always imply truncation, make sure - * that the profiling buffer is disabled explicitly before - * clearing the syndrome register. - */ - arm_spe_pmu_disable_and_drain_local(); - break; - case SPE_PMU_BUF_FAULT_ACT_OK: - /* - * We handled the fault (the buffer was full), so resume - * profiling as long as we didn't detect truncation. - * PMBPTR might be misaligned, but we'll burn that bridge - * when we get to it. - */ - if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { - arm_spe_perf_aux_output_begin(handle, event); - isb(); - } - break; - case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: - /* We've seen you before, but GCC has the memory of a sieve. */ - break; - } - - /* The buffer pointers are now sane, so resume profiling. */ - write_sysreg_s(0, SYS_PMBSR_EL1); - return IRQ_HANDLED; -}
static u64 arm_spe_pmsevfr_res0(u16 pmsver) { @@ -746,6 +681,8 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
+ arm_spe_set_user(ARM_SPE_USER_PERF); + hwc->state = 0; arm_spe_perf_aux_output_begin(handle, event); if (hwc->state) @@ -780,8 +717,14 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle);
/* If we're already stopped, then nothing to do */ - if (hwc->state & PERF_HES_STOPPED) + if (hwc->state & PERF_HES_STOPPED) { + /* + * PERF_HES_STOPPED may be set in arm_spe_perf_aux_output_begin, + * so switch the user back here. + */ + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); return; + }
/* Stop all trace generation */ arm_spe_pmu_disable_and_drain_local(); @@ -793,7 +736,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) * path. */ if (perf_get_aux(handle)) { - enum arm_spe_pmu_buf_fault_action act; + enum arm_spe_buf_fault_action act;
act = arm_spe_pmu_buf_get_fault_act(handle); if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) @@ -812,6 +755,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) }
hwc->state |= PERF_HES_STOPPED; + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); }
static int arm_spe_pmu_add(struct perf_event *event, int flags) @@ -952,233 +896,58 @@ static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu) perf_pmu_unregister(&spe_pmu->pmu); }
-static void __arm_spe_pmu_dev_probe(void *info) +void arm_spe_sampling_process(enum arm_spe_buf_fault_action act) { - int fld; - u64 reg; - struct arm_spe_pmu *spe_pmu = info; - struct device *dev = &spe_pmu->pdev->dev; - - fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1), - ID_AA64DFR0_PMSVER_SHIFT); - if (!fld) { - dev_err(dev, - "unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - spe_pmu->pmsver = (u16)fld; - - /* Read PMBIDR first to determine whether or not we have access */ - reg = read_sysreg_s(SYS_PMBIDR_EL1); - if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) { - dev_err(dev, - "profiling buffer owned by higher exception level\n"); - return; - } - - /* Minimum alignment. If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMBIDR_EL1_ALIGN_SHIFT & SYS_PMBIDR_EL1_ALIGN_MASK; - spe_pmu->align = 1 << fld; - if (spe_pmu->align > SZ_2K) { - dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - - /* It's now safe to read PMSIDR and figure out what we've got */ - reg = read_sysreg_s(SYS_PMSIDR_EL1); - if (reg & BIT(SYS_PMSIDR_EL1_FE_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT; - - if (reg & BIT(SYS_PMSIDR_EL1_FT_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP; - - if (reg & BIT(SYS_PMSIDR_EL1_FL_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT; - - if (reg & BIT(SYS_PMSIDR_EL1_ARCHINST_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST; - - if (reg & BIT(SYS_PMSIDR_EL1_LDS_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_LDS; - - if (reg & BIT(SYS_PMSIDR_EL1_ERND_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ERND; - - /* This field has a spaced out encoding, so just use a look-up */ - fld = reg >> SYS_PMSIDR_EL1_INTERVAL_SHIFT & SYS_PMSIDR_EL1_INTERVAL_MASK; - switch (fld) { - case 0: - spe_pmu->min_period = 256; - break; - case 2: - spe_pmu->min_period = 512; - break; - case 3: - spe_pmu->min_period = 768; - break; - case 4: - spe_pmu->min_period = 1024; - break; - case 5: - spe_pmu->min_period = 1536; - break; - case 6: - spe_pmu->min_period = 2048; - break; - case 7: - spe_pmu->min_period = 3072; - break; - default: - dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n", - fld); - fallthrough; - case 8: - spe_pmu->min_period = 4096; - } + struct perf_output_handle *handle = this_cpu_ptr(spe_pmu_local->handle); + struct perf_event *event = handle->event; + u64 pmbsr;
- /* Maximum record size. If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMSIDR_EL1_MAXSIZE_SHIFT & SYS_PMSIDR_EL1_MAXSIZE_MASK; - spe_pmu->max_record_sz = 1 << fld; - if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) { - dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n", - fld, smp_processor_id()); + if (!perf_get_aux(handle)) return; - } - - fld = reg >> SYS_PMSIDR_EL1_COUNTSIZE_SHIFT & SYS_PMSIDR_EL1_COUNTSIZE_MASK; - switch (fld) { - default: - dev_warn(dev, "unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n", - fld); - fallthrough; - case 2: - spe_pmu->counter_sz = 12; - } - - dev_info(dev, - "probed for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", - cpumask_pr_args(&spe_pmu->supported_cpus), - spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features); - - spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED; - return; -}
-static void __arm_spe_pmu_reset_local(void) -{ /* - * This is probably overkill, as we have no idea where we're - * draining any buffered data to... + * If we've lost data, disable profiling and also set the PARTIAL + * flag to indicate that the last record is corrupted. */ - arm_spe_pmu_disable_and_drain_local(); - - /* Reset the buffer base pointer */ - write_sysreg_s(0, SYS_PMBPTR_EL1); - isb(); - - /* Clear any pending management interrupts */ - write_sysreg_s(0, SYS_PMBSR_EL1); - isb(); -} - -static void __arm_spe_pmu_setup_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - __arm_spe_pmu_reset_local(); - enable_percpu_irq(spe_pmu->irq, IRQ_TYPE_NONE); -} - -static void __arm_spe_pmu_stop_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - disable_percpu_irq(spe_pmu->irq); - __arm_spe_pmu_reset_local(); -} - -static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_setup_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_stop_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) -{ - int ret; - cpumask_t *mask = &spe_pmu->supported_cpus; - - /* Make sure we probe the hardware on a relevant CPU */ - ret = smp_call_function_any(mask, __arm_spe_pmu_dev_probe, spe_pmu, 1); - if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED)) - return -ENXIO; - - /* Request our PPIs (note that the IRQ is still disabled) */ - ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME, - spe_pmu->handle); - if (ret) - return ret; + if (FIELD_GET(PMBSR_EL1_DL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED | + PERF_AUX_FLAG_PARTIAL);
- /* - * Register our hotplug notifier now so we don't miss any events. - * This will enable the IRQ for any supported CPUs that are already - * up. - */ - ret = cpuhp_state_add_instance(arm_spe_pmu_online, - &spe_pmu->hotplug_node); - if (ret) - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + /* Report collisions to userspace so that it can up the period */ + if (FIELD_GET(PMBSR_EL1_COLL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION);
- return ret; -} + arm_spe_perf_aux_output_end(handle);
-static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) -{ - cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node); - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + if (act == SPE_PMU_BUF_FAULT_ACT_OK) { + if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { + arm_spe_perf_aux_output_begin(handle, event); + isb(); + } + } }
-/* Driver and device probing */ -static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) +static bool arm_spe_pmu_set_cap(struct arm_spe_pmu *spe_pmu) { - struct platform_device *pdev = spe_pmu->pdev; - int irq = platform_get_irq(pdev, 0); - - if (irq < 0) - return -ENXIO; + struct arm_spe *p; + struct device *dev = &spe_pmu->pdev->dev;
- if (!irq_is_percpu(irq)) { - dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); - return -EINVAL; + p = arm_spe_get_desc(); + if (!p) { + dev_err(dev, "get spe pmu cap from arm spe driver failed!\n"); + return false; }
- if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) { - dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); - return -EINVAL; - } + spe_pmu->supported_cpus = p->supported_cpus; + spe_pmu->irq = p->irq; + spe_pmu->pmsver = p->pmsver; + spe_pmu->align = p->align; + spe_pmu->features = p->features; + spe_pmu->min_period = p->min_period; + spe_pmu->max_record_sz = p->max_record_sz; + spe_pmu->counter_sz = p->counter_sz;
- spe_pmu->irq = irq; - return 0; + return true; }
static const struct of_device_id arm_spe_pmu_of_match[] = { @@ -1188,7 +957,7 @@ static const struct of_device_id arm_spe_pmu_of_match[] = { MODULE_DEVICE_TABLE(of, arm_spe_pmu_of_match);
static const struct platform_device_id arm_spe_match[] = { - { ARMV8_SPE_PDEV_NAME, 0}, + { ARMV8_SPE_PMU_PDEV_NAME, 0}, { } }; MODULE_DEVICE_TABLE(platform, arm_spe_match); @@ -1221,22 +990,17 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) spe_pmu->pdev = pdev; platform_set_drvdata(pdev, spe_pmu);
- ret = arm_spe_pmu_irq_probe(spe_pmu); - if (ret) + if (!arm_spe_pmu_set_cap(spe_pmu)) goto out_free_handle;
- ret = arm_spe_pmu_dev_init(spe_pmu); - if (ret) - goto out_free_handle; + spe_pmu_local = spe_pmu;
ret = arm_spe_pmu_perf_init(spe_pmu); if (ret) - goto out_teardown_dev; + goto out_free_handle;
return 0;
-out_teardown_dev: - arm_spe_pmu_dev_teardown(spe_pmu); out_free_handle: free_percpu(spe_pmu->handle); return ret; @@ -1247,7 +1011,6 @@ static int arm_spe_pmu_device_remove(struct platform_device *pdev) struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev);
arm_spe_pmu_perf_destroy(spe_pmu); - arm_spe_pmu_dev_teardown(spe_pmu); free_percpu(spe_pmu->handle); return 0; } @@ -1265,29 +1028,17 @@ static struct platform_driver arm_spe_pmu_driver = {
static int __init arm_spe_pmu_init(void) { - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, - arm_spe_pmu_cpu_startup, - arm_spe_pmu_cpu_teardown); - if (ret < 0) - return ret; - arm_spe_pmu_online = ret; - - ret = platform_driver_register(&arm_spe_pmu_driver); - if (ret) - cpuhp_remove_multi_state(arm_spe_pmu_online); - - return ret; + arm_spe_sampling_for_perf_callback_register(arm_spe_sampling_process); + return platform_driver_register(&arm_spe_pmu_driver); }
static void __exit arm_spe_pmu_exit(void) { + arm_spe_sampling_for_perf_callback_register(NULL); platform_driver_unregister(&arm_spe_pmu_driver); - cpuhp_remove_multi_state(arm_spe_pmu_online); }
-module_init(arm_spe_pmu_init); +late_initcall(arm_spe_pmu_init); module_exit(arm_spe_pmu_exit);
MODULE_DESCRIPTION("Perf driver for the ARMv8.2 Statistical Profiling Extension"); diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h index b27b5e1fd96e..518d262c5b8e 100644 --- a/include/linux/mem_sampling.h +++ b/include/linux/mem_sampling.h @@ -13,6 +13,7 @@ #define __MEM_SAMPLING_H
DECLARE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); +DECLARE_STATIC_KEY_FALSE(mem_sampling_access_hints);
enum mem_sampling_sample_type { MEM_SAMPLING_L1D_ACCESS = 1 << 0, @@ -76,16 +77,24 @@ enum mem_sampling_type_enum { MEM_SAMPLING_UNSUPPORTED };
+enum user_switch_type { + USER_SWITCH_AWAY_FROM_MEM_SAMPLING, + USER_SWITCH_BACK_TO_MEM_SAMPLING, +}; +typedef void (*mem_sampling_user_switch_cb_type)(enum user_switch_type type); + #ifdef CONFIG_ARM_SPE int arm_spe_start(void); void arm_spe_stop(void); void arm_spe_continue(void); int arm_spe_enabled(void); void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb); +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb); #else static inline void arm_spe_stop(void) { }; static inline void arm_spe_continue(void) { }; static inline void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { }; +static inline void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) { };
static inline int arm_spe_start(void) { diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index c7a35d321272..64cef5f97080 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -14,6 +14,8 @@ #include <linux/sysfs.h> #include <asm/cputype.h>
+#include "../../../drivers/arm/spe/spe.h" + #ifdef CONFIG_ARM_PMU
/* @@ -205,6 +207,12 @@ void armpmu_free_irq(int irq, int cpu); #endif /* CONFIG_ARM_PMU */
#define ARMV8_SPE_PDEV_NAME "arm,spe-v1" +#define ARMV8_SPE_PMU_PDEV_NAME "arm,pmu,spe-v1" + #define ARMV8_TRBE_PDEV_NAME "arm,trbe"
+typedef void (*perf_sampling_cb_type)(enum arm_spe_buf_fault_action act); +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb); +struct arm_spe *arm_spe_get_desc(void); +void arm_spe_set_user(enum arm_spe_user_e user); #endif /* __ARM_PMU_H__ */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25d348fa0658..a545bae1f070 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2975,7 +2975,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) * Note that currently PMD-level page migration is not * supported. */ - if (static_branch_unlikely(&sched_numabalancing_mem_sampling)) + if (static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&sched_numabalancing_mem_sampling)) return; #endif /* diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c index f08f782caf75..459ffc6906fa 100644 --- a/mm/mem_sampling.c +++ b/mm/mem_sampling.c @@ -31,7 +31,16 @@ struct mem_sampling_ops_struct mem_sampling_ops; #define NUMA_BALANCING_HW_DISABLED 0x0 #define NUMA_BALANCING_HW_NORMAL 0x1
+enum mem_sampling_saved_state_e { + MEM_SAMPLING_STATE_ENABLE, + MEM_SAMPLING_STATE_DISABLE, + MEM_SAMPLING_STATE_EMPTY, +}; + static int mem_sampling_override __initdata; + +enum mem_sampling_saved_state_e mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + struct mem_sampling_record_cb_list_entry { struct list_head list; mem_sampling_record_cb_type cb; @@ -377,8 +386,14 @@ static int sysctl_mem_sampling_enable(struct ctl_table *table, int write, err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; - if (write) - set_mem_sampling_state(state); + if (write) { + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY) + set_mem_sampling_state(state); + else + mem_sampling_saved_state = state ? MEM_SAMPLING_STATE_ENABLE : + MEM_SAMPLING_STATE_DISABLE; + } + return err; } #endif @@ -417,6 +432,43 @@ static struct ctl_table mem_sampling_dir_table[] = { {} };
+static void mem_sampling_user_switch_process(enum user_switch_type type) +{ + bool state; + + if (type > USER_SWITCH_BACK_TO_MEM_SAMPLING) { + pr_err("user switch type error.\n"); + return; + } + + if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) { + /* save the state only when leaving mem_sampling for the first time */ + if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY) + return; + + if (static_branch_unlikely(&mem_sampling_access_hints)) + mem_sampling_saved_state = MEM_SAMPLING_STATE_ENABLE; + else + mem_sampling_saved_state = MEM_SAMPLING_STATE_DISABLE; + + pr_debug("user switch away from mem_sampling, %s is saved, set to disable.\n", + mem_sampling_saved_state ? "disabled" : "enabled"); + + set_mem_sampling_state(false); + } else { + /* If the state is not backed up, do not restore it */ + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY) + return; + + state = (mem_sampling_saved_state == MEM_SAMPLING_STATE_ENABLE) ? true : false; + set_mem_sampling_state(state); + mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY; + + pr_debug("user switch back to mem_sampling, set to saved %s.\n", + state ? "enable" : "disable"); + } +} + static void __init check_mem_sampling_enable(void) { bool mem_sampling_default = false; @@ -443,6 +495,7 @@ static int __init mem_sampling_init(void) mem_sampling_ops.sampling_continue = arm_spe_continue,
arm_spe_record_capture_callback_register(mem_sampling_process); + arm_spe_user_switch_callback_register(mem_sampling_user_switch_process); break;
default: