From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit 8404b0fbc7fb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported to sample bandwidth, latency, buffer occupation etc.
Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is registered as a PMU in /sys/bus/event_source/devices, so users can select target PMU, and use filter to do further sets.
Filtering options contains: event - select the event. port - select target Root Ports. Information of Root Ports are shown under sysfs. bdf - select requester_id of target EP device. trig_len - set trigger condition for starting event statistics. trig_mode - set trigger mode. 0 means starting to statistic when bigger than trigger condition, and 1 means smaller. thr_len - set threshold for statistics. thr_mode - set threshold mode. 0 means count when bigger than threshold, and 1 means smaller.
Acked-by: Krzysztof WilczyĆski kw@linux.com Reviewed-by: John Garry john.garry@huawei.com Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: Shaokun Zhang zhangshaokun@hisilicon.com Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- MAINTAINERS | 2 + drivers/perf/hisilicon/Kconfig | 9 + drivers/perf/hisilicon/Makefile | 2 + drivers/perf/hisilicon/hisi_pcie_pmu.c | 951 +++++++++++++++++++++++++ include/linux/cpuhotplug.h | 3 + 5 files changed, 967 insertions(+) create mode 100644 drivers/perf/hisilicon/hisi_pcie_pmu.c
diff --git a/MAINTAINERS b/MAINTAINERS index 466b1c599848..9908e5442110 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7976,8 +7976,10 @@ F: Documentation/devicetree/bindings/misc/hisilicon-hikey-usb.yaml
HISILICON PMU DRIVER M: Shaokun Zhang zhangshaokun@hisilicon.com +M: Qi Liu liuqi115@huawei.com S: Supported W: http://www.hisilicon.com +F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst F: Documentation/admin-guide/perf/hisi-pmu.rst F: drivers/perf/hisilicon
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig index c5d1b7019fff..5546218b5598 100644 --- a/drivers/perf/hisilicon/Kconfig +++ b/drivers/perf/hisilicon/Kconfig @@ -5,3 +5,12 @@ config HISI_PMU help Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home Agent performance monitor and DDR Controller performance monitor. + +config HISI_PCIE_PMU + tristate "HiSilicon PCIE PERF PMU" + depends on PCI && ARM64 + help + Provide support for HiSilicon PCIe performance monitoring unit (PMU) + RCiEP devices. + Adds the PCIe PMU into perf events system for monitoring latency, + bandwidth etc. diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 22e384cdfd53..ad0e8110f373 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -4,3 +4,5 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_pa_pmu.o \ hisi_uncore_l3t_pmu.o \ hisi_uncore_lpddrc_pmu.o + +obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c new file mode 100644 index 000000000000..2f18838754ec --- /dev/null +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This driver adds support for PCIe PMU RCiEP device. Related + * perf events are bandwidth, latency etc. + * + * Copyright (C) 2021 HiSilicon Limited + * Author: Qi Liu liuqi115@huawei.com + */ +#include <linux/bitfield.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/perf_event.h> + +#define DRV_NAME "hisi_pcie_pmu" +/* Define registers */ +#define HISI_PCIE_GLOBAL_CTRL 0x00 +#define HISI_PCIE_EVENT_CTRL 0x010 +#define HISI_PCIE_CNT 0x090 +#define HISI_PCIE_EXT_CNT 0x110 +#define HISI_PCIE_INT_STAT 0x150 +#define HISI_PCIE_INT_MASK 0x154 +#define HISI_PCIE_REG_BDF 0xfe0 +#define HISI_PCIE_REG_VERSION 0xfe4 +#define HISI_PCIE_REG_INFO 0xfe8 + +/* Define command in HISI_PCIE_GLOBAL_CTRL */ +#define HISI_PCIE_GLOBAL_EN 0x01 +#define HISI_PCIE_GLOBAL_NONE 0 + +/* Define command in HISI_PCIE_EVENT_CTRL */ +#define HISI_PCIE_EVENT_EN BIT_ULL(20) +#define HISI_PCIE_RESET_CNT BIT_ULL(22) +#define HISI_PCIE_INIT_SET BIT_ULL(34) +#define HISI_PCIE_THR_EN BIT_ULL(26) +#define HISI_PCIE_TARGET_EN BIT_ULL(32) +#define HISI_PCIE_TRIG_EN BIT_ULL(52) + +/* Define offsets in HISI_PCIE_EVENT_CTRL */ +#define HISI_PCIE_EVENT_M GENMASK_ULL(15, 0) +#define HISI_PCIE_THR_MODE_M GENMASK_ULL(27, 27) +#define HISI_PCIE_THR_M GENMASK_ULL(31, 28) +#define HISI_PCIE_TARGET_M GENMASK_ULL(52, 36) +#define HISI_PCIE_TRIG_MODE_M GENMASK_ULL(53, 53) +#define HISI_PCIE_TRIG_M GENMASK_ULL(59, 56) + +#define HISI_PCIE_MAX_COUNTERS 8 +#define HISI_PCIE_REG_STEP 8 +#define HISI_PCIE_THR_MAX_VAL 10 +#define HISI_PCIE_TRIG_MAX_VAL 10 +#define HISI_PCIE_MAX_PERIOD (GENMASK_ULL(63, 0)) +#define HISI_PCIE_INIT_VAL BIT_ULL(63) + +struct hisi_pcie_pmu { + struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS]; + struct hlist_node node; + struct pci_dev *pdev; + struct pmu pmu; + void __iomem *base; + int irq; + u32 identifier; + /* Minimum and maximum BDF of root ports monitored by PMU */ + u16 bdf_min; + u16 bdf_max; + int on_cpu; +}; + +struct hisi_pcie_reg_pair { + u16 lo; + u16 hi; +}; + +#define to_pcie_pmu(p) (container_of((p), struct hisi_pcie_pmu, pmu)) +#define GET_PCI_DEVFN(bdf) ((bdf) & 0xff) + +#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo) \ + static u64 hisi_pcie_get_##_name(struct perf_event *event) \ + { \ + return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \ + } \ + +HISI_PCIE_PMU_FILTER_ATTR(event, config, 16, 0); +HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0); +HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4); +HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5); +HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9); +HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0); +HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16); + +static ssize_t hisi_pcie_format_sysfs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *eattr; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + + return sysfs_emit(buf, "%s\n", (char *)eattr->var); +} + +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + return sysfs_emit(buf, "config=0x%llx\n", pmu_attr->id); +} + +#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format) \ + (&((struct dev_ext_attribute[]){ \ + { .attr = __ATTR(_name, 0444, hisi_pcie_format_sysfs_show, \ + NULL), \ + .var = (void *)_format } \ + })[0].attr.attr) + +#define HISI_PCIE_PMU_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, hisi_pcie_event_sysfs_show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static ssize_t identifier_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#x\n", pcie_pmu->identifier); +} +static DEVICE_ATTR_RO(identifier); + +static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#04x\n", PCI_BUS_NUM(pcie_pmu->bdf_min)); +} +static DEVICE_ATTR_RO(bus); + +static struct hisi_pcie_reg_pair +hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off) +{ + u32 val = readl_relaxed(pcie_pmu->base + reg_off); + struct hisi_pcie_reg_pair regs = { + .lo = val, + .hi = val >> 16, + }; + + return regs; +} + +/* + * Hardware counter and ext_counter work together for bandwidth, latency, bus + * utilization and buffer occupancy events. For example, RX memory write latency + * events(index = 0x0010), counter counts total delay cycles and ext_counter + * counts RX memory write PCIe packets number. + * + * As we don't want PMU driver to process these two data, "delay cycles" can + * be treated as an independent event(index = 0x0010), "RX memory write packets + * number" as another(index = 0x10010). BIT 16 is used to distinguish and 0-15 + * bits are "real" event index, which can be used to set HISI_PCIE_EVENT_CTRL. + */ +#define EXT_COUNTER_IS_USED(idx) ((idx) & BIT(16)) + +static u32 hisi_pcie_get_real_event(struct perf_event *event) +{ + return hisi_pcie_get_event(event) & GENMASK(15, 0); +} + +static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx) +{ + return offset + HISI_PCIE_REG_STEP * idx; +} + +static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, + u32 idx) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + return readl_relaxed(pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u32 val) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + writel_relaxed(val, pcie_pmu->base + offset); +} + +static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + return readq_relaxed(pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u64 val) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + writeq_relaxed(val, pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_config_filter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 reg = HISI_PCIE_INIT_SET; + u64 port, trig_len, thr_len; + + /* Config HISI_PCIE_EVENT_CTRL according to event. */ + reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event)); + + /* Config HISI_PCIE_EVENT_CTRL according to root port or EP device. */ + port = hisi_pcie_get_port(event); + if (port) + reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port); + else + reg |= HISI_PCIE_TARGET_EN | + FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event)); + + /* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */ + trig_len = hisi_pcie_get_trig_len(event); + if (trig_len) { + reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len); + reg |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M, hisi_pcie_get_trig_mode(event)); + reg |= HISI_PCIE_TRIG_EN; + } + + /* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */ + thr_len = hisi_pcie_get_thr_len(event); + if (thr_len) { + reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len); + reg |= FIELD_PREP(HISI_PCIE_THR_MODE_M, hisi_pcie_get_thr_mode(event)); + reg |= HISI_PCIE_THR_EN; + } + + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg); +} + +static void hisi_pcie_pmu_clear_filter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, HISI_PCIE_INIT_SET); +} + +static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu, u32 bdf) +{ + struct pci_dev *root_port, *pdev; + u16 rp_bdf; + + pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus), PCI_BUS_NUM(bdf), + GET_PCI_DEVFN(bdf)); + if (!pdev) + return false; + + root_port = pcie_find_root_port(pdev); + if (!root_port) { + pci_dev_put(pdev); + return false; + } + + pci_dev_put(pdev); + rp_bdf = pci_dev_id(root_port); + return rp_bdf >= pcie_pmu->bdf_min && rp_bdf <= pcie_pmu->bdf_max; +} + +static bool hisi_pcie_pmu_valid_filter(struct perf_event *event, + struct hisi_pcie_pmu *pcie_pmu) +{ + u32 requester_id = hisi_pcie_get_bdf(event); + + if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL) + return false; + + if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL) + return false; + + if (requester_id) { + if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id)) + return false; + } + + return true; +} + +static bool hisi_pcie_pmu_cmp_event(struct perf_event *target, + struct perf_event *event) +{ + return hisi_pcie_get_real_event(target) == hisi_pcie_get_real_event(event); +} + +static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct perf_event *event_group[HISI_PCIE_MAX_COUNTERS]; + int counters = 1; + int num; + + event_group[0] = leader; + if (!is_software_event(leader)) { + if (leader->pmu != event->pmu) + return false; + + if (leader != event && !hisi_pcie_pmu_cmp_event(leader, event)) + event_group[counters++] = event; + } + + for_each_sibling_event(sibling, event->group_leader) { + if (is_software_event(sibling)) + continue; + + if (sibling->pmu != event->pmu) + return false; + + for (num = 0; num < counters; num++) { + if (hisi_pcie_pmu_cmp_event(event_group[num], sibling)) + break; + } + + if (num == counters) + event_group[counters++] = sibling; + } + + return counters <= HISI_PCIE_MAX_COUNTERS; +} + +static int hisi_pcie_pmu_event_init(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + event->cpu = pcie_pmu->on_cpu; + + if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event))) + hwc->event_base = HISI_PCIE_EXT_CNT; + else + hwc->event_base = HISI_PCIE_CNT; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling is not supported. */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EOPNOTSUPP; + + if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) + return -EINVAL; + + if (!hisi_pcie_pmu_validate_event_group(event)) + return -EINVAL; + + return 0; +} + +static u64 hisi_pcie_pmu_read_counter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + u32 idx = event->hw.idx; + + return hisi_pcie_pmu_readq(pcie_pmu, event->hw.event_base, idx); +} + +static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu, + struct perf_event *event) +{ + struct perf_event *sibling; + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + sibling = pcie_pmu->hw_events[idx]; + if (!sibling) + continue; + + if (!hisi_pcie_pmu_cmp_event(sibling, event)) + continue; + + /* Related events must be used in group */ + if (sibling->group_leader == event->group_leader) + return idx; + else + return -EINVAL; + } + + return idx; +} + +static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu) +{ + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + if (!pcie_pmu->hw_events[idx]) + return idx; + } + + return -EINVAL; +} + +static void hisi_pcie_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 new_cnt, prev_cnt, delta; + + do { + prev_cnt = local64_read(&hwc->prev_count); + new_cnt = hisi_pcie_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_cnt, + new_cnt) != prev_cnt); + + delta = (new_cnt - prev_cnt) & HISI_PCIE_MAX_PERIOD; + local64_add(delta, &event->count); +} + +static void hisi_pcie_pmu_read(struct perf_event *event) +{ + hisi_pcie_pmu_event_update(event); +} + +static void hisi_pcie_pmu_set_period(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL); +} + +static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + u64 val; + + val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx); + val |= HISI_PCIE_EVENT_EN; + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val); +} + +static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + u64 val; + + val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx); + val &= ~HISI_PCIE_EVENT_EN; + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val); +} + +static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 0); +} + +static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 1); +} + +static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu, int idx) +{ + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_RESET_CNT); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_INIT_SET); +} + +static void hisi_pcie_pmu_start(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u64 prev_cnt; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + hisi_pcie_pmu_config_filter(event); + hisi_pcie_pmu_enable_counter(pcie_pmu, hwc); + hisi_pcie_pmu_enable_int(pcie_pmu, hwc); + hisi_pcie_pmu_set_period(event); + + if (flags & PERF_EF_RELOAD) { + prev_cnt = local64_read(&hwc->prev_count); + hisi_pcie_pmu_writeq(pcie_pmu, hwc->event_base, idx, prev_cnt); + } + + perf_event_update_userpage(event); +} + +static void hisi_pcie_pmu_stop(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_event_update(event); + hisi_pcie_pmu_disable_int(pcie_pmu, hwc); + hisi_pcie_pmu_disable_counter(pcie_pmu, hwc); + hisi_pcie_pmu_clear_filter(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + hwc->state |= PERF_HES_UPTODATE; +} + +static int hisi_pcie_pmu_add(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + /* Check all working events to find a related event. */ + idx = hisi_pcie_pmu_find_related_event(pcie_pmu, event); + if (idx < 0) + return idx; + + /* Current event shares an enabled counter with the related event */ + if (idx < HISI_PCIE_MAX_COUNTERS) { + hwc->idx = idx; + goto start_count; + } + + idx = hisi_pcie_pmu_get_event_idx(pcie_pmu); + if (idx < 0) + return idx; + + hwc->idx = idx; + pcie_pmu->hw_events[idx] = event; + /* Reset Counter to avoid previous statistic interference. */ + hisi_pcie_pmu_reset_counter(pcie_pmu, idx); + +start_count: + if (flags & PERF_EF_START) + hisi_pcie_pmu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void hisi_pcie_pmu_del(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_stop(event, PERF_EF_UPDATE); + pcie_pmu->hw_events[hwc->idx] = NULL; + perf_event_update_userpage(event); +} + +static void hisi_pcie_pmu_enable(struct pmu *pmu) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu); + int num; + + for (num = 0; num < HISI_PCIE_MAX_COUNTERS; num++) { + if (pcie_pmu->hw_events[num]) + break; + } + + if (num == HISI_PCIE_MAX_COUNTERS) + return; + + writel(HISI_PCIE_GLOBAL_EN, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL); +} + +static void hisi_pcie_pmu_disable(struct pmu *pmu) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu); + + writel(HISI_PCIE_GLOBAL_NONE, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL); +} + +static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data) +{ + struct hisi_pcie_pmu *pcie_pmu = data; + irqreturn_t ret = IRQ_NONE; + struct perf_event *event; + u32 overflown; + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + overflown = hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, idx); + if (!overflown) + continue; + + /* Clear status of interrupt. */ + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, idx, 1); + event = pcie_pmu->hw_events[idx]; + if (!event) + continue; + + hisi_pcie_pmu_event_update(event); + hisi_pcie_pmu_set_period(event); + ret = IRQ_HANDLED; + } + + return ret; +} + +static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + int irq, ret; + + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); + if (ret < 0) { + pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret); + return ret; + } + + irq = pci_irq_vector(pdev, 0); + ret = request_irq(irq, hisi_pcie_pmu_irq, IRQF_NOBALANCING | IRQF_NO_THREAD, DRV_NAME, + pcie_pmu); + if (ret) { + pci_err(pdev, "Failed to register IRQ: %d\n", ret); + pci_free_irq_vectors(pdev); + return ret; + } + + pcie_pmu->irq = irq; + + return 0; +} + +static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + free_irq(pcie_pmu->irq, pcie_pmu); + pci_free_irq_vectors(pdev); +} + +static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node); + + if (pcie_pmu->on_cpu == -1) { + pcie_pmu->on_cpu = cpu; + WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu))); + } + + return 0; +} + +static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node); + unsigned int target; + + /* Nothing to do if this CPU doesn't own the PMU */ + if (pcie_pmu->on_cpu != cpu) + return 0; + + pcie_pmu->on_cpu = -1; + /* Choose a new CPU from all online cpus. */ + target = cpumask_first(cpu_online_mask); + if (target >= nr_cpu_ids) { + pci_err(pcie_pmu->pdev, "There is no CPU to set\n"); + return 0; + } + + perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target); + /* Use this CPU for event counting */ + pcie_pmu->on_cpu = target; + WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target))); + + return 0; +} + +static struct attribute *hisi_pcie_pmu_events_attr[] = { + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_latency, 0x0010), + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_cnt, 0x10010), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_latency, 0x0210), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x1005), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x11005), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x2004), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x12004), + NULL +}; + +static struct attribute_group hisi_pcie_pmu_events_group = { + .name = "events", + .attrs = hisi_pcie_pmu_events_attr, +}; + +static struct attribute *hisi_pcie_pmu_format_attr[] = { + HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-16"), + HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"), + HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"), + HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"), + HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"), + HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"), + HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"), + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_format_group = { + .name = "format", + .attrs = hisi_pcie_pmu_format_attr, +}; + +static struct attribute *hisi_pcie_pmu_bus_attrs[] = { + &dev_attr_bus.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_bus_attr_group = { + .attrs = hisi_pcie_pmu_bus_attrs, +}; + +static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_cpumask_attr_group = { + .attrs = hisi_pcie_pmu_cpumask_attrs, +}; + +static struct attribute *hisi_pcie_pmu_identifier_attrs[] = { + &dev_attr_identifier.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_identifier_attr_group = { + .attrs = hisi_pcie_pmu_identifier_attrs, +}; + +static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = { + &hisi_pcie_pmu_events_group, + &hisi_pcie_pmu_format_group, + &hisi_pcie_pmu_bus_attr_group, + &hisi_pcie_pmu_cpumask_attr_group, + &hisi_pcie_pmu_identifier_attr_group, + NULL +}; + +static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + struct hisi_pcie_reg_pair regs; + u16 sicl_id, core_id; + char *name; + + regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF); + pcie_pmu->bdf_min = regs.lo; + pcie_pmu->bdf_max = regs.hi; + + regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO); + sicl_id = regs.hi; + core_id = regs.lo; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_core%u", sicl_id, core_id); + if (!name) + return -ENOMEM; + + pcie_pmu->pdev = pdev; + pcie_pmu->on_cpu = -1; + pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION); + pcie_pmu->pmu = (struct pmu) { + .name = name, + .module = THIS_MODULE, + .event_init = hisi_pcie_pmu_event_init, + .pmu_enable = hisi_pcie_pmu_enable, + .pmu_disable = hisi_pcie_pmu_disable, + .add = hisi_pcie_pmu_add, + .del = hisi_pcie_pmu_del, + .start = hisi_pcie_pmu_start, + .stop = hisi_pcie_pmu_stop, + .read = hisi_pcie_pmu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = hisi_pcie_pmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }; + + return 0; +} + +static int hisi_pcie_init_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + int ret; + + pcie_pmu->base = pci_ioremap_bar(pdev, 2); + if (!pcie_pmu->base) { + pci_err(pdev, "Ioremap failed for pcie_pmu resource\n"); + return -ENOMEM; + } + + ret = hisi_pcie_alloc_pmu(pdev, pcie_pmu); + if (ret) + goto err_iounmap; + + ret = hisi_pcie_pmu_irq_register(pdev, pcie_pmu); + if (ret) + goto err_iounmap; + + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + if (ret) { + pci_err(pdev, "Failed to register hotplug: %d\n", ret); + goto err_irq_unregister; + } + + ret = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1); + if (ret) { + pci_err(pdev, "Failed to register PCIe PMU: %d\n", ret); + goto err_hotplug_unregister; + } + + return ret; + +err_hotplug_unregister: + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + +err_irq_unregister: + hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu); + +err_iounmap: + iounmap(pcie_pmu->base); + + return ret; +} + +static void hisi_pcie_uninit_pmu(struct pci_dev *pdev) +{ + struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev); + + perf_pmu_unregister(&pcie_pmu->pmu); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu); + iounmap(pcie_pmu->base); +} + +static int hisi_pcie_init_dev(struct pci_dev *pdev) +{ + int ret; + + ret = pcim_enable_device(pdev); + if (ret) { + pci_err(pdev, "Failed to enable PCI device: %d\n", ret); + return ret; + } + + ret = pcim_iomap_regions(pdev, BIT(2), DRV_NAME); + if (ret < 0) { + pci_err(pdev, "Failed to request PCI mem regions: %d\n", ret); + return ret; + } + + pci_set_master(pdev); + + return 0; +} + +static int hisi_pcie_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct hisi_pcie_pmu *pcie_pmu; + int ret; + + pcie_pmu = devm_kzalloc(&pdev->dev, sizeof(*pcie_pmu), GFP_KERNEL); + if (!pcie_pmu) + return -ENOMEM; + + ret = hisi_pcie_init_dev(pdev); + if (ret) + return ret; + + ret = hisi_pcie_init_pmu(pdev, pcie_pmu); + if (ret) + return ret; + + pci_set_drvdata(pdev, pcie_pmu); + + return ret; +} + +static void hisi_pcie_pmu_remove(struct pci_dev *pdev) +{ + hisi_pcie_uninit_pmu(pdev); + pci_set_drvdata(pdev, NULL); +} + +static const struct pci_device_id hisi_pcie_pmu_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids); + +static struct pci_driver hisi_pcie_pmu_driver = { + .name = DRV_NAME, + .id_table = hisi_pcie_pmu_ids, + .probe = hisi_pcie_pmu_probe, + .remove = hisi_pcie_pmu_remove, +}; + +static int __init hisi_pcie_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, + "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE", + hisi_pcie_pmu_online_cpu, + hisi_pcie_pmu_offline_cpu); + if (ret) { + pr_err("Failed to setup PCIe PMU hotplug: %d\n", ret); + return ret; + } + + ret = pci_register_driver(&hisi_pcie_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE); + + return ret; +} +module_init(hisi_pcie_module_init); + +static void __exit hisi_pcie_module_exit(void) +{ + pci_unregister_driver(&hisi_pcie_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE); +} +module_exit(hisi_pcie_module_exit); + +MODULE_DESCRIPTION("HiSilicon PCIe PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Qi Liu liuqi115@huawei.com"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b98b9eb7d5f8..24e7be132046 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -178,6 +178,9 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, + #ifndef __GENKSYMS__ + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, + #endif CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.19-rc1 commit 807907dae970 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
If a PMU is in a SICL (Super IO cluster), it is not appropriate to associate this PMU with a CPU die. So we associate it with all CPUs online, rather than CPUs in the nearest SCCL.
As the firmware of Hip09 platform hasn't been published yet, change of PMU driver will not influence backwards compatibility between driver and firmware.
Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/20220415102352.6665-2-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 18 +++++++----------- drivers/perf/hisilicon/hisi_uncore_pmu.c | 4 ++++ drivers/perf/hisilicon/hisi_uncore_pmu.h | 1 + 3 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 390e59f4ef60..f1e6b5cee075 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -258,13 +258,12 @@ static int hisi_pa_pmu_init_data(struct platform_device *pdev, struct hisi_pmu *pa_pmu) { /* - * Use the SCCL_ID and the index ID to identify the PA PMU, - * while SCCL_ID is the nearst SCCL_ID from this SICL and - * CPU core is chosen from this SCCL to manage this PMU. + * As PA PMU is in a SICL, use the SICL_ID and the index ID + * to identify the PA PMU. */ if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", - &pa_pmu->sccl_id)) { - dev_err(&pdev->dev, "Cannot read sccl-id!\n"); + &pa_pmu->sicl_id)) { + dev_err(&pdev->dev, "Cannot read sicl-id!\n"); return -EINVAL; }
@@ -275,6 +274,7 @@ static int hisi_pa_pmu_init_data(struct platform_device *pdev, }
pa_pmu->ccl_id = -1; + pa_pmu->sccl_id = -1;
pa_pmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pa_pmu->base)) { @@ -399,13 +399,9 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev) ret = hisi_pa_pmu_dev_probe(pdev, pa_pmu); if (ret) return ret; - /* - * PA is attached in SICL and the CPU core is chosen to manage this - * PMU which is the nearest SCCL, while its SCCL_ID is greater than - * one with the SICL_ID. - */ + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sicl%u_pa%u", - pa_pmu->sccl_id - 1, pa_pmu->index_id); + pa_pmu->sicl_id, pa_pmu->index_id); if (!name) return -ENOMEM;
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 868fec1f933d..66ed28c2c544 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -465,6 +465,10 @@ static bool hisi_pmu_cpu_is_associated_pmu(struct hisi_pmu *hisi_pmu) { int sccl_id, ccl_id;
+ /* If SCCL_ID is -1, the PMU is in a SICL and has no CPU affinity */ + if (hisi_pmu->sccl_id == -1) + return true; + if (hisi_pmu->ccl_id == -1) { /* If CCL_ID is -1, the PMU only shares the same SCCL */ hisi_read_sccl_and_ccl_id(&sccl_id, NULL); diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index ea9d89bbc1ea..0a90c2881854 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -81,6 +81,7 @@ struct hisi_pmu { struct device *dev; struct hlist_node node; int sccl_id; + int sicl_id; int ccl_id; void __iomem *base; /* the ID of the PMU modules */
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.19-rc1 commit 6b79738b6ed9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
On HiSilicon Hip09 platform, there is a CPA (Coherency Protocol Agent) on each SICL (Super IO Cluster) which implements packet format translation, route parsing and traffic statistics.
CPA PMU has 8 PMU counters and interrupt is supported to handle counter overflow. Let's support its driver under the framework of HiSilicon PMU driver.
Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Shaokun Zhang zhangshaokun@hisilicon.com Link: https://lore.kernel.org/r/20220415102352.6665-3-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 409 +++++++++++++++++++ include/linux/cpuhotplug.h | 3 + 3 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index ad0e8110f373..a3522abb3975 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ - hisi_uncore_pa_pmu.o \ + hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o \ hisi_uncore_l3t_pmu.o \ hisi_uncore_lpddrc_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c new file mode 100644 index 000000000000..a9bb73f76be4 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon SoC CPA(Coherency Protocol Agent) hardware event counters support + * + * Copyright (C) 2022 HiSilicon Limited + * Author: Qi Liu liuqi115@huawei.com + * + * This code is based on the uncore PMUs like arm-cci and arm-ccn. + */ + +#define pr_fmt(fmt) "cpa pmu: " fmt +#include <linux/acpi.h> +#include <linux/bug.h> +#include <linux/cpuhotplug.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/list.h> +#include <linux/smp.h> + +#include "hisi_uncore_pmu.h" + +/* CPA register definition */ +#define CPA_PERF_CTRL 0x1c00 +#define CPA_EVENT_CTRL 0x1c04 +#define CPA_INT_MASK 0x1c70 +#define CPA_INT_STATUS 0x1c78 +#define CPA_INT_CLEAR 0x1c7c +#define CPA_EVENT_TYPE0 0x1c80 +#define CPA_VERSION 0x1cf0 +#define CPA_CNT0_LOWER 0x1d00 +#define CPA_CFG_REG 0x0534 + +/* CPA operation command */ +#define CPA_PERF_CTRL_EN BIT_ULL(0) +#define CPA_EVTYPE_MASK 0xffUL +#define CPA_PM_CTRL BIT_ULL(9) + +/* CPA has 8-counters */ +#define CPA_NR_COUNTERS 0x8 +#define CPA_COUNTER_BITS 64 +#define CPA_NR_EVENTS 0xff +#define CPA_REG_OFFSET 0x8 + +static u32 hisi_cpa_pmu_get_counter_offset(int idx) +{ + return (CPA_CNT0_LOWER + idx * CPA_REG_OFFSET); +} + +static u64 hisi_cpa_pmu_read_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + return readq(cpa_pmu->base + hisi_cpa_pmu_get_counter_offset(hwc->idx)); +} + +static void hisi_cpa_pmu_write_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc, u64 val) +{ + writeq(val, cpa_pmu->base + hisi_cpa_pmu_get_counter_offset(hwc->idx)); +} + +static void hisi_cpa_pmu_write_evtype(struct hisi_pmu *cpa_pmu, int idx, + u32 type) +{ + u32 reg, reg_idx, shift, val; + + /* + * Select the appropriate event select register(CPA_EVENT_TYPE0/1). + * There are 2 event select registers for the 8 hardware counters. + * Event code is 8-bits and for the former 4 hardware counters, + * CPA_EVENT_TYPE0 is chosen. For the latter 4 hardware counters, + * CPA_EVENT_TYPE1 is chosen. + */ + reg = CPA_EVENT_TYPE0 + (idx / 4) * 4; + reg_idx = idx % 4; + shift = CPA_REG_OFFSET * reg_idx; + + /* Write event code to CPA_EVENT_TYPEx Register */ + val = readl(cpa_pmu->base + reg); + val &= ~(CPA_EVTYPE_MASK << shift); + val |= type << shift; + writel(val, cpa_pmu->base + reg); +} + +static void hisi_cpa_pmu_start_counters(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_PERF_CTRL); + val |= CPA_PERF_CTRL_EN; + writel(val, cpa_pmu->base + CPA_PERF_CTRL); +} + +static void hisi_cpa_pmu_stop_counters(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_PERF_CTRL); + val &= ~(CPA_PERF_CTRL_EN); + writel(val, cpa_pmu->base + CPA_PERF_CTRL); +} + +static void hisi_cpa_pmu_disable_pm(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_CFG_REG); + val |= CPA_PM_CTRL; + writel(val, cpa_pmu->base + CPA_CFG_REG); +} + +static void hisi_cpa_pmu_enable_pm(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_CFG_REG); + val &= ~(CPA_PM_CTRL); + writel(val, cpa_pmu->base + CPA_CFG_REG); +} + +static void hisi_cpa_pmu_enable_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Enable counter index in CPA_EVENT_CTRL register */ + val = readl(cpa_pmu->base + CPA_EVENT_CTRL); + val |= 1 << hwc->idx; + writel(val, cpa_pmu->base + CPA_EVENT_CTRL); +} + +static void hisi_cpa_pmu_disable_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Clear counter index in CPA_EVENT_CTRL register */ + val = readl(cpa_pmu->base + CPA_EVENT_CTRL); + val &= ~(1UL << hwc->idx); + writel(val, cpa_pmu->base + CPA_EVENT_CTRL); +} + +static void hisi_cpa_pmu_enable_counter_int(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 0 to enable interrupt */ + val = readl(cpa_pmu->base + CPA_INT_MASK); + val &= ~(1UL << hwc->idx); + writel(val, cpa_pmu->base + CPA_INT_MASK); +} + +static void hisi_cpa_pmu_disable_counter_int(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 1 to mask interrupt */ + val = readl(cpa_pmu->base + CPA_INT_MASK); + val |= 1 << hwc->idx; + writel(val, cpa_pmu->base + CPA_INT_MASK); +} + +static u32 hisi_cpa_pmu_get_int_status(struct hisi_pmu *cpa_pmu) +{ + return readl(cpa_pmu->base + CPA_INT_STATUS); +} + +static void hisi_cpa_pmu_clear_int_status(struct hisi_pmu *cpa_pmu, int idx) +{ + writel(1 << idx, cpa_pmu->base + CPA_INT_CLEAR); +} + +static const struct acpi_device_id hisi_cpa_pmu_acpi_match[] = { + { "HISI0281", }, + {} +}; +MODULE_DEVICE_TABLE(acpi, hisi_cpa_pmu_acpi_match); + +static int hisi_cpa_pmu_init_data(struct platform_device *pdev, + struct hisi_pmu *cpa_pmu) +{ + if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", + &cpa_pmu->sicl_id)) { + dev_err(&pdev->dev, "Can not read sicl-id\n"); + return -EINVAL; + } + + if (device_property_read_u32(&pdev->dev, "hisilicon,idx-id", + &cpa_pmu->index_id)) { + dev_err(&pdev->dev, "Cannot read idx-id\n"); + return -EINVAL; + } + + cpa_pmu->ccl_id = -1; + cpa_pmu->sccl_id = -1; + cpa_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cpa_pmu->base)) + return PTR_ERR(cpa_pmu->base); + + cpa_pmu->identifier = readl(cpa_pmu->base + CPA_VERSION); + + return 0; +} + +static struct attribute *hisi_cpa_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-15"), + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_format_group = { + .name = "format", + .attrs = hisi_cpa_pmu_format_attr, +}; + +static struct attribute *hisi_cpa_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(cpa_cycles, 0x00), + HISI_PMU_EVENT_ATTR(cpa_p1_wr_dat, 0x61), + HISI_PMU_EVENT_ATTR(cpa_p1_rd_dat, 0x62), + HISI_PMU_EVENT_ATTR(cpa_p0_wr_dat, 0xE1), + HISI_PMU_EVENT_ATTR(cpa_p0_rd_dat, 0xE2), + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_events_group = { + .name = "events", + .attrs = hisi_cpa_pmu_events_attr, +}; + +static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL); + +static struct attribute *hisi_cpa_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_cpumask_attr_group = { + .attrs = hisi_cpa_pmu_cpumask_attrs, +}; + +static struct device_attribute hisi_cpa_pmu_identifier_attr = + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL); + +static struct attribute *hisi_cpa_pmu_identifier_attrs[] = { + &hisi_cpa_pmu_identifier_attr.attr, + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_identifier_group = { + .attrs = hisi_cpa_pmu_identifier_attrs, +}; + +static const struct attribute_group *hisi_cpa_pmu_attr_groups[] = { + &hisi_cpa_pmu_format_group, + &hisi_cpa_pmu_events_group, + &hisi_cpa_pmu_cpumask_attr_group, + &hisi_cpa_pmu_identifier_group, + NULL +}; + +static const struct hisi_uncore_ops hisi_uncore_cpa_pmu_ops = { + .write_evtype = hisi_cpa_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_cpa_pmu_start_counters, + .stop_counters = hisi_cpa_pmu_stop_counters, + .enable_counter = hisi_cpa_pmu_enable_counter, + .disable_counter = hisi_cpa_pmu_disable_counter, + .enable_counter_int = hisi_cpa_pmu_enable_counter_int, + .disable_counter_int = hisi_cpa_pmu_disable_counter_int, + .write_counter = hisi_cpa_pmu_write_counter, + .read_counter = hisi_cpa_pmu_read_counter, + .get_int_status = hisi_cpa_pmu_get_int_status, + .clear_int_status = hisi_cpa_pmu_clear_int_status, +}; + +static int hisi_cpa_pmu_dev_probe(struct platform_device *pdev, + struct hisi_pmu *cpa_pmu) +{ + int ret; + + ret = hisi_cpa_pmu_init_data(pdev, cpa_pmu); + if (ret) + return ret; + + ret = hisi_uncore_pmu_init_irq(cpa_pmu, pdev); + if (ret) + return ret; + + cpa_pmu->counter_bits = CPA_COUNTER_BITS; + cpa_pmu->check_event = CPA_NR_EVENTS; + cpa_pmu->pmu_events.attr_groups = hisi_cpa_pmu_attr_groups; + cpa_pmu->ops = &hisi_uncore_cpa_pmu_ops; + cpa_pmu->num_counters = CPA_NR_COUNTERS; + cpa_pmu->dev = &pdev->dev; + cpa_pmu->on_cpu = -1; + + return 0; +} + +static int hisi_cpa_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *cpa_pmu; + char *name; + int ret; + + cpa_pmu = devm_kzalloc(&pdev->dev, sizeof(*cpa_pmu), GFP_KERNEL); + if (!cpa_pmu) + return -ENOMEM; + + ret = hisi_cpa_pmu_dev_probe(pdev, cpa_pmu); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sicl%d_cpa%u", + cpa_pmu->sicl_id, cpa_pmu->index_id); + if (!name) + return -ENOMEM; + + cpa_pmu->pmu = (struct pmu) { + .name = name, + .module = THIS_MODULE, + .task_ctx_nr = perf_invalid_context, + .event_init = hisi_uncore_pmu_event_init, + .pmu_enable = hisi_uncore_pmu_enable, + .pmu_disable = hisi_uncore_pmu_disable, + .add = hisi_uncore_pmu_add, + .del = hisi_uncore_pmu_del, + .start = hisi_uncore_pmu_start, + .stop = hisi_uncore_pmu_stop, + .read = hisi_uncore_pmu_read, + .attr_groups = cpa_pmu->pmu_events.attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }; + + /* Power Management should be disabled before using CPA PMU. */ + hisi_cpa_pmu_disable_pm(cpa_pmu); + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + &cpa_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return ret; + } + + ret = perf_pmu_register(&cpa_pmu->pmu, name, -1); + if (ret) { + dev_err(cpa_pmu->dev, "PMU register failed\n"); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, &cpa_pmu->node); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return ret; + } + + platform_set_drvdata(pdev, cpa_pmu); + return ret; +} + +static int hisi_cpa_pmu_remove(struct platform_device *pdev) +{ + struct hisi_pmu *cpa_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&cpa_pmu->pmu); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + &cpa_pmu->node); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return 0; +} + +static struct platform_driver hisi_cpa_pmu_driver = { + .driver = { + .name = "hisi_cpa_pmu", + .acpi_match_table = ACPI_PTR(hisi_cpa_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = hisi_cpa_pmu_probe, + .remove = hisi_cpa_pmu_remove, +}; + +static int __init hisi_cpa_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + "AP_PERF_ARM_HISI_CPA_ONLINE", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret) { + pr_err("setup hotplug failed: %d\n", ret); + return ret; + } + + ret = platform_driver_register(&hisi_cpa_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE); + + return ret; +} +module_init(hisi_cpa_pmu_module_init); + +static void __exit hisi_cpa_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_cpa_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE); +} +module_exit(hisi_cpa_pmu_module_exit); + +MODULE_DESCRIPTION("HiSilicon SoC CPA PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Qi Liu liuqi115@huawei.com"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 24e7be132046..5571bfc2ec6e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -173,6 +173,9 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + #ifndef __GENKSYMS__ + CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + #endif CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
From: Junhao He hejunhao3@huawei.com
driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5EZY2
------------------------------------------------------------------
In FIFO mode, when the state of sink buffer is full, the sink device will continuously backpressures the ETM, so that the ETM cannot switch to the idle state. In this case, the WFx instruction cannot be executed because the CPU detects that the ETM is not in the idle state which that will cause CPU hung. We workaround this issue on HiSilicon ETM by setting bit 13 of TRCAUXCTLR which is used to indicate that the ETM is in the idle state.
The call trace is shown below: rcu: INFO: rcu_sched detected stalls on CPUs/tasks: rcu: 10-...0: (1 ticks this GP) idle=5b6/1/0x4000000000000000 softirq=12309/12318 fqs=114196 (detected by 67, t=330041 jiffies, g=309253, q=453663) Task dump for CPU 10: task:ksoftirqd/10 state:R running task stack: 0 pid: 64 ppid: 2 flags:0x0000000a Call trace: __switch_to+0xbc/0xfc irqtime_account_irq+0x58/0xc4 __do_softirq+0x6c/0x358 run_ksoftirqd+0x68/0x90 smpboot_thread_fn+0x15c/0x1a0 kthread+0x108/0x13c ret_from_fork+0x10/0x18 watchdog: BUG: soft lockup - CPU#35 stuck for 22s! [bash:133345] ... Call trace: smp_call_function_single+0x178/0x190 etm4_disable_sysfs+0x74/0xfc [coresight_etm4x] etm4_disable+0x6c/0x70 [coresight_etm4x] coresight_disable_source+0x7c/0xa0 [coresight] coresight_disable+0x6c/0x13c [coresight] enable_source_store+0x88/0xa0 [coresight] dev_attr_store+0x20/0x34 sysfs_kf_write+0x4c/0x5c kernfs_fop_write_iter+0x130/0x1c0 new_sync_write+0xec/0x18c vfs_write+0x214/0x2ac ksys_write+0x70/0xfc __arm64_sys_write+0x24/0x30 el0_svc_common.constprop.0+0x7c/0x1bc do_el0_svc+0x2c/0x94 el0_svc+0x20/0x30 el0_sync_handler+0xb0/0xb4 el0_sync+0x160/0x180
Signed-off-by: Qi Liu liuqi115@huawei.com Signed-off-by: Junhao He hejunhao3@huawei.com Reviewed-by: Jay Fang f.fangjian@huawei.com Acked-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../coresight/coresight-etm4x-core.c | 37 +++++++++++++++---- drivers/hwtracing/coresight/coresight-etm4x.h | 1 + 2 files changed, 30 insertions(+), 8 deletions(-)
diff --git a/drivers/hwtracing/coresight/coresight-etm4x-core.c b/drivers/hwtracing/coresight/coresight-etm4x-core.c index d4d9c8bb88ca..881da29cbd5a 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x-core.c +++ b/drivers/hwtracing/coresight/coresight-etm4x-core.c @@ -116,8 +116,10 @@ struct etm4_enable_arg { #define HISI_HIP08_CORE_COMMIT_LVL_1 0b01 #define HISI_HIP08_CORE_COMMIT_REG sys_reg(3, 1, 15, 2, 5)
+#define HISI_HIP08_AUXCTRL_CHICKEN_BIT BIT(13) + struct etm4_arch_features { - void (*arch_callback)(bool enable); + void (*arch_callback)(void *info); };
static bool etm4_hisi_match_pid(unsigned int id) @@ -125,8 +127,9 @@ static bool etm4_hisi_match_pid(unsigned int id) return (id & ETM4_AMBA_MASK) == HISI_HIP08_AMBA_ID; }
-static void etm4_hisi_config_core_commit(bool enable) +static void etm4_hisi_config_core_commit(void *info) { + bool enable = *(bool *)info; u8 commit = enable ? HISI_HIP08_CORE_COMMIT_LVL_1 : HISI_HIP08_CORE_COMMIT_FULL; u64 val; @@ -143,48 +146,67 @@ static void etm4_hisi_config_core_commit(bool enable) write_sysreg_s(val, HISI_HIP08_CORE_COMMIT_REG); }
+static void etm4_hisi_config_auxctrlr(void *info) +{ + struct etmv4_drvdata *drvdata = info; + + /* Switch the ETM to idle state */ + writel_relaxed(HISI_HIP08_AUXCTRL_CHICKEN_BIT, drvdata->base + TRCAUXCTLR); +} + static struct etm4_arch_features etm4_features[] = { [ETM4_IMPDEF_HISI_CORE_COMMIT] = { .arch_callback = etm4_hisi_config_core_commit, }, + [ETM4_IMPDEF_HISI_SET_AUXCTRLR] = { + .arch_callback = etm4_hisi_config_auxctrlr, + }, {}, };
static void etm4_enable_arch_specific(struct etmv4_drvdata *drvdata) { struct etm4_arch_features *ftr; + bool enable = true; int bit;
for_each_set_bit(bit, drvdata->arch_features, ETM4_IMPDEF_FEATURE_MAX) { ftr = &etm4_features[bit];
- if (ftr->arch_callback) - ftr->arch_callback(true); + if (bit == ETM4_IMPDEF_HISI_CORE_COMMIT && ftr->arch_callback) + ftr->arch_callback(&enable); + + if (bit == ETM4_IMPDEF_HISI_SET_AUXCTRLR && ftr->arch_callback) + ftr->arch_callback(drvdata); } }
static void etm4_disable_arch_specific(struct etmv4_drvdata *drvdata) { struct etm4_arch_features *ftr; + bool enable = false; int bit;
for_each_set_bit(bit, drvdata->arch_features, ETM4_IMPDEF_FEATURE_MAX) { ftr = &etm4_features[bit];
- if (ftr->arch_callback) - ftr->arch_callback(false); + if (bit == ETM4_IMPDEF_HISI_CORE_COMMIT && ftr->arch_callback) + ftr->arch_callback(&enable); } }
static void etm4_check_arch_features(struct etmv4_drvdata *drvdata, unsigned int id) { - if (etm4_hisi_match_pid(id)) + if (etm4_hisi_match_pid(id)) { set_bit(ETM4_IMPDEF_HISI_CORE_COMMIT, drvdata->arch_features); + set_bit(ETM4_IMPDEF_HISI_SET_AUXCTRLR, drvdata->arch_features); + } } #else static void etm4_enable_arch_specific(struct etmv4_drvdata *drvdata) { + writel_relaxed(0x0, drvdata->base + TRCAUXCTLR); }
static void etm4_disable_arch_specific(struct etmv4_drvdata *drvdata) @@ -223,7 +245,6 @@ static int etm4_enable_hw(struct etmv4_drvdata *drvdata) writel_relaxed(config->pe_sel, drvdata->base + TRCPROCSELR); writel_relaxed(config->cfg, drvdata->base + TRCCONFIGR); /* nothing specific implemented */ - writel_relaxed(0x0, drvdata->base + TRCAUXCTLR); writel_relaxed(config->eventctrl0, drvdata->base + TRCEVENTCTL0R); writel_relaxed(config->eventctrl1, drvdata->base + TRCEVENTCTL1R); if (drvdata->stallctl) diff --git a/drivers/hwtracing/coresight/coresight-etm4x.h b/drivers/hwtracing/coresight/coresight-etm4x.h index 3dd3e0633328..ef9d7365c2da 100644 --- a/drivers/hwtracing/coresight/coresight-etm4x.h +++ b/drivers/hwtracing/coresight/coresight-etm4x.h @@ -206,6 +206,7 @@
enum etm_impdef_type { ETM4_IMPDEF_HISI_CORE_COMMIT, + ETM4_IMPDEF_HISI_SET_AUXCTRLR, ETM4_IMPDEF_FEATURE_MAX, };
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 187369, https://gitee.com/openeuler/kernel/issues/I5K66O CVE: NA
--------------------------------
Following process triggers an use-after-free problem while iterating every process's fs:
main fd = setup_iouring fork => main' // task->fs->users = 1 submit(fd, STATX, async) id->fs = current->fs req->work.identity = id io_submit_sqes ... io_grab_identity id = req->work.identity id->fs->users++ // fs->users = 2
io_wqe_worker current->fs = work->identity->fs io_req_clean_work fs = req->work.identity->fs --fs->users // fs->user = 1
main' exit exit_fs --fs->users // fs->user = 0 free_fs_struct(fs) // FREE fs
pivot_root chroot_fs_refs do_each_thread(g, p) { fs = p->fs // io_wqe_worker->fs if (fs) spin_lock(&fs->lock) // UAF! }
io_wqe_worker io_worker_exit __io_worker_unuse if (current->fs != worker->restore_fs) current->fs = worker->restore_fs
Task's fs_struct is used in do_work() and destroyed in free_work(), this problem can be fixed by switching io_wqe_worker's fs before releasing request.
Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io-wq.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/io-wq.c b/fs/io-wq.c index 3d5fc76b92d0..56d00f31003b 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -578,6 +578,8 @@ static void io_worker_handle_work(struct io_worker *worker) linked = NULL; } io_assign_current_work(worker, work); + if (current->fs != worker->restore_fs) + current->fs = worker->restore_fs; wq->free_work(old_work);
if (linked)
From: Jens Axboe axboe@kernel.dk
stable inclusion from stable-v5.10.125 commit df3f3bb5059d20ef094d6b2f0256c4bf4127a859 category: bugfix bugzilla: 187376, https://gitee.com/src-openeuler/kernel/issues/I5IM3T CVE: CVE-2022-2327
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Any read/write should grab current->nsproxy, denoted by IO_WQ_WORK_FILES as it refers to current->files as well, and connect and recv/recvmsg, send/sendmsg should grab current->fs which is denoted by IO_WQ_WORK_FS.
No upstream commit exists for this issue.
Reported-by: Bing-Jhong Billy Jheng billy@starlabs.sg Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io_uring.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index 3da9c6f457ae..f1bd00005c03 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -773,7 +773,8 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FILES, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -783,7 +784,7 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, + IO_WQ_WORK_FSIZE | IO_WQ_WORK_FILES, }, [IORING_OP_FSYNC] = { .needs_file = 1, @@ -794,7 +795,8 @@ static const struct io_op_def io_op_defs[] = { .unbound_nonreg_file = 1, .pollin = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, + .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM | + IO_WQ_WORK_FILES, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -803,7 +805,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | - IO_WQ_WORK_MM, + IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -857,7 +859,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), - .work_flags = IO_WQ_WORK_MM, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FS, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, @@ -885,7 +887,8 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .buffer_select = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FILES, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -894,7 +897,7 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .async_size = sizeof(struct io_async_rw), .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, + IO_WQ_WORK_FSIZE | IO_WQ_WORK_FILES, }, [IORING_OP_FADVISE] = { .needs_file = 1, @@ -907,14 +910,16 @@ static const struct io_op_def io_op_defs[] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | + IO_WQ_WORK_FS, }, [IORING_OP_OPENAT2] = { .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
From: Jens Axboe axboe@kernel.dk
stable inclusion from stable-v5.10.126 commit fb2fbb3c10d779c0163c9c2c7ca1aeb75ef3f7ca category: bugfix bugzilla: 187376, https://gitee.com/src-openeuler/kernel/issues/I5IM3T CVE: CVE-2022-2327
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
A previous commit ended up enabling file tracking for iopoll requests, which conflicts with both of them using the same list entry for tracking. Add a separate list entry just for iopoll requests, avoid this issue.
No upstream commit exists for this issue.
Reported-by: Greg Thelen gthelen@google.com Fixes: df3f3bb5059d ("io_uring: add missing item types for various requests") Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/io_uring.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-)
diff --git a/fs/io_uring.c b/fs/io_uring.c index f1bd00005c03..35d7dfecb104 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -696,6 +696,8 @@ struct io_kiocb { */ struct list_head inflight_entry;
+ struct list_head iopoll_entry; + struct percpu_ref *fixed_file_refs; struct callback_head task_work; /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */ @@ -2353,8 +2355,8 @@ static void io_iopoll_queue(struct list_head *again) struct io_kiocb *req;
do { - req = list_first_entry(again, struct io_kiocb, inflight_entry); - list_del(&req->inflight_entry); + req = list_first_entry(again, struct io_kiocb, iopoll_entry); + list_del(&req->iopoll_entry); __io_complete_rw(req, -EAGAIN, 0, NULL); } while (!list_empty(again)); } @@ -2376,14 +2378,14 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, while (!list_empty(done)) { int cflags = 0;
- req = list_first_entry(done, struct io_kiocb, inflight_entry); + req = list_first_entry(done, struct io_kiocb, iopoll_entry); if (READ_ONCE(req->result) == -EAGAIN) { req->result = 0; req->iopoll_completed = 0; - list_move_tail(&req->inflight_entry, &again); + list_move_tail(&req->iopoll_entry, &again); continue; } - list_del(&req->inflight_entry); + list_del(&req->iopoll_entry);
if (req->flags & REQ_F_BUFFER_SELECTED) cflags = io_put_rw_kbuf(req); @@ -2419,7 +2421,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, spin = !ctx->poll_multi_file && *nr_events < min;
ret = 0; - list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) { + list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_entry) { struct kiocb *kiocb = &req->rw.kiocb;
/* @@ -2428,7 +2430,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, * and complete those lists first, if we have entries there. */ if (READ_ONCE(req->iopoll_completed)) { - list_move_tail(&req->inflight_entry, &done); + list_move_tail(&req->iopoll_entry, &done); continue; } if (!list_empty(&done)) @@ -2440,7 +2442,7 @@ static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events,
/* iopoll may have completed current req */ if (READ_ONCE(req->iopoll_completed)) - list_move_tail(&req->inflight_entry, &done); + list_move_tail(&req->iopoll_entry, &done);
if (ret && spin) spin = false; @@ -2673,7 +2675,7 @@ static void io_iopoll_req_issued(struct io_kiocb *req) struct io_kiocb *list_req;
list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb, - inflight_entry); + iopoll_entry); if (list_req->file != req->file) ctx->poll_multi_file = true; } @@ -2683,9 +2685,9 @@ static void io_iopoll_req_issued(struct io_kiocb *req) * it to the front so we find it first. */ if (READ_ONCE(req->iopoll_completed)) - list_add(&req->inflight_entry, &ctx->iopoll_list); + list_add(&req->iopoll_entry, &ctx->iopoll_list); else - list_add_tail(&req->inflight_entry, &ctx->iopoll_list); + list_add_tail(&req->iopoll_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) && wq_has_sleeper(&ctx->sq_data->wait))
From: Florian Westphal fw@strlen.de
stable inclusion from stable-v5.10.135 commit 440dccd80f627e0e11ceb0429e4cdab61857d17e category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5J9R4?from=project-issue CVE: CVE-2022-36946
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 99a63d36cb3ed5ca3aa6fcb64cffbeaf3b0fb164 ]
Domingo Dirutigliano and Nicola Guerrera report kernel panic when sending nf_queue verdict with 1-byte nfta_payload attribute.
The IP/IPv6 stack pulls the IP(v6) header from the packet after the input hook.
If user truncates the packet below the header size, this skb_pull() will result in a malformed skb (skb->len < 0).
Fixes: 7af4cc3fa158 ("[NETFILTER]: Add "nfnetlink_queue" netfilter queue handler over nfnetlink") Reported-by: Domingo Dirutigliano pwnzer0tt1@proton.me Signed-off-by: Florian Westphal fw@strlen.de Reviewed-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Ziyang Xuan william.xuanziyang@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/nfnetlink_queue.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 1640da5c5077..72d30922ed29 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -838,11 +838,16 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) }
static int -nfqnl_mangle(void *data, int data_len, struct nf_queue_entry *e, int diff) +nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb;
if (diff < 0) { + unsigned int min_len = skb_transport_offset(e->skb); + + if (data_len < min_len) + return -EINVAL; + if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) {
From: GUO Zihua guozihua@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5K5HM CVE: NA
--------------------------------
Support SM3 algorithm for module automatic signature during "make modules_install" and "make modules_sign".
Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Chao Liu liuchao173@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- init/Kconfig | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig index 27c5ed16fef1..ea91c4864783 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2288,6 +2288,9 @@ config MODULE_SIG_SHA512 bool "Sign modules with SHA-512" select CRYPTO_SHA512
+config MODULE_SIG_SM3 + bool "Sign modules with SM3" + select CRYPTO_SM3 endchoice
config MODULE_SIG_HASH @@ -2298,6 +2301,7 @@ config MODULE_SIG_HASH default "sha256" if MODULE_SIG_SHA256 default "sha384" if MODULE_SIG_SHA384 default "sha512" if MODULE_SIG_SHA512 + default "sm3" if MODULE_SIG_SM3
config MODULE_COMPRESS bool "Compress modules on installation"
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
This reverts commit 58824c83239b31c76f855366c03833deaeb4c982.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/vmscan.c | 4 ---- 1 file changed, 4 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bfbe1fc67d0..5b5cc00b195b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2839,7 +2839,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
static bool is_memcg_kswapd_stopped(struct scan_control *sc) { -#ifdef CONFIG_MEMCG struct mem_cgroup *memcg = sc->target_mem_cgroup; bool is_stop = false; unsigned long stop_flag = 0; @@ -2855,9 +2854,6 @@ static bool is_memcg_kswapd_stopped(struct scan_control *sc) is_stop = page_counter_read(&memcg->memory) < stop_flag;
return (current_is_kswapd() && is_stop); -#else - return false; -#endif }
static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
This reverts commit 84355bcc1e8008181ed3e7275b2cd60a45d7476d.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 3 --- mm/memcontrol.c | 43 ++++++++------------------------------ mm/vmscan.c | 3 +-- 3 files changed, 10 insertions(+), 39 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3a4ea9ed39fd..6959aca37bc2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1895,9 +1895,6 @@ static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
#endif /* CONFIG_CGROUP_WRITEBACK */
-extern struct static_key_false memcg_kswapd_key; -#define mem_cgroup_kswapd_enabled static_branch_unlikely(&memcg_kswapd_key) - struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a850d1f3fc5b..0b7fc99724f2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -97,10 +97,6 @@ bool cgroup_memory_noswap __read_mostly; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif
-static bool cgroup_memory_kswapd = false; -DEFINE_STATIC_KEY_FALSE(memcg_kswapd_key); -EXPORT_SYMBOL(memcg_kswapd_key); - /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -2371,15 +2367,10 @@ static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg;
- if (mem_cgroup_kswapd_enabled) { - current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; - memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); - } else { - memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); - } + current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; + memcg = container_of(work, struct mem_cgroup, high_work); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); }
/* @@ -2549,17 +2540,11 @@ void mem_cgroup_handle_over_high(void) * memory.high is currently batched, whereas memory.max and the page * allocator run every time an allocation is made. */ - if (mem_cgroup_kswapd_enabled) { - current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; - nr_reclaimed = reclaim_high(memcg, - in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); - } else { - nr_reclaimed = reclaim_high(memcg, - in_retry ? SWAP_CLUSTER_MAX : nr_pages, - GFP_KERNEL); - } + current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; + nr_reclaimed = reclaim_high(memcg, + in_retry ? SWAP_CLUSTER_MAX : nr_pages, + GFP_KERNEL); + current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD);
/* * memory.high is breached and reclaim is unable to keep up. Throttle @@ -5647,9 +5632,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) struct mem_cgroup *memcg, *old_memcg; long error = -ENOMEM;
- if (cgroup_memory_kswapd) - static_branch_enable(&memcg_kswapd_key); - old_memcg = set_active_memcg(parent); memcg = mem_cgroup_alloc(); set_active_memcg(old_memcg); @@ -7464,13 +7446,6 @@ static int __init cgroup_memory(char *s) } __setup("cgroup.memory=", cgroup_memory);
-static int __init memcg_kswapd(char *s) -{ - cgroup_memory_kswapd = true; - return 0; -} -__setup("memcg_kswapd", memcg_kswapd); - /* * subsys_initcall() for memory controller. * diff --git a/mm/vmscan.c b/mm/vmscan.c index 5b5cc00b195b..7fec6cf7a0ae 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2843,9 +2843,8 @@ static bool is_memcg_kswapd_stopped(struct scan_control *sc) bool is_stop = false; unsigned long stop_flag = 0;
- if (!cgroup_reclaim(sc) || !mem_cgroup_kswapd_enabled) + if (!cgroup_reclaim(sc)) return false; - if (memcg->memory.max == PAGE_COUNTER_MAX) stop_flag = memcg->memory.high / 6; else
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
This reverts commit 70d020aec8e7e8b17c8db7919b6cbc99620cff3e.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mmzone.h | 8 ++++---- mm/vmscan.c | 11 ++++------- 2 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5c7753da89f4..8719d891848f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -272,10 +272,6 @@ enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI */ - LRUVEC_DIRTY, /* reclaim scanning has recently found - * many dirty file pages at the tail of - * the LRU. - */ };
struct lruvec { @@ -599,6 +595,10 @@ struct zone { } ____cacheline_internodealigned_in_smp;
enum pgdat_flags { + PGDAT_DIRTY, /* reclaim scanning has recently found + * many dirty file pages at the tail + * of the LRU. + */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 7fec6cf7a0ae..e1e44f0c486d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1289,7 +1289,6 @@ static unsigned int shrink_page_list(struct list_head *page_list, LIST_HEAD(free_pages); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; - struct lruvec *target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
memset(stat, 0, sizeof(*stat)); cond_resched(); @@ -1536,7 +1535,7 @@ static unsigned int shrink_page_list(struct list_head *page_list, */ if (page_is_file_lru(page) && (!current_is_kswapd() || !PageReclaim(page) || - !test_bit(LRUVEC_DIRTY, &target_lruvec->flags))) { + !test_bit(PGDAT_DIRTY, &pgdat->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -3069,7 +3068,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
/* Allow kswapd to start writing pages during reclaim.*/ if (sc->nr.unqueued_dirty == sc->nr.file_taken) - set_bit(LRUVEC_DIRTY, &target_lruvec->flags); + set_bit(PGDAT_DIRTY, &pgdat->flags);
/* * If kswapd scans pages marked for immediate @@ -3089,7 +3088,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) * Legacy memcg will stall in page writeback so avoid forcibly * stalling in wait_iff_congested(). */ - if (((current_is_kswapd() && !cgroup_reclaim(sc))|| + if ((current_is_kswapd() || (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && sc->nr.dirty && sc->nr.dirty == sc->nr.congested) set_bit(LRUVEC_CONGESTED, &target_lruvec->flags); @@ -3323,8 +3322,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, zone->zone_pgdat); clear_bit(LRUVEC_CONGESTED, &lruvec->flags); - if (current_is_kswapd()) - clear_bit(LRUVEC_DIRTY, &lruvec->flags); } }
@@ -3715,7 +3712,7 @@ static void clear_pgdat_congested(pg_data_t *pgdat) struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
clear_bit(LRUVEC_CONGESTED, &lruvec->flags); - clear_bit(LRUVEC_DIRTY, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); clear_bit(PGDAT_WRITEBACK, &pgdat->flags); }
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
This reverts commit 1496d67c65b1679341a4da6d3c9709c937be3d59.
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memcontrol.c | 4 ---- mm/vmscan.c | 40 ++-------------------------------------- 2 files changed, 2 insertions(+), 42 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0b7fc99724f2..16decaf4844d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2367,10 +2367,8 @@ static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg;
- current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; memcg = container_of(work, struct mem_cgroup, high_work); reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD); }
/* @@ -2540,11 +2538,9 @@ void mem_cgroup_handle_over_high(void) * memory.high is currently batched, whereas memory.max and the page * allocator run every time an allocation is made. */ - current->flags |= PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD; nr_reclaimed = reclaim_high(memcg, in_retry ? SWAP_CLUSTER_MAX : nr_pages, GFP_KERNEL); - current->flags &= ~(PF_SWAPWRITE | PF_MEMALLOC | PF_KSWAPD);
/* * memory.high is breached and reclaim is unable to keep up. Throttle diff --git a/mm/vmscan.c b/mm/vmscan.c index e1e44f0c486d..c851e5f91842 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -61,8 +61,6 @@
#include "internal.h"
-#define MEMCG_KSWAPD_SCATOR 10 - #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h>
@@ -2836,24 +2834,6 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; }
-static bool is_memcg_kswapd_stopped(struct scan_control *sc) -{ - struct mem_cgroup *memcg = sc->target_mem_cgroup; - bool is_stop = false; - unsigned long stop_flag = 0; - - if (!cgroup_reclaim(sc)) - return false; - if (memcg->memory.max == PAGE_COUNTER_MAX) - stop_flag = memcg->memory.high / 6; - else - stop_flag = memcg->memory.high - memcg->memory.max * - MEMCG_KSWAPD_SCATOR / 1000; - is_stop = page_counter_read(&memcg->memory) < stop_flag; - - return (current_is_kswapd() && is_stop); -} - static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -2909,14 +2889,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) sc->nr_scanned - scanned, sc->nr_reclaimed - reclaimed);
- /* - * Memcg background reclaim would break iter once memcg kswapd - * flag is satisfied. - */ - if (is_memcg_kswapd_stopped(sc)) { - mem_cgroup_iter_break(target_memcg, memcg); - break; - } } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL))); }
@@ -3285,9 +3257,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
do { - if (is_memcg_kswapd_stopped(sc)) - break; - vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; @@ -3350,13 +3319,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, goto retry; }
- /* - * Untapped cgroup reserves? Don't OOM, retry. - * memcg usage is lower than memory.high / 2, memcg kswapd will lead to - * stop memcg reclaim, but should not break low protection. - */ - if (sc->memcg_low_skipped && - !(current_is_kswapd() && cgroup_reclaim(sc))) { + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (sc->memcg_low_skipped) { sc->priority = initial_priority; sc->force_deactivate = 0; sc->memcg_low_reclaim = 1;
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
Enable memcg async reclaim when memcg usage is larger than memory_high * memcg->high_async_ratio / 10; if memcg usage is larger than memory_high * (memcg->high_async_ratio - 1)/ 10, the reclaim pages is the diff of memcg usage and memory_high * (memcg->high_async_ratio - 1)/ 10; else reclaim pages is MEMCG_CHARGE_BATCH; The default memcg->high_async_ratio is 0; when memcg->high_async_ratio is 0, memcg async reclaim is disabled;
The situation when enable memcg async reclaim is 1) try_charge; 2) reset memory_high
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/memcontrol.h | 5 +++ mm/memcontrol.c | 72 +++++++++++++++++++++++++++++++++++--- 2 files changed, 73 insertions(+), 4 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6959aca37bc2..631a3bfeb066 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -375,7 +375,12 @@ struct mem_cgroup { #ifdef CONFIG_DYNAMIC_HUGETLB struct dhugetlb_pool *hpool; #endif +#ifndef __GENKSYMS__ + int high_async_ratio; + bool high_async_reclaim; +#else KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 16decaf4844d..a6ee6091afb2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -106,6 +106,22 @@ static bool do_memsw_account(void) #define THRESHOLDS_EVENTS_TARGET 128 #define SOFTLIMIT_EVENTS_TARGET 1024
+/* + * when memcg->high_async_ratio is HIGH_ASYNC_RATIO_DEFAULT, memcg async + * relcaim is disabled; + * when mem_usage is larger than memory.high * memcg->high_async_ratio/ + * HIGH_ASYNC_RATIO_BASE, start async reclaim; + * if mem_usage is larger than memory.high * (memcg->high_async_ratio - + * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE, the aim reclaim page is + * the diff of mem_usage and memory.high * (memcg->high_async_ratio - + * HIGH_ASYNC_RATIO_GAP) / HIGH_ASYNC_RATIO_BASE else the aim reclaim + * page is MEMCG_CHARGE_BATCH; + */ + +#define HIGH_ASYNC_RATIO_DEFAULT 0 +#define HIGH_ASYNC_RATIO_BASE 10 +#define HIGH_ASYNC_RATIO_GAP 1 + /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -2338,18 +2354,41 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu) return 0; }
+static bool is_high_async_reclaim(struct mem_cgroup *memcg) +{ + int ratio = READ_ONCE(memcg->high_async_ratio); + + if (ratio == HIGH_ASYNC_RATIO_DEFAULT) + return false; + + if (READ_ONCE(memcg->memory.high) == PAGE_COUNTER_MAX) + return false; + + return page_counter_read(&memcg->memory) > + (READ_ONCE(memcg->memory.high) * ratio / HIGH_ASYNC_RATIO_BASE); +} + static unsigned long reclaim_high(struct mem_cgroup *memcg, unsigned int nr_pages, gfp_t gfp_mask) { unsigned long nr_reclaimed = 0; + bool high_async_reclaim = READ_ONCE(memcg->high_async_reclaim); + + if (high_async_reclaim) + WRITE_ONCE(memcg->high_async_reclaim, false);
do { unsigned long pflags;
- if (page_counter_read(&memcg->memory) <= - READ_ONCE(memcg->memory.high)) - continue; + if (high_async_reclaim) { + if (!is_high_async_reclaim(memcg)) + continue; + } else { + if (page_counter_read(&memcg->memory) <= + READ_ONCE(memcg->memory.high)) + continue; + }
memcg_memory_event(memcg, MEMCG_HIGH);
@@ -2363,12 +2402,26 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, return nr_reclaimed; }
+static unsigned long get_reclaim_pages(struct mem_cgroup *memcg) +{ + unsigned long nr_pages = page_counter_read(&memcg->memory); + int ratio = READ_ONCE(memcg->high_async_ratio) - HIGH_ASYNC_RATIO_GAP; + unsigned long safe_pages = READ_ONCE(memcg->memory.high) * ratio / + HIGH_ASYNC_RATIO_BASE; + + return (nr_pages > safe_pages) ? (nr_pages - safe_pages) : + MEMCG_CHARGE_BATCH; +} + static void high_work_func(struct work_struct *work) { struct mem_cgroup *memcg;
memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); + if (memcg->high_async_reclaim) + reclaim_high(memcg, get_reclaim_pages(memcg), GFP_KERNEL); + else + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); }
/* @@ -2755,6 +2808,11 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, continue; }
+ if (is_high_async_reclaim(memcg)) { + WRITE_ONCE(memcg->high_async_reclaim, true); + schedule_work(&memcg->high_work); + } + if (mem_high || swap_high) { /* * The allocating tasks in this cgroup will need to do @@ -5128,6 +5186,11 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
page_counter_set_high(&memcg->memory, high);
+ if (is_high_async_reclaim(memcg)) { + WRITE_ONCE(memcg->high_async_reclaim, true); + schedule_work(&memcg->high_work); + } + for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); unsigned long reclaimed; @@ -5636,6 +5699,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + memcg->high_async_ratio = HIGH_ASYNC_RATIO_DEFAULT; page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent);
From: Lu Jialin lujialin4@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4IMAK CVE: NA
-------------------------------
User can set high_async_ratio from 0 to 9; start memcg high async when memcg_usage is larger than memory.high * high_async_ratio / 10;
Signed-off-by: Lu Jialin lujialin4@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/memcontrol.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a6ee6091afb2..e15389aeb641 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5286,6 +5286,38 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, return nbytes; }
+static int memcg_high_async_ratio_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->high_async_ratio)); +} + +static ssize_t memcg_high_async_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, high_async_ratio; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &high_async_ratio); + if (ret) + return ret; + + if (high_async_ratio >= HIGH_ASYNC_RATIO_BASE || + high_async_ratio < HIGH_ASYNC_RATIO_DEFAULT) + return -EINVAL; + + WRITE_ONCE(memcg->high_async_ratio, high_async_ratio); + + if (is_high_async_reclaim(memcg)) + schedule_work(&memcg->high_work); + + return nbytes; +} + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -5487,6 +5519,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .name = "reclaim", .write = memory_reclaim, }, + { + .name = "high_async_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memcg_high_async_ratio_show, + .write = memcg_high_async_ratio_write, + }, { }, /* terminate */ };