From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit 8404b0fbc7fb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported to sample bandwidth, latency, buffer occupation etc.
Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is registered as a PMU in /sys/bus/event_source/devices, so users can select target PMU, and use filter to do further sets.
Filtering options contains: event - select the event. port - select target Root Ports. Information of Root Ports are shown under sysfs. bdf - select requester_id of target EP device. trig_len - set trigger condition for starting event statistics. trig_mode - set trigger mode. 0 means starting to statistic when bigger than trigger condition, and 1 means smaller. thr_len - set threshold for statistics. thr_mode - set threshold mode. 0 means count when bigger than threshold, and 1 means smaller.
Acked-by: Krzysztof Wilczyński kw@linux.com Reviewed-by: John Garry john.garry@huawei.com Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: Shaokun Zhang zhangshaokun@hisilicon.com Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- MAINTAINERS | 2 + drivers/perf/hisilicon/Kconfig | 9 + drivers/perf/hisilicon/Makefile | 2 + drivers/perf/hisilicon/hisi_pcie_pmu.c | 951 +++++++++++++++++++++++++ include/linux/cpuhotplug.h | 3 + 5 files changed, 967 insertions(+) create mode 100644 drivers/perf/hisilicon/hisi_pcie_pmu.c
diff --git a/MAINTAINERS b/MAINTAINERS index 23a23bd94c00..d3d6cad63b94 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7976,8 +7976,10 @@ F: Documentation/devicetree/bindings/misc/hisilicon-hikey-usb.yaml
HISILICON PMU DRIVER M: Shaokun Zhang zhangshaokun@hisilicon.com +M: Qi Liu liuqi115@huawei.com S: Supported W: http://www.hisilicon.com +F: Documentation/admin-guide/perf/hisi-pcie-pmu.rst F: Documentation/admin-guide/perf/hisi-pmu.rst F: drivers/perf/hisilicon
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig index c5d1b7019fff..5546218b5598 100644 --- a/drivers/perf/hisilicon/Kconfig +++ b/drivers/perf/hisilicon/Kconfig @@ -5,3 +5,12 @@ config HISI_PMU help Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home Agent performance monitor and DDR Controller performance monitor. + +config HISI_PCIE_PMU + tristate "HiSilicon PCIE PERF PMU" + depends on PCI && ARM64 + help + Provide support for HiSilicon PCIe performance monitoring unit (PMU) + RCiEP devices. + Adds the PCIe PMU into perf events system for monitoring latency, + bandwidth etc. diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 7643c9f93e36..506ed39e3266 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ hisi_uncore_pa_pmu.o + +obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c new file mode 100644 index 000000000000..2f18838754ec --- /dev/null +++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c @@ -0,0 +1,951 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * This driver adds support for PCIe PMU RCiEP device. Related + * perf events are bandwidth, latency etc. + * + * Copyright (C) 2021 HiSilicon Limited + * Author: Qi Liu liuqi115@huawei.com + */ +#include <linux/bitfield.h> +#include <linux/bitmap.h> +#include <linux/bug.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/pci.h> +#include <linux/perf_event.h> + +#define DRV_NAME "hisi_pcie_pmu" +/* Define registers */ +#define HISI_PCIE_GLOBAL_CTRL 0x00 +#define HISI_PCIE_EVENT_CTRL 0x010 +#define HISI_PCIE_CNT 0x090 +#define HISI_PCIE_EXT_CNT 0x110 +#define HISI_PCIE_INT_STAT 0x150 +#define HISI_PCIE_INT_MASK 0x154 +#define HISI_PCIE_REG_BDF 0xfe0 +#define HISI_PCIE_REG_VERSION 0xfe4 +#define HISI_PCIE_REG_INFO 0xfe8 + +/* Define command in HISI_PCIE_GLOBAL_CTRL */ +#define HISI_PCIE_GLOBAL_EN 0x01 +#define HISI_PCIE_GLOBAL_NONE 0 + +/* Define command in HISI_PCIE_EVENT_CTRL */ +#define HISI_PCIE_EVENT_EN BIT_ULL(20) +#define HISI_PCIE_RESET_CNT BIT_ULL(22) +#define HISI_PCIE_INIT_SET BIT_ULL(34) +#define HISI_PCIE_THR_EN BIT_ULL(26) +#define HISI_PCIE_TARGET_EN BIT_ULL(32) +#define HISI_PCIE_TRIG_EN BIT_ULL(52) + +/* Define offsets in HISI_PCIE_EVENT_CTRL */ +#define HISI_PCIE_EVENT_M GENMASK_ULL(15, 0) +#define HISI_PCIE_THR_MODE_M GENMASK_ULL(27, 27) +#define HISI_PCIE_THR_M GENMASK_ULL(31, 28) +#define HISI_PCIE_TARGET_M GENMASK_ULL(52, 36) +#define HISI_PCIE_TRIG_MODE_M GENMASK_ULL(53, 53) +#define HISI_PCIE_TRIG_M GENMASK_ULL(59, 56) + +#define HISI_PCIE_MAX_COUNTERS 8 +#define HISI_PCIE_REG_STEP 8 +#define HISI_PCIE_THR_MAX_VAL 10 +#define HISI_PCIE_TRIG_MAX_VAL 10 +#define HISI_PCIE_MAX_PERIOD (GENMASK_ULL(63, 0)) +#define HISI_PCIE_INIT_VAL BIT_ULL(63) + +struct hisi_pcie_pmu { + struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS]; + struct hlist_node node; + struct pci_dev *pdev; + struct pmu pmu; + void __iomem *base; + int irq; + u32 identifier; + /* Minimum and maximum BDF of root ports monitored by PMU */ + u16 bdf_min; + u16 bdf_max; + int on_cpu; +}; + +struct hisi_pcie_reg_pair { + u16 lo; + u16 hi; +}; + +#define to_pcie_pmu(p) (container_of((p), struct hisi_pcie_pmu, pmu)) +#define GET_PCI_DEVFN(bdf) ((bdf) & 0xff) + +#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo) \ + static u64 hisi_pcie_get_##_name(struct perf_event *event) \ + { \ + return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \ + } \ + +HISI_PCIE_PMU_FILTER_ATTR(event, config, 16, 0); +HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0); +HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4); +HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5); +HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9); +HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0); +HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16); + +static ssize_t hisi_pcie_format_sysfs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct dev_ext_attribute *eattr; + + eattr = container_of(attr, struct dev_ext_attribute, attr); + + return sysfs_emit(buf, "%s\n", (char *)eattr->var); +} + +static ssize_t hisi_pcie_event_sysfs_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct perf_pmu_events_attr *pmu_attr = + container_of(attr, struct perf_pmu_events_attr, attr); + + return sysfs_emit(buf, "config=0x%llx\n", pmu_attr->id); +} + +#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format) \ + (&((struct dev_ext_attribute[]){ \ + { .attr = __ATTR(_name, 0444, hisi_pcie_format_sysfs_show, \ + NULL), \ + .var = (void *)_format } \ + })[0].attr.attr) + +#define HISI_PCIE_PMU_EVENT_ATTR(_name, _id) \ + (&((struct perf_pmu_events_attr[]) { \ + { .attr = __ATTR(_name, 0444, hisi_pcie_event_sysfs_show, NULL), \ + .id = _id, } \ + })[0].attr.attr) + +static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu)); +} +static DEVICE_ATTR_RO(cpumask); + +static ssize_t identifier_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#x\n", pcie_pmu->identifier); +} +static DEVICE_ATTR_RO(identifier); + +static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%#04x\n", PCI_BUS_NUM(pcie_pmu->bdf_min)); +} +static DEVICE_ATTR_RO(bus); + +static struct hisi_pcie_reg_pair +hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off) +{ + u32 val = readl_relaxed(pcie_pmu->base + reg_off); + struct hisi_pcie_reg_pair regs = { + .lo = val, + .hi = val >> 16, + }; + + return regs; +} + +/* + * Hardware counter and ext_counter work together for bandwidth, latency, bus + * utilization and buffer occupancy events. For example, RX memory write latency + * events(index = 0x0010), counter counts total delay cycles and ext_counter + * counts RX memory write PCIe packets number. + * + * As we don't want PMU driver to process these two data, "delay cycles" can + * be treated as an independent event(index = 0x0010), "RX memory write packets + * number" as another(index = 0x10010). BIT 16 is used to distinguish and 0-15 + * bits are "real" event index, which can be used to set HISI_PCIE_EVENT_CTRL. + */ +#define EXT_COUNTER_IS_USED(idx) ((idx) & BIT(16)) + +static u32 hisi_pcie_get_real_event(struct perf_event *event) +{ + return hisi_pcie_get_event(event) & GENMASK(15, 0); +} + +static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx) +{ + return offset + HISI_PCIE_REG_STEP * idx; +} + +static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, + u32 idx) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + return readl_relaxed(pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u32 val) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + writel_relaxed(val, pcie_pmu->base + offset); +} + +static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + return readq_relaxed(pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u64 val) +{ + u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx); + + writeq_relaxed(val, pcie_pmu->base + offset); +} + +static void hisi_pcie_pmu_config_filter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 reg = HISI_PCIE_INIT_SET; + u64 port, trig_len, thr_len; + + /* Config HISI_PCIE_EVENT_CTRL according to event. */ + reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event)); + + /* Config HISI_PCIE_EVENT_CTRL according to root port or EP device. */ + port = hisi_pcie_get_port(event); + if (port) + reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port); + else + reg |= HISI_PCIE_TARGET_EN | + FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event)); + + /* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */ + trig_len = hisi_pcie_get_trig_len(event); + if (trig_len) { + reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len); + reg |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M, hisi_pcie_get_trig_mode(event)); + reg |= HISI_PCIE_TRIG_EN; + } + + /* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */ + thr_len = hisi_pcie_get_thr_len(event); + if (thr_len) { + reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len); + reg |= FIELD_PREP(HISI_PCIE_THR_MODE_M, hisi_pcie_get_thr_mode(event)); + reg |= HISI_PCIE_THR_EN; + } + + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg); +} + +static void hisi_pcie_pmu_clear_filter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, HISI_PCIE_INIT_SET); +} + +static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu, u32 bdf) +{ + struct pci_dev *root_port, *pdev; + u16 rp_bdf; + + pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus), PCI_BUS_NUM(bdf), + GET_PCI_DEVFN(bdf)); + if (!pdev) + return false; + + root_port = pcie_find_root_port(pdev); + if (!root_port) { + pci_dev_put(pdev); + return false; + } + + pci_dev_put(pdev); + rp_bdf = pci_dev_id(root_port); + return rp_bdf >= pcie_pmu->bdf_min && rp_bdf <= pcie_pmu->bdf_max; +} + +static bool hisi_pcie_pmu_valid_filter(struct perf_event *event, + struct hisi_pcie_pmu *pcie_pmu) +{ + u32 requester_id = hisi_pcie_get_bdf(event); + + if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL) + return false; + + if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL) + return false; + + if (requester_id) { + if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id)) + return false; + } + + return true; +} + +static bool hisi_pcie_pmu_cmp_event(struct perf_event *target, + struct perf_event *event) +{ + return hisi_pcie_get_real_event(target) == hisi_pcie_get_real_event(event); +} + +static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event) +{ + struct perf_event *sibling, *leader = event->group_leader; + struct perf_event *event_group[HISI_PCIE_MAX_COUNTERS]; + int counters = 1; + int num; + + event_group[0] = leader; + if (!is_software_event(leader)) { + if (leader->pmu != event->pmu) + return false; + + if (leader != event && !hisi_pcie_pmu_cmp_event(leader, event)) + event_group[counters++] = event; + } + + for_each_sibling_event(sibling, event->group_leader) { + if (is_software_event(sibling)) + continue; + + if (sibling->pmu != event->pmu) + return false; + + for (num = 0; num < counters; num++) { + if (hisi_pcie_pmu_cmp_event(event_group[num], sibling)) + break; + } + + if (num == counters) + event_group[counters++] = sibling; + } + + return counters <= HISI_PCIE_MAX_COUNTERS; +} + +static int hisi_pcie_pmu_event_init(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + event->cpu = pcie_pmu->on_cpu; + + if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event))) + hwc->event_base = HISI_PCIE_EXT_CNT; + else + hwc->event_base = HISI_PCIE_CNT; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling is not supported. */ + if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK) + return -EOPNOTSUPP; + + if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu)) + return -EINVAL; + + if (!hisi_pcie_pmu_validate_event_group(event)) + return -EINVAL; + + return 0; +} + +static u64 hisi_pcie_pmu_read_counter(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + u32 idx = event->hw.idx; + + return hisi_pcie_pmu_readq(pcie_pmu, event->hw.event_base, idx); +} + +static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu, + struct perf_event *event) +{ + struct perf_event *sibling; + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + sibling = pcie_pmu->hw_events[idx]; + if (!sibling) + continue; + + if (!hisi_pcie_pmu_cmp_event(sibling, event)) + continue; + + /* Related events must be used in group */ + if (sibling->group_leader == event->group_leader) + return idx; + else + return -EINVAL; + } + + return idx; +} + +static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu) +{ + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + if (!pcie_pmu->hw_events[idx]) + return idx; + } + + return -EINVAL; +} + +static void hisi_pcie_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 new_cnt, prev_cnt, delta; + + do { + prev_cnt = local64_read(&hwc->prev_count); + new_cnt = hisi_pcie_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, prev_cnt, + new_cnt) != prev_cnt); + + delta = (new_cnt - prev_cnt) & HISI_PCIE_MAX_PERIOD; + local64_add(delta, &event->count); +} + +static void hisi_pcie_pmu_read(struct perf_event *event) +{ + hisi_pcie_pmu_event_update(event); +} + +static void hisi_pcie_pmu_set_period(struct perf_event *event) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL); +} + +static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + u64 val; + + val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx); + val |= HISI_PCIE_EVENT_EN; + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val); +} + +static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + u64 val; + + val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx); + val &= ~HISI_PCIE_EVENT_EN; + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val); +} + +static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 0); +} + +static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc) +{ + u32 idx = hwc->idx; + + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 1); +} + +static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu, int idx) +{ + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_RESET_CNT); + hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_INIT_SET); +} + +static void hisi_pcie_pmu_start(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + u64 prev_cnt; + + if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED))) + return; + + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + hwc->state = 0; + + hisi_pcie_pmu_config_filter(event); + hisi_pcie_pmu_enable_counter(pcie_pmu, hwc); + hisi_pcie_pmu_enable_int(pcie_pmu, hwc); + hisi_pcie_pmu_set_period(event); + + if (flags & PERF_EF_RELOAD) { + prev_cnt = local64_read(&hwc->prev_count); + hisi_pcie_pmu_writeq(pcie_pmu, hwc->event_base, idx, prev_cnt); + } + + perf_event_update_userpage(event); +} + +static void hisi_pcie_pmu_stop(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_event_update(event); + hisi_pcie_pmu_disable_int(pcie_pmu, hwc); + hisi_pcie_pmu_disable_counter(pcie_pmu, hwc); + hisi_pcie_pmu_clear_filter(event); + WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); + hwc->state |= PERF_HES_STOPPED; + + if (hwc->state & PERF_HES_UPTODATE) + return; + + hwc->state |= PERF_HES_UPTODATE; +} + +static int hisi_pcie_pmu_add(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx; + + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + /* Check all working events to find a related event. */ + idx = hisi_pcie_pmu_find_related_event(pcie_pmu, event); + if (idx < 0) + return idx; + + /* Current event shares an enabled counter with the related event */ + if (idx < HISI_PCIE_MAX_COUNTERS) { + hwc->idx = idx; + goto start_count; + } + + idx = hisi_pcie_pmu_get_event_idx(pcie_pmu); + if (idx < 0) + return idx; + + hwc->idx = idx; + pcie_pmu->hw_events[idx] = event; + /* Reset Counter to avoid previous statistic interference. */ + hisi_pcie_pmu_reset_counter(pcie_pmu, idx); + +start_count: + if (flags & PERF_EF_START) + hisi_pcie_pmu_start(event, PERF_EF_RELOAD); + + return 0; +} + +static void hisi_pcie_pmu_del(struct perf_event *event, int flags) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + + hisi_pcie_pmu_stop(event, PERF_EF_UPDATE); + pcie_pmu->hw_events[hwc->idx] = NULL; + perf_event_update_userpage(event); +} + +static void hisi_pcie_pmu_enable(struct pmu *pmu) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu); + int num; + + for (num = 0; num < HISI_PCIE_MAX_COUNTERS; num++) { + if (pcie_pmu->hw_events[num]) + break; + } + + if (num == HISI_PCIE_MAX_COUNTERS) + return; + + writel(HISI_PCIE_GLOBAL_EN, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL); +} + +static void hisi_pcie_pmu_disable(struct pmu *pmu) +{ + struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu); + + writel(HISI_PCIE_GLOBAL_NONE, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL); +} + +static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data) +{ + struct hisi_pcie_pmu *pcie_pmu = data; + irqreturn_t ret = IRQ_NONE; + struct perf_event *event; + u32 overflown; + int idx; + + for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) { + overflown = hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, idx); + if (!overflown) + continue; + + /* Clear status of interrupt. */ + hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, idx, 1); + event = pcie_pmu->hw_events[idx]; + if (!event) + continue; + + hisi_pcie_pmu_event_update(event); + hisi_pcie_pmu_set_period(event); + ret = IRQ_HANDLED; + } + + return ret; +} + +static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + int irq, ret; + + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); + if (ret < 0) { + pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret); + return ret; + } + + irq = pci_irq_vector(pdev, 0); + ret = request_irq(irq, hisi_pcie_pmu_irq, IRQF_NOBALANCING | IRQF_NO_THREAD, DRV_NAME, + pcie_pmu); + if (ret) { + pci_err(pdev, "Failed to register IRQ: %d\n", ret); + pci_free_irq_vectors(pdev); + return ret; + } + + pcie_pmu->irq = irq; + + return 0; +} + +static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + free_irq(pcie_pmu->irq, pcie_pmu); + pci_free_irq_vectors(pdev); +} + +static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node); + + if (pcie_pmu->on_cpu == -1) { + pcie_pmu->on_cpu = cpu; + WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu))); + } + + return 0; +} + +static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node) +{ + struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node); + unsigned int target; + + /* Nothing to do if this CPU doesn't own the PMU */ + if (pcie_pmu->on_cpu != cpu) + return 0; + + pcie_pmu->on_cpu = -1; + /* Choose a new CPU from all online cpus. */ + target = cpumask_first(cpu_online_mask); + if (target >= nr_cpu_ids) { + pci_err(pcie_pmu->pdev, "There is no CPU to set\n"); + return 0; + } + + perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target); + /* Use this CPU for event counting */ + pcie_pmu->on_cpu = target; + WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target))); + + return 0; +} + +static struct attribute *hisi_pcie_pmu_events_attr[] = { + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_latency, 0x0010), + HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_cnt, 0x10010), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_latency, 0x0210), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x1005), + HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x11005), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x2004), + HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x12004), + NULL +}; + +static struct attribute_group hisi_pcie_pmu_events_group = { + .name = "events", + .attrs = hisi_pcie_pmu_events_attr, +}; + +static struct attribute *hisi_pcie_pmu_format_attr[] = { + HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-16"), + HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"), + HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"), + HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"), + HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"), + HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"), + HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"), + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_format_group = { + .name = "format", + .attrs = hisi_pcie_pmu_format_attr, +}; + +static struct attribute *hisi_pcie_pmu_bus_attrs[] = { + &dev_attr_bus.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_bus_attr_group = { + .attrs = hisi_pcie_pmu_bus_attrs, +}; + +static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_cpumask_attr_group = { + .attrs = hisi_pcie_pmu_cpumask_attrs, +}; + +static struct attribute *hisi_pcie_pmu_identifier_attrs[] = { + &dev_attr_identifier.attr, + NULL +}; + +static const struct attribute_group hisi_pcie_pmu_identifier_attr_group = { + .attrs = hisi_pcie_pmu_identifier_attrs, +}; + +static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = { + &hisi_pcie_pmu_events_group, + &hisi_pcie_pmu_format_group, + &hisi_pcie_pmu_bus_attr_group, + &hisi_pcie_pmu_cpumask_attr_group, + &hisi_pcie_pmu_identifier_attr_group, + NULL +}; + +static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + struct hisi_pcie_reg_pair regs; + u16 sicl_id, core_id; + char *name; + + regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF); + pcie_pmu->bdf_min = regs.lo; + pcie_pmu->bdf_max = regs.hi; + + regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO); + sicl_id = regs.hi; + core_id = regs.lo; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_core%u", sicl_id, core_id); + if (!name) + return -ENOMEM; + + pcie_pmu->pdev = pdev; + pcie_pmu->on_cpu = -1; + pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION); + pcie_pmu->pmu = (struct pmu) { + .name = name, + .module = THIS_MODULE, + .event_init = hisi_pcie_pmu_event_init, + .pmu_enable = hisi_pcie_pmu_enable, + .pmu_disable = hisi_pcie_pmu_disable, + .add = hisi_pcie_pmu_add, + .del = hisi_pcie_pmu_del, + .start = hisi_pcie_pmu_start, + .stop = hisi_pcie_pmu_stop, + .read = hisi_pcie_pmu_read, + .task_ctx_nr = perf_invalid_context, + .attr_groups = hisi_pcie_pmu_attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }; + + return 0; +} + +static int hisi_pcie_init_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu) +{ + int ret; + + pcie_pmu->base = pci_ioremap_bar(pdev, 2); + if (!pcie_pmu->base) { + pci_err(pdev, "Ioremap failed for pcie_pmu resource\n"); + return -ENOMEM; + } + + ret = hisi_pcie_alloc_pmu(pdev, pcie_pmu); + if (ret) + goto err_iounmap; + + ret = hisi_pcie_pmu_irq_register(pdev, pcie_pmu); + if (ret) + goto err_iounmap; + + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + if (ret) { + pci_err(pdev, "Failed to register hotplug: %d\n", ret); + goto err_irq_unregister; + } + + ret = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1); + if (ret) { + pci_err(pdev, "Failed to register PCIe PMU: %d\n", ret); + goto err_hotplug_unregister; + } + + return ret; + +err_hotplug_unregister: + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + +err_irq_unregister: + hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu); + +err_iounmap: + iounmap(pcie_pmu->base); + + return ret; +} + +static void hisi_pcie_uninit_pmu(struct pci_dev *pdev) +{ + struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev); + + perf_pmu_unregister(&pcie_pmu->pmu); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node); + hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu); + iounmap(pcie_pmu->base); +} + +static int hisi_pcie_init_dev(struct pci_dev *pdev) +{ + int ret; + + ret = pcim_enable_device(pdev); + if (ret) { + pci_err(pdev, "Failed to enable PCI device: %d\n", ret); + return ret; + } + + ret = pcim_iomap_regions(pdev, BIT(2), DRV_NAME); + if (ret < 0) { + pci_err(pdev, "Failed to request PCI mem regions: %d\n", ret); + return ret; + } + + pci_set_master(pdev); + + return 0; +} + +static int hisi_pcie_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + struct hisi_pcie_pmu *pcie_pmu; + int ret; + + pcie_pmu = devm_kzalloc(&pdev->dev, sizeof(*pcie_pmu), GFP_KERNEL); + if (!pcie_pmu) + return -ENOMEM; + + ret = hisi_pcie_init_dev(pdev); + if (ret) + return ret; + + ret = hisi_pcie_init_pmu(pdev, pcie_pmu); + if (ret) + return ret; + + pci_set_drvdata(pdev, pcie_pmu); + + return ret; +} + +static void hisi_pcie_pmu_remove(struct pci_dev *pdev) +{ + hisi_pcie_uninit_pmu(pdev); + pci_set_drvdata(pdev, NULL); +} + +static const struct pci_device_id hisi_pcie_pmu_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) }, + { 0, } +}; +MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids); + +static struct pci_driver hisi_pcie_pmu_driver = { + .name = DRV_NAME, + .id_table = hisi_pcie_pmu_ids, + .probe = hisi_pcie_pmu_probe, + .remove = hisi_pcie_pmu_remove, +}; + +static int __init hisi_pcie_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, + "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE", + hisi_pcie_pmu_online_cpu, + hisi_pcie_pmu_offline_cpu); + if (ret) { + pr_err("Failed to setup PCIe PMU hotplug: %d\n", ret); + return ret; + } + + ret = pci_register_driver(&hisi_pcie_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE); + + return ret; +} +module_init(hisi_pcie_module_init); + +static void __exit hisi_pcie_module_exit(void) +{ + pci_unregister_driver(&hisi_pcie_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE); +} +module_exit(hisi_pcie_module_exit); + +MODULE_DESCRIPTION("HiSilicon PCIe PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Qi Liu liuqi115@huawei.com"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b98b9eb7d5f8..24e7be132046 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -178,6 +178,9 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_HISI_PA_ONLINE, CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE, + #ifndef __GENKSYMS__ + CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, + #endif CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.19-rc1 commit 807907dae970 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
If a PMU is in a SICL (Super IO cluster), it is not appropriate to associate this PMU with a CPU die. So we associate it with all CPUs online, rather than CPUs in the nearest SCCL.
As the firmware of Hip09 platform hasn't been published yet, change of PMU driver will not influence backwards compatibility between driver and firmware.
Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/20220415102352.6665-2-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/perf/hisilicon/hisi_uncore_pa_pmu.c | 18 +++++++----------- drivers/perf/hisilicon/hisi_uncore_pmu.c | 4 ++++ drivers/perf/hisilicon/hisi_uncore_pmu.h | 1 + 3 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c index 390e59f4ef60..f1e6b5cee075 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pa_pmu.c @@ -258,13 +258,12 @@ static int hisi_pa_pmu_init_data(struct platform_device *pdev, struct hisi_pmu *pa_pmu) { /* - * Use the SCCL_ID and the index ID to identify the PA PMU, - * while SCCL_ID is the nearst SCCL_ID from this SICL and - * CPU core is chosen from this SCCL to manage this PMU. + * As PA PMU is in a SICL, use the SICL_ID and the index ID + * to identify the PA PMU. */ if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", - &pa_pmu->sccl_id)) { - dev_err(&pdev->dev, "Cannot read sccl-id!\n"); + &pa_pmu->sicl_id)) { + dev_err(&pdev->dev, "Cannot read sicl-id!\n"); return -EINVAL; }
@@ -275,6 +274,7 @@ static int hisi_pa_pmu_init_data(struct platform_device *pdev, }
pa_pmu->ccl_id = -1; + pa_pmu->sccl_id = -1;
pa_pmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(pa_pmu->base)) { @@ -399,13 +399,9 @@ static int hisi_pa_pmu_probe(struct platform_device *pdev) ret = hisi_pa_pmu_dev_probe(pdev, pa_pmu); if (ret) return ret; - /* - * PA is attached in SICL and the CPU core is chosen to manage this - * PMU which is the nearest SCCL, while its SCCL_ID is greater than - * one with the SICL_ID. - */ + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sicl%u_pa%u", - pa_pmu->sccl_id - 1, pa_pmu->index_id); + pa_pmu->sicl_id, pa_pmu->index_id); if (!name) return -ENOMEM;
diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 07f0c7015181..a87568af27a0 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -458,6 +458,10 @@ static bool hisi_pmu_cpu_is_associated_pmu(struct hisi_pmu *hisi_pmu) { int sccl_id, ccl_id;
+ /* If SCCL_ID is -1, the PMU is in a SICL and has no CPU affinity */ + if (hisi_pmu->sccl_id == -1) + return true; + if (hisi_pmu->ccl_id == -1) { /* If CCL_ID is -1, the PMU only shares the same SCCL */ hisi_read_sccl_and_ccl_id(&sccl_id, NULL); diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index ea9d89bbc1ea..0a90c2881854 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -81,6 +81,7 @@ struct hisi_pmu { struct device *dev; struct hlist_node node; int sccl_id; + int sicl_id; int ccl_id; void __iomem *base; /* the ID of the PMU modules */
From: Qi Liu liuqi115@huawei.com
mainline inclusion from mainline-v5.19-rc1 commit 6b79738b6ed9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5AZ87 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------------------------------
On HiSilicon Hip09 platform, there is a CPA (Coherency Protocol Agent) on each SICL (Super IO Cluster) which implements packet format translation, route parsing and traffic statistics.
CPA PMU has 8 PMU counters and interrupt is supported to handle counter overflow. Let's support its driver under the framework of HiSilicon PMU driver.
Signed-off-by: Qi Liu liuqi115@huawei.com Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Shaokun Zhang zhangshaokun@hisilicon.com Link: https://lore.kernel.org/r/20220415102352.6665-3-liuqi115@huawei.com Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Wangming Shao shaowangming@h-partners.com Reviewed-by: Junhao He hejunhao3@huawei.com Reviewed-by: Junhao He hejunhao3@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/perf/hisilicon/Makefile | 2 +- drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c | 409 +++++++++++++++++++ include/linux/cpuhotplug.h | 3 + 3 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile index 506ed39e3266..6be83517acaa 100644 --- a/drivers/perf/hisilicon/Makefile +++ b/drivers/perf/hisilicon/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \ hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \ - hisi_uncore_pa_pmu.o + hisi_uncore_pa_pmu.o hisi_uncore_cpa_pmu.o
obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o diff --git a/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c new file mode 100644 index 000000000000..a9bb73f76be4 --- /dev/null +++ b/drivers/perf/hisilicon/hisi_uncore_cpa_pmu.c @@ -0,0 +1,409 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * HiSilicon SoC CPA(Coherency Protocol Agent) hardware event counters support + * + * Copyright (C) 2022 HiSilicon Limited + * Author: Qi Liu liuqi115@huawei.com + * + * This code is based on the uncore PMUs like arm-cci and arm-ccn. + */ + +#define pr_fmt(fmt) "cpa pmu: " fmt +#include <linux/acpi.h> +#include <linux/bug.h> +#include <linux/cpuhotplug.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/list.h> +#include <linux/smp.h> + +#include "hisi_uncore_pmu.h" + +/* CPA register definition */ +#define CPA_PERF_CTRL 0x1c00 +#define CPA_EVENT_CTRL 0x1c04 +#define CPA_INT_MASK 0x1c70 +#define CPA_INT_STATUS 0x1c78 +#define CPA_INT_CLEAR 0x1c7c +#define CPA_EVENT_TYPE0 0x1c80 +#define CPA_VERSION 0x1cf0 +#define CPA_CNT0_LOWER 0x1d00 +#define CPA_CFG_REG 0x0534 + +/* CPA operation command */ +#define CPA_PERF_CTRL_EN BIT_ULL(0) +#define CPA_EVTYPE_MASK 0xffUL +#define CPA_PM_CTRL BIT_ULL(9) + +/* CPA has 8-counters */ +#define CPA_NR_COUNTERS 0x8 +#define CPA_COUNTER_BITS 64 +#define CPA_NR_EVENTS 0xff +#define CPA_REG_OFFSET 0x8 + +static u32 hisi_cpa_pmu_get_counter_offset(int idx) +{ + return (CPA_CNT0_LOWER + idx * CPA_REG_OFFSET); +} + +static u64 hisi_cpa_pmu_read_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + return readq(cpa_pmu->base + hisi_cpa_pmu_get_counter_offset(hwc->idx)); +} + +static void hisi_cpa_pmu_write_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc, u64 val) +{ + writeq(val, cpa_pmu->base + hisi_cpa_pmu_get_counter_offset(hwc->idx)); +} + +static void hisi_cpa_pmu_write_evtype(struct hisi_pmu *cpa_pmu, int idx, + u32 type) +{ + u32 reg, reg_idx, shift, val; + + /* + * Select the appropriate event select register(CPA_EVENT_TYPE0/1). + * There are 2 event select registers for the 8 hardware counters. + * Event code is 8-bits and for the former 4 hardware counters, + * CPA_EVENT_TYPE0 is chosen. For the latter 4 hardware counters, + * CPA_EVENT_TYPE1 is chosen. + */ + reg = CPA_EVENT_TYPE0 + (idx / 4) * 4; + reg_idx = idx % 4; + shift = CPA_REG_OFFSET * reg_idx; + + /* Write event code to CPA_EVENT_TYPEx Register */ + val = readl(cpa_pmu->base + reg); + val &= ~(CPA_EVTYPE_MASK << shift); + val |= type << shift; + writel(val, cpa_pmu->base + reg); +} + +static void hisi_cpa_pmu_start_counters(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_PERF_CTRL); + val |= CPA_PERF_CTRL_EN; + writel(val, cpa_pmu->base + CPA_PERF_CTRL); +} + +static void hisi_cpa_pmu_stop_counters(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_PERF_CTRL); + val &= ~(CPA_PERF_CTRL_EN); + writel(val, cpa_pmu->base + CPA_PERF_CTRL); +} + +static void hisi_cpa_pmu_disable_pm(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_CFG_REG); + val |= CPA_PM_CTRL; + writel(val, cpa_pmu->base + CPA_CFG_REG); +} + +static void hisi_cpa_pmu_enable_pm(struct hisi_pmu *cpa_pmu) +{ + u32 val; + + val = readl(cpa_pmu->base + CPA_CFG_REG); + val &= ~(CPA_PM_CTRL); + writel(val, cpa_pmu->base + CPA_CFG_REG); +} + +static void hisi_cpa_pmu_enable_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Enable counter index in CPA_EVENT_CTRL register */ + val = readl(cpa_pmu->base + CPA_EVENT_CTRL); + val |= 1 << hwc->idx; + writel(val, cpa_pmu->base + CPA_EVENT_CTRL); +} + +static void hisi_cpa_pmu_disable_counter(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Clear counter index in CPA_EVENT_CTRL register */ + val = readl(cpa_pmu->base + CPA_EVENT_CTRL); + val &= ~(1UL << hwc->idx); + writel(val, cpa_pmu->base + CPA_EVENT_CTRL); +} + +static void hisi_cpa_pmu_enable_counter_int(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 0 to enable interrupt */ + val = readl(cpa_pmu->base + CPA_INT_MASK); + val &= ~(1UL << hwc->idx); + writel(val, cpa_pmu->base + CPA_INT_MASK); +} + +static void hisi_cpa_pmu_disable_counter_int(struct hisi_pmu *cpa_pmu, + struct hw_perf_event *hwc) +{ + u32 val; + + /* Write 1 to mask interrupt */ + val = readl(cpa_pmu->base + CPA_INT_MASK); + val |= 1 << hwc->idx; + writel(val, cpa_pmu->base + CPA_INT_MASK); +} + +static u32 hisi_cpa_pmu_get_int_status(struct hisi_pmu *cpa_pmu) +{ + return readl(cpa_pmu->base + CPA_INT_STATUS); +} + +static void hisi_cpa_pmu_clear_int_status(struct hisi_pmu *cpa_pmu, int idx) +{ + writel(1 << idx, cpa_pmu->base + CPA_INT_CLEAR); +} + +static const struct acpi_device_id hisi_cpa_pmu_acpi_match[] = { + { "HISI0281", }, + {} +}; +MODULE_DEVICE_TABLE(acpi, hisi_cpa_pmu_acpi_match); + +static int hisi_cpa_pmu_init_data(struct platform_device *pdev, + struct hisi_pmu *cpa_pmu) +{ + if (device_property_read_u32(&pdev->dev, "hisilicon,scl-id", + &cpa_pmu->sicl_id)) { + dev_err(&pdev->dev, "Can not read sicl-id\n"); + return -EINVAL; + } + + if (device_property_read_u32(&pdev->dev, "hisilicon,idx-id", + &cpa_pmu->index_id)) { + dev_err(&pdev->dev, "Cannot read idx-id\n"); + return -EINVAL; + } + + cpa_pmu->ccl_id = -1; + cpa_pmu->sccl_id = -1; + cpa_pmu->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(cpa_pmu->base)) + return PTR_ERR(cpa_pmu->base); + + cpa_pmu->identifier = readl(cpa_pmu->base + CPA_VERSION); + + return 0; +} + +static struct attribute *hisi_cpa_pmu_format_attr[] = { + HISI_PMU_FORMAT_ATTR(event, "config:0-15"), + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_format_group = { + .name = "format", + .attrs = hisi_cpa_pmu_format_attr, +}; + +static struct attribute *hisi_cpa_pmu_events_attr[] = { + HISI_PMU_EVENT_ATTR(cpa_cycles, 0x00), + HISI_PMU_EVENT_ATTR(cpa_p1_wr_dat, 0x61), + HISI_PMU_EVENT_ATTR(cpa_p1_rd_dat, 0x62), + HISI_PMU_EVENT_ATTR(cpa_p0_wr_dat, 0xE1), + HISI_PMU_EVENT_ATTR(cpa_p0_rd_dat, 0xE2), + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_events_group = { + .name = "events", + .attrs = hisi_cpa_pmu_events_attr, +}; + +static DEVICE_ATTR(cpumask, 0444, hisi_cpumask_sysfs_show, NULL); + +static struct attribute *hisi_cpa_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_cpumask_attr_group = { + .attrs = hisi_cpa_pmu_cpumask_attrs, +}; + +static struct device_attribute hisi_cpa_pmu_identifier_attr = + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL); + +static struct attribute *hisi_cpa_pmu_identifier_attrs[] = { + &hisi_cpa_pmu_identifier_attr.attr, + NULL +}; + +static const struct attribute_group hisi_cpa_pmu_identifier_group = { + .attrs = hisi_cpa_pmu_identifier_attrs, +}; + +static const struct attribute_group *hisi_cpa_pmu_attr_groups[] = { + &hisi_cpa_pmu_format_group, + &hisi_cpa_pmu_events_group, + &hisi_cpa_pmu_cpumask_attr_group, + &hisi_cpa_pmu_identifier_group, + NULL +}; + +static const struct hisi_uncore_ops hisi_uncore_cpa_pmu_ops = { + .write_evtype = hisi_cpa_pmu_write_evtype, + .get_event_idx = hisi_uncore_pmu_get_event_idx, + .start_counters = hisi_cpa_pmu_start_counters, + .stop_counters = hisi_cpa_pmu_stop_counters, + .enable_counter = hisi_cpa_pmu_enable_counter, + .disable_counter = hisi_cpa_pmu_disable_counter, + .enable_counter_int = hisi_cpa_pmu_enable_counter_int, + .disable_counter_int = hisi_cpa_pmu_disable_counter_int, + .write_counter = hisi_cpa_pmu_write_counter, + .read_counter = hisi_cpa_pmu_read_counter, + .get_int_status = hisi_cpa_pmu_get_int_status, + .clear_int_status = hisi_cpa_pmu_clear_int_status, +}; + +static int hisi_cpa_pmu_dev_probe(struct platform_device *pdev, + struct hisi_pmu *cpa_pmu) +{ + int ret; + + ret = hisi_cpa_pmu_init_data(pdev, cpa_pmu); + if (ret) + return ret; + + ret = hisi_uncore_pmu_init_irq(cpa_pmu, pdev); + if (ret) + return ret; + + cpa_pmu->counter_bits = CPA_COUNTER_BITS; + cpa_pmu->check_event = CPA_NR_EVENTS; + cpa_pmu->pmu_events.attr_groups = hisi_cpa_pmu_attr_groups; + cpa_pmu->ops = &hisi_uncore_cpa_pmu_ops; + cpa_pmu->num_counters = CPA_NR_COUNTERS; + cpa_pmu->dev = &pdev->dev; + cpa_pmu->on_cpu = -1; + + return 0; +} + +static int hisi_cpa_pmu_probe(struct platform_device *pdev) +{ + struct hisi_pmu *cpa_pmu; + char *name; + int ret; + + cpa_pmu = devm_kzalloc(&pdev->dev, sizeof(*cpa_pmu), GFP_KERNEL); + if (!cpa_pmu) + return -ENOMEM; + + ret = hisi_cpa_pmu_dev_probe(pdev, cpa_pmu); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_sicl%d_cpa%u", + cpa_pmu->sicl_id, cpa_pmu->index_id); + if (!name) + return -ENOMEM; + + cpa_pmu->pmu = (struct pmu) { + .name = name, + .module = THIS_MODULE, + .task_ctx_nr = perf_invalid_context, + .event_init = hisi_uncore_pmu_event_init, + .pmu_enable = hisi_uncore_pmu_enable, + .pmu_disable = hisi_uncore_pmu_disable, + .add = hisi_uncore_pmu_add, + .del = hisi_uncore_pmu_del, + .start = hisi_uncore_pmu_start, + .stop = hisi_uncore_pmu_stop, + .read = hisi_uncore_pmu_read, + .attr_groups = cpa_pmu->pmu_events.attr_groups, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + }; + + /* Power Management should be disabled before using CPA PMU. */ + hisi_cpa_pmu_disable_pm(cpa_pmu); + ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + &cpa_pmu->node); + if (ret) { + dev_err(&pdev->dev, "Error %d registering hotplug\n", ret); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return ret; + } + + ret = perf_pmu_register(&cpa_pmu->pmu, name, -1); + if (ret) { + dev_err(cpa_pmu->dev, "PMU register failed\n"); + cpuhp_state_remove_instance_nocalls( + CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, &cpa_pmu->node); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return ret; + } + + platform_set_drvdata(pdev, cpa_pmu); + return ret; +} + +static int hisi_cpa_pmu_remove(struct platform_device *pdev) +{ + struct hisi_pmu *cpa_pmu = platform_get_drvdata(pdev); + + perf_pmu_unregister(&cpa_pmu->pmu); + cpuhp_state_remove_instance_nocalls(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + &cpa_pmu->node); + hisi_cpa_pmu_enable_pm(cpa_pmu); + return 0; +} + +static struct platform_driver hisi_cpa_pmu_driver = { + .driver = { + .name = "hisi_cpa_pmu", + .acpi_match_table = ACPI_PTR(hisi_cpa_pmu_acpi_match), + .suppress_bind_attrs = true, + }, + .probe = hisi_cpa_pmu_probe, + .remove = hisi_cpa_pmu_remove, +}; + +static int __init hisi_cpa_pmu_module_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + "AP_PERF_ARM_HISI_CPA_ONLINE", + hisi_uncore_pmu_online_cpu, + hisi_uncore_pmu_offline_cpu); + if (ret) { + pr_err("setup hotplug failed: %d\n", ret); + return ret; + } + + ret = platform_driver_register(&hisi_cpa_pmu_driver); + if (ret) + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE); + + return ret; +} +module_init(hisi_cpa_pmu_module_init); + +static void __exit hisi_cpa_pmu_module_exit(void) +{ + platform_driver_unregister(&hisi_cpa_pmu_driver); + cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE); +} +module_exit(hisi_cpa_pmu_module_exit); + +MODULE_DESCRIPTION("HiSilicon SoC CPA PMU driver"); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Qi Liu liuqi115@huawei.com"); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 24e7be132046..5571bfc2ec6e 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -173,6 +173,9 @@ enum cpuhp_state { CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, + #ifndef __GENKSYMS__ + CPUHP_AP_PERF_ARM_HISI_CPA_ONLINE, + #endif CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit e56e8310a3ea2751463ab8ed03dd64baab3fee46.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/drop_caches.c | 36 ++------------------------------ include/linux/fs.h | 9 -------- include/linux/page_cache_limit.h | 3 --- kernel/sysctl.c | 8 ------- mm/page_cache_limit.c | 2 -- mm/truncate.c | 34 +++--------------------------- 6 files changed, 5 insertions(+), 87 deletions(-)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c index ff70ef7674e3..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -9,17 +9,12 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> - -#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif - #include "internal.h"
/* A global variable is a bit ugly, but it keeps the code simple */ int sysctl_drop_caches;
-static void drop_pagecache_sb(struct super_block *sb, void *nid) +static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL;
@@ -40,12 +35,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *nid) spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock);
- if (!nid) - invalidate_mapping_pages(inode->i_mapping, 0, -1); - else - node_invalidate_mapping_pages(inode->i_mapping, - *(int *)nid, 0, -1); - + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode;
@@ -84,25 +74,3 @@ int drop_caches_sysctl_handler(struct ctl_table *table, int write, } return 0; } - -#ifdef CONFIG_SHRINK_PAGECACHE -int proc_shrink_node_caches(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (ret || !write) - return ret; - - if (node_to_shrink >= MAX_NUMNODES) - return -EINVAL; - - if (!node_isset(node_to_shrink, node_states[N_MEMORY])) - return 0; - - iterate_supers(drop_pagecache_sb, &node_to_shrink); - - return 0; -} -#endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 18259e38dcd7..5c71b826c89c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2680,15 +2680,6 @@ extern bool is_bad_inode(struct inode *); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end);
-#ifdef CONFIG_SHRINK_PAGECACHE -unsigned long node_invalidate_mapping_pages(struct address_space *mapping, - int nid, pgoff_t start, pgoff_t end); -#else -static inline unsigned long -node_invalidate_mapping_pages(struct address_space *mapping, int nid, - pgoff_t start, pgoff_t end) { return 0; } -#endif - void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec); diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 442d6126c529..2df08a0604d8 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -12,7 +12,6 @@ enum page_cache_reclaim_flag { extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio; -extern int node_to_shrink;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -21,8 +20,6 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); void wakeup_all_kpagecache_limitd(void); -int proc_shrink_node_caches(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 91812d673c6b..c81713cd19f0 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -3229,14 +3229,6 @@ static struct ctl_table vm_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = (void *)&one_hundred, }, - { - .procname = "node_drop_caches", - .data = &node_to_shrink, - .maxlen = sizeof(node_to_shrink), - .mode = 0600, - .proc_handler = proc_shrink_node_caches, - .extra1 = SYSCTL_ZERO, - }, #endif #ifdef CONFIG_ASCEND_SHARE_POOL { diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 0ccc1388c8dc..0a3098c9bb33 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -5,13 +5,11 @@ #include <linux/module.h> #include <linux/err.h> #include <linux/swap.h> -#include <linux/fs.h> #include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; int pagecache_reclaim_ratio; -int node_to_shrink;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; diff --git a/mm/truncate.c b/mm/truncate.c index 6d4887a43cd8..98d08f197766 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -465,7 +465,7 @@ void truncate_inode_pages_final(struct address_space *mapping) EXPORT_SYMBOL(truncate_inode_pages_final);
static unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, unsigned long *nr_pagevec, int nid) + pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; @@ -487,10 +487,6 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, page); continue; } - - if (nid != NUMA_NO_NODE && page_to_nid(page) != nid) - continue; - index += thp_nr_pages(page) - 1;
ret = invalidate_inode_page(page); @@ -533,34 +529,10 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping, unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { - return __invalidate_mapping_pages(mapping, start, end, NULL, NUMA_NO_NODE); + return __invalidate_mapping_pages(mapping, start, end, NULL); } EXPORT_SYMBOL(invalidate_mapping_pages);
- -/** - * node_invalidate_mapping_pages - Invalidate all the unlocked pages in @nid of one inode - * @mapping: the address_space which holds the pages to invalidate - * @nid: pages belong to this node will be invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * node_invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - * - * Return: the number of the pages that were invalidated - */ -#ifdef CONFIG_SHRINK_PAGECACHE -unsigned long node_invalidate_mapping_pages(struct address_space *mapping, - int nid, pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, NULL, nid); -} -#endif /** * This helper is similar with the above one, except that it accounts for pages * that are likely on a pagevec and count them in @nr_pagevec, which will used by @@ -569,7 +541,7 @@ unsigned long node_invalidate_mapping_pages(struct address_space *mapping, void invalidate_mapping_pagevec(struct address_space *mapping, pgoff_t start, pgoff_t end, unsigned long *nr_pagevec) { - __invalidate_mapping_pages(mapping, start, end, nr_pagevec, NUMA_NO_NODE); + __invalidate_mapping_pages(mapping, start, end, nr_pagevec); }
/*
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit 9fea105d74885fef299b43e6734d19ba8921242e.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 2 -- include/linux/pagemap.h | 2 -- mm/filemap.c | 2 -- 3 files changed, 6 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 2df08a0604d8..7906b12af947 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -19,11 +19,9 @@ unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); void kpagecache_limitd_stop(int nid); int kpagecache_limitd_run(int nid); -void wakeup_all_kpagecache_limitd(void); #else static inline void kpagecache_limitd_stop(int nid) {} static inline int kpagecache_limitd_run(int nid) { return 0; } -static inline void wakeup_all_kpagecache_limitd(void) {} #endif
#endif diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index dbb25f1dc2e9..0bfa9cce6589 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -15,7 +15,6 @@ #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> -#include <linux/page_cache_limit.h>
struct pagevec;
@@ -778,7 +777,6 @@ static inline int add_to_page_cache(struct page *page, { int error;
- wakeup_all_kpagecache_limitd(); __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) diff --git a/mm/filemap.c b/mm/filemap.c index edb94663c5df..98b448d9873f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -42,7 +42,6 @@ #include <linux/psi.h> #include <linux/ramfs.h> #include <linux/page_idle.h> -#include <linux/page_cache_limit.h> #include "internal.h"
#define CREATE_TRACE_POINTS @@ -924,7 +923,6 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, void *shadow = NULL; int ret;
- wakeup_all_kpagecache_limitd(); __SetPageLocked(page); ret = __add_to_page_cache_locked(page, mapping, offset, gfp_mask, &shadow);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit 955d63aec936df0ffbb53118ab28b4c208ac8abf.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 4 --- mm/memory_hotplug.c | 3 --- mm/page_cache_limit.c | 45 +++++++------------------------- 3 files changed, 9 insertions(+), 43 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index 7906b12af947..e4ef5919cb92 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -17,11 +17,7 @@ int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); -void kpagecache_limitd_stop(int nid); -int kpagecache_limitd_run(int nid); #else -static inline void kpagecache_limitd_stop(int nid) {} -static inline int kpagecache_limitd_run(int nid) { return 0; } #endif
#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 73ea92dae74a..7456d825414d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -38,7 +38,6 @@ #include <linux/rmap.h>
#include <asm/tlbflush.h> -#include <linux/page_cache_limit.h>
#include "internal.h" #include "shuffle.h" @@ -736,7 +735,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
kswapd_run(nid); kcompactd_run(nid); - kpagecache_limitd_run(nid);
writeback_set_ratelimit();
@@ -1487,7 +1485,6 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages) if (arg.status_change_nid >= 0) { kswapd_stop(node); kcompactd_stop(node); - kpagecache_limitd_stop(node); }
writeback_set_ratelimit(); diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 0a3098c9bb33..1581334429e1 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -31,27 +31,18 @@ static unsigned long get_node_total_pages(int nid) return managed_pages; }
-static void setup_node_pagecache_limit(int nid) -{ - unsigned long node_total_pages; - - node_total_pages = get_node_total_pages(nid); - node_pagecache_limit_pages[nid] = node_total_pages * pagecache_limit_ratio / 100; -} - -#define ALL_NODE (-1) -static void setup_pagecache_limit(int nid) +static void setup_pagecache_limit(void) { int i; + unsigned long node_total_pages;
pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100;
- if (nid != ALL_NODE) - setup_node_pagecache_limit(nid); - - else - for (i = 0; i < MAX_NUMNODES; i++) - setup_node_pagecache_limit(i); + for (i = 0; i < MAX_NUMNODES; i++) { + node_total_pages = get_node_total_pages(i); + node_pagecache_limit_pages[i] = node_total_pages * + pagecache_limit_ratio / 100; + } }
int proc_page_cache_limit(struct ctl_table *table, int write, @@ -62,7 +53,7 @@ int proc_page_cache_limit(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (write && !ret) - setup_pagecache_limit(ALL_NODE); + setup_pagecache_limit();
return ret; } @@ -81,8 +72,6 @@ void kpagecache_limitd_stop(int nid) kvfree(pagecache_limitd_wait_queue[nid]); pagecache_limitd_wait_queue[nid] = NULL; } - - setup_pagecache_limit(nid); }
static void wakeup_kpagecache_limitd(int nid) @@ -218,7 +207,7 @@ static int pagecache_limitd(void *arg) return 0; }
-static int __kpagecache_limitd_run(int nid) +int kpagecache_limitd_run(int nid) { int ret = 0; wait_queue_head_t *queue_head = NULL; @@ -247,22 +236,6 @@ static int __kpagecache_limitd_run(int nid) return ret; }
-int kpagecache_limitd_run(int nid) -{ - int ret; - - if (nid < 0 || nid >= MAX_NUMNODES) - return -EINVAL; - - ret = __kpagecache_limitd_run(nid); - if (ret) - return ret; - - setup_pagecache_limit(nid); - - return 0; -} - static int __init kpagecache_limitd_init(void) { int nid;
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit 7be2f4c4fdd7938f161ff6e43a0c60d9c9412a62.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 9 ------ mm/page_cache_limit.c | 27 +++------------- mm/vmscan.c | 53 ++------------------------------ 3 files changed, 6 insertions(+), 83 deletions(-)
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h index e4ef5919cb92..98f12734114b 100644 --- a/include/linux/page_cache_limit.h +++ b/include/linux/page_cache_limit.h @@ -2,21 +2,12 @@ #define _PAGECACHE_H
#ifdef CONFIG_SHRINK_PAGECACHE -enum page_cache_reclaim_flag { - PAGE_CACHE_RECLAIM_NO_UNMAP, - PAGE_CACHE_RECLAIM_UNMAP, - PAGE_CACHE_RECLAIM_WRITEPAGE, - PAGE_CACHE_RECLAIM_NR_FLAGS, -}; - extern int pagecache_reclaim_enable; extern int pagecache_limit_ratio; extern int pagecache_reclaim_ratio;
int proc_page_cache_limit(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -unsigned long __shrink_node_page_cache(int nid, gfp_t mask, - unsigned long nr_to_reclaim, enum page_cache_reclaim_flag flag); #else #endif
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 1581334429e1..33164e19cfa2 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -4,8 +4,6 @@ #include <linux/kthread.h> #include <linux/module.h> #include <linux/err.h> -#include <linux/swap.h> -#include <linux/page_cache_limit.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -144,31 +142,14 @@ static unsigned long node_nr_page_reclaim(int nid) return nr_to_reclaim; }
-static void shrink_node_page_cache(int nid, gfp_t mask) +static void shrink_node_page_cache(int nid) { - int i; unsigned long nr_to_reclaim; - unsigned long nr_reclaimed; - enum page_cache_reclaim_flag flag;
nr_to_reclaim = node_nr_page_reclaim(nid); - if (nr_to_reclaim <= 0) - return; - - flag = 0; - for (i = PAGE_CACHE_RECLAIM_NO_UNMAP; - i < PAGE_CACHE_RECLAIM_NR_FLAGS; i++) { - nr_reclaimed = __shrink_node_page_cache(nid, mask, nr_to_reclaim, flag); - nr_to_reclaim -= nr_reclaimed; - - if (nr_to_reclaim <= 0) - break; - - flag |= i; - } }
-static void shrink_page_cache(gfp_t mask) +static void shrink_page_cache(void) { int nid;
@@ -176,7 +157,7 @@ static void shrink_page_cache(gfp_t mask) return;
for_each_node_state(nid, N_MEMORY) - shrink_node_page_cache(nid, mask); + shrink_node_page_cache(nid); }
static DECLARE_COMPLETION(setup_done); @@ -192,7 +173,7 @@ static int pagecache_limitd(void *arg) set_freezable(); for (;;) { try_to_freeze(); - shrink_page_cache(GFP_KERNEL | __GFP_HIGHMEM); + shrink_page_cache();
prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, TASK_INTERRUPTIBLE); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1bfbe1fc67d0..c504e530287b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -66,10 +66,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/vmscan.h>
-#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif - struct scan_control { /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; @@ -130,9 +126,6 @@ struct scan_control { /* The file pages on the current node are dangerously low */ unsigned int file_is_tiny:1;
- /* can't shrink slab pages */ - unsigned int no_shrink_slab:1; - /* Allocation order */ s8 order;
@@ -2906,9 +2899,8 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
shrink_lruvec(lruvec, sc);
- if (!sc->no_shrink_slab) - shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, - sc->priority); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority);
/* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -4644,44 +4636,3 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr) return page; } EXPORT_SYMBOL_GPL(get_page_from_vaddr); - -#ifdef CONFIG_SHRINK_PAGECACHE -/* - * return the number of reclaimed pages - */ -unsigned long __shrink_node_page_cache(int nid, gfp_t mask, unsigned long nr_to_reclaim, - enum page_cache_reclaim_flag reclaim_flag) -{ - struct scan_control sc = { - .nr_to_reclaim = nr_to_reclaim, - .gfp_mask = mask, - .may_swap = 0, - .may_unmap = reclaim_flag | PAGE_CACHE_RECLAIM_UNMAP, - .may_writepage = reclaim_flag | PAGE_CACHE_RECLAIM_WRITEPAGE, - .target_mem_cgroup = NULL, - .priority = DEF_PRIORITY, - .reclaim_idx = MAX_NR_ZONES, - .no_shrink_slab = 1, - }; - - struct zonelist *zonelist = node_zonelist(nid, __GFP_THISNODE); - struct reclaim_state *old_rs = current->reclaim_state; - unsigned long nr_reclaimed; - unsigned int noreclaim_flag; - - if (!(mask & __GFP_RECLAIM)) - return 0; - - noreclaim_flag = memalloc_noreclaim_save(); - fs_reclaim_acquire(sc.gfp_mask); - current->reclaim_state = NULL; - - nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - - current->reclaim_state = old_rs; - fs_reclaim_release(sc.gfp_mask); - memalloc_noreclaim_restore(noreclaim_flag); - - return nr_reclaimed; -} -#endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit 425bce986237e0409673c28774bbe546b433dee7.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 52 +------------------------------------------ 1 file changed, 1 insertion(+), 51 deletions(-)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 33164e19cfa2..4afc08373a35 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -104,60 +104,10 @@ void wakeup_all_kpagecache_limitd(void) wakeup_kpagecache_limitd(nid); }
-static unsigned long node_nr_page_cache(int nid) -{ - struct pglist_data *pgdat; - unsigned long num = 0; - - pgdat = NODE_DATA(nid); - if (!pgdat) - return 0; - - num = node_page_state(pgdat, NR_FILE_PAGES); - num -= node_page_state(pgdat, NR_SHMEM); - - return num; -} - -static unsigned long node_nr_page_reclaim(int nid) -{ - unsigned long nr_page_cache; - unsigned long nr_to_reclaim; - unsigned long total_pages; - - if (!node_pagecache_limit_pages[nid]) - return 0; - - nr_page_cache = node_nr_page_cache(nid); - if (!nr_page_cache) - return 0; - - if (nr_page_cache < node_pagecache_limit_pages[nid]) - return 0; - - total_pages = get_node_total_pages(nid); - nr_to_reclaim = nr_page_cache - node_pagecache_limit_pages[nid]; - nr_to_reclaim += total_pages * pagecache_reclaim_ratio / 100; - - return nr_to_reclaim; -} - -static void shrink_node_page_cache(int nid) -{ - unsigned long nr_to_reclaim; - - nr_to_reclaim = node_nr_page_reclaim(nid); -} - static void shrink_page_cache(void) { - int nid; - - if (!pagecache_reclaim_enable || !pagecache_overlimit()) + if (!pagecache_overlimit()) return; - - for_each_node_state(nid, N_MEMORY) - shrink_node_page_cache(nid); }
static DECLARE_COMPLETION(setup_done);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit b072a9d4198f820a00979e0c38ee0cd85a55b779.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/page_cache_limit.c | 133 ------------------------------------------ 1 file changed, 133 deletions(-)
diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c index 4afc08373a35..55fdea087804 100644 --- a/mm/page_cache_limit.c +++ b/mm/page_cache_limit.c @@ -1,9 +1,5 @@ #include <linux/mm.h> #include <linux/sysctl.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <linux/module.h> -#include <linux/err.h>
int pagecache_reclaim_enable; int pagecache_limit_ratio; @@ -11,8 +7,6 @@ int pagecache_reclaim_ratio;
static unsigned long pagecache_limit_pages; static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; -static wait_queue_head_t *pagecache_limitd_wait_queue[MAX_NUMNODES]; -static struct task_struct *pagecache_limitd_tasks[MAX_NUMNODES];
static unsigned long get_node_total_pages(int nid) { @@ -55,130 +49,3 @@ int proc_page_cache_limit(struct ctl_table *table, int write,
return ret; } - -void kpagecache_limitd_stop(int nid) -{ - if (nid < 0 || nid >= MAX_NUMNODES) - return; - - if (pagecache_limitd_tasks[nid]) { - kthread_stop(pagecache_limitd_tasks[nid]); - pagecache_limitd_tasks[nid] = NULL; - } - - if (pagecache_limitd_wait_queue[nid]) { - kvfree(pagecache_limitd_wait_queue[nid]); - pagecache_limitd_wait_queue[nid] = NULL; - } -} - -static void wakeup_kpagecache_limitd(int nid) -{ - if (!pagecache_limitd_wait_queue[nid]) - return; - - if (!waitqueue_active(pagecache_limitd_wait_queue[nid])) - return; - - wake_up_interruptible(pagecache_limitd_wait_queue[nid]); -} - -static bool pagecache_overlimit(void) -{ - unsigned long total_pagecache; - - total_pagecache = global_node_page_state(NR_FILE_PAGES); - total_pagecache -= global_node_page_state(NR_SHMEM); - - return total_pagecache > pagecache_limit_pages; -} - -void wakeup_all_kpagecache_limitd(void) -{ - int nid; - - if (!pagecache_reclaim_enable || !pagecache_overlimit()) - return; - - for_each_node_state(nid, N_MEMORY) - wakeup_kpagecache_limitd(nid); -} - -static void shrink_page_cache(void) -{ - if (!pagecache_overlimit()) - return; -} - -static DECLARE_COMPLETION(setup_done); -static int pagecache_limitd(void *arg) -{ - DEFINE_WAIT(wait); - int nid = *(int *)arg; - - if (nid < 0 || nid >= MAX_NUMNODES) - nid = numa_node_id(); - - complete(&setup_done); - set_freezable(); - for (;;) { - try_to_freeze(); - shrink_page_cache(); - - prepare_to_wait(pagecache_limitd_wait_queue[nid], &wait, - TASK_INTERRUPTIBLE); - if (kthread_should_stop()) - break; - schedule(); - finish_wait(pagecache_limitd_wait_queue[nid], &wait); - } - - finish_wait(pagecache_limitd_wait_queue[nid], &wait); - - return 0; -} - -int kpagecache_limitd_run(int nid) -{ - int ret = 0; - wait_queue_head_t *queue_head = NULL; - - if (pagecache_limitd_tasks[nid] && pagecache_limitd_wait_queue[nid]) - return 0; - - queue_head = kvmalloc(sizeof(wait_queue_head_t), GFP_KERNEL); - if (!queue_head) - return -ENOMEM; - - init_waitqueue_head(queue_head); - pagecache_limitd_wait_queue[nid] = queue_head; - pagecache_limitd_tasks[nid] = kthread_run(pagecache_limitd, - (void *)&nid, "kpagecache_limitd%d", nid); - - if (IS_ERR(pagecache_limitd_tasks[nid])) { - BUG_ON(system_state < SYSTEM_RUNNING); - ret = PTR_ERR(pagecache_limitd_tasks[nid]); - pr_err("Failed to start pagecache_limitd on node %d\n", nid); - pagecache_limitd_tasks[nid] = NULL; - kvfree(queue_head); - } else - wait_for_completion(&setup_done); - - return ret; -} - -static int __init kpagecache_limitd_init(void) -{ - int nid; - int ret; - - for_each_node_state(nid, N_MEMORY) { - ret = kpagecache_limitd_run(nid); - if (ret == -ENOMEM) - break; - } - - return 0; -} - -module_init(kpagecache_limitd_init);
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56I4P CVE: NA backport: openEuler-22.03-LTS
--------------------------------
This reverts commit 933db18abca2a5d0a2eaa2fc40a85c2d88cf896a.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Tong Tiangen tongtiangen@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page_cache_limit.h | 14 --------- kernel/sysctl.c | 32 -------------------- mm/Kconfig | 12 -------- mm/Makefile | 1 - mm/page_cache_limit.c | 51 -------------------------------- 5 files changed, 110 deletions(-) delete mode 100644 include/linux/page_cache_limit.h delete mode 100644 mm/page_cache_limit.c
diff --git a/include/linux/page_cache_limit.h b/include/linux/page_cache_limit.h deleted file mode 100644 index 98f12734114b..000000000000 --- a/include/linux/page_cache_limit.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _PAGECACHE_H -#define _PAGECACHE_H - -#ifdef CONFIG_SHRINK_PAGECACHE -extern int pagecache_reclaim_enable; -extern int pagecache_limit_ratio; -extern int pagecache_reclaim_ratio; - -int proc_page_cache_limit(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); -#else -#endif - -#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c81713cd19f0..2f7b48f92103 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -104,9 +104,6 @@ #ifdef CONFIG_LOCKUP_DETECTOR #include <linux/nmi.h> #endif -#ifdef CONFIG_SHRINK_PAGECACHE -#include <linux/page_cache_limit.h> -#endif
#if defined(CONFIG_SYSCTL)
@@ -3201,35 +3198,6 @@ static struct ctl_table vm_table[] = { .extra2 = SYSCTL_ONE, }, #endif -#ifdef CONFIG_SHRINK_PAGECACHE - { - .procname = "cache_reclaim_enable", - .data = &pagecache_reclaim_enable, - .maxlen = sizeof(pagecache_reclaim_enable), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, - { - .procname = "cache_limit_ratio", - .data = &pagecache_limit_ratio, - .maxlen = sizeof(pagecache_limit_ratio), - .mode = 0600, - .proc_handler = proc_page_cache_limit, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&one_hundred, - }, - { - .procname = "cache_reclaim_ratio", - .data = &pagecache_reclaim_ratio, - .maxlen = sizeof(pagecache_reclaim_ratio), - .mode = 0600, - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = (void *)&one_hundred, - }, -#endif #ifdef CONFIG_ASCEND_SHARE_POOL { .procname = "sharepool_debug_mode", diff --git a/mm/Kconfig b/mm/Kconfig index 4475bd9f8762..9e66dfb15c52 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -500,18 +500,6 @@ config FRONTSWAP
If unsure, say Y to enable frontswap.
-config SHRINK_PAGECACHE - bool "Enable shrinking the page cache" - depends on MMU - default n - help - SHRINK_PAGECACHE means that we do not want to keep the large number - of page cache in the system, even though page cache can greatly improve - the performance of the machine. Large number of page cache may result - in short of memory, which will result OOM at the same time, so in order - to keep page cache in a reasonable range, the number of page cache - should be limited, and that is what SHRINK_PAGECACHE does. - config MEMCG_QOS bool "Enable Memory Cgroup Priority" depends on MEMCG diff --git a/mm/Makefile b/mm/Makefile index d2a6a786f915..f782a0305654 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -126,7 +126,6 @@ obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PAGE_REPORTING) += page_reporting.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_PIN_MEMORY) += pin_mem.o -obj-$(CONFIG_SHRINK_PAGECACHE) += page_cache_limit.o obj-$(CONFIG_ASCEND_SHARE_POOL) += share_pool.o obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o diff --git a/mm/page_cache_limit.c b/mm/page_cache_limit.c deleted file mode 100644 index 55fdea087804..000000000000 --- a/mm/page_cache_limit.c +++ /dev/null @@ -1,51 +0,0 @@ -#include <linux/mm.h> -#include <linux/sysctl.h> - -int pagecache_reclaim_enable; -int pagecache_limit_ratio; -int pagecache_reclaim_ratio; - -static unsigned long pagecache_limit_pages; -static unsigned long node_pagecache_limit_pages[MAX_NUMNODES]; - -static unsigned long get_node_total_pages(int nid) -{ - int zone_type; - unsigned long managed_pages = 0; - pg_data_t *pgdat = NODE_DATA(nid); - - if (!pgdat) - return 0; - - for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) - managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]); - - return managed_pages; -} - -static void setup_pagecache_limit(void) -{ - int i; - unsigned long node_total_pages; - - pagecache_limit_pages = pagecache_limit_ratio * totalram_pages() / 100; - - for (i = 0; i < MAX_NUMNODES; i++) { - node_total_pages = get_node_total_pages(i); - node_pagecache_limit_pages[i] = node_total_pages * - pagecache_limit_ratio / 100; - } -} - -int proc_page_cache_limit(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - int ret; - - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (write && !ret) - setup_pagecache_limit(); - - return ret; -}
From: Gaosheng Cui cuigaosheng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0X7 CVE: NA
--------------------------------
Since arm64 does not use a builtin decompressor, the EFI stub is built into the kernel proper. So instead, separate the contents of libstub and its dependencies, by putting them into their own namespace by prefixing all of its symbols with __efistub.This way, we have tight control over what parts of the kernel proper are referenced by the stub.
Follow the rules and support strchr function for EFI stub, it will be used to parse cmdline args.
Signed-off-by: Gaosheng Cui cuigaosheng1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/image-vars.h | 1 + arch/arm64/lib/strchr.S | 4 ++-- drivers/firmware/efi/libstub/string.c | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index c615b285ff5b..7ea4b84f1518 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -35,6 +35,7 @@ __efistub_strnlen = __pi_strnlen; __efistub_strcmp = __pi_strcmp; __efistub_strncmp = __pi_strncmp; __efistub_strrchr = __pi_strrchr; +__efistub_strchr = __pi_strchr; __efistub___clean_dcache_area_poc = __pi___clean_dcache_area_poc;
#ifdef CONFIG_KASAN diff --git a/arch/arm64/lib/strchr.S b/arch/arm64/lib/strchr.S index 1f47eae3b0d6..5893ad8d4484 100644 --- a/arch/arm64/lib/strchr.S +++ b/arch/arm64/lib/strchr.S @@ -18,7 +18,7 @@ * Returns: * x0 - address of first occurrence of 'c' or 0 */ -SYM_FUNC_START_WEAK(strchr) +SYM_FUNC_START_WEAK_PI(strchr) and w1, w1, #0xff 1: ldrb w2, [x0], #1 cmp w2, w1 @@ -28,5 +28,5 @@ SYM_FUNC_START_WEAK(strchr) cmp w2, w1 csel x0, x0, xzr, eq ret -SYM_FUNC_END(strchr) +SYM_FUNC_END_PI(strchr) EXPORT_SYMBOL_NOKASAN(strchr) diff --git a/drivers/firmware/efi/libstub/string.c b/drivers/firmware/efi/libstub/string.c index 5d13e43869ee..006c9f0a8e0c 100644 --- a/drivers/firmware/efi/libstub/string.c +++ b/drivers/firmware/efi/libstub/string.c @@ -113,3 +113,21 @@ long simple_strtol(const char *cp, char **endp, unsigned int base)
return simple_strtoull(cp, endp, base); } + +#ifndef __HAVE_ARCH_STRCHR +/** + * strchr - Find the first occurrence of a character in a string + * @s: The string to be searched + * @c: The character to search for + * + * Note that the %NUL-terminator is considered part of the string, and can + * be searched for. + */ +char *strchr(const char *s, int c) +{ + for (; *s != (char)c; ++s) + if (*s == '\0') + return NULL; + return (char *)s; +} +#endif
From: Cui GaoSheng cuigaosheng1@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I5J0X7 CVE: NA
--------------------------------
CONFIG_RANDOMIZE_BASE=y relocates the kernel to a random base address.
However, on arm64, it does not take into account the memmap= parameter passed in from the kernel command line. This results in the kernel sometimes being put in the middle of memmap.
Add support for memmap kernel parameters parsing on ARM64. The below modes are only supported:
memmap=nn[KMG]$ss[KMG]
Region of memory to be reserved is from ss to ss+nn, the region must be in the range of existed memory, otherwise will be ignored.
Teach KASLR to not insert the kernel in memmap defined regions. We support up to 32 memmap regions: any additional regions will cause KASLR to disable.
Signed-off-by: Cui GaoSheng cuigaosheng1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/firmware/efi/libstub/arm64-stub.c | 89 +++++++++++++++++++ .../firmware/efi/libstub/efi-stub-helper.c | 2 + drivers/firmware/efi/libstub/efi-stub.c | 3 + drivers/firmware/efi/libstub/efistub.h | 10 +++ 4 files changed, 104 insertions(+)
diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c index 4ee5ced0c6a4..143e3c13e742 100644 --- a/drivers/firmware/efi/libstub/arm64-stub.c +++ b/drivers/firmware/efi/libstub/arm64-stub.c @@ -15,6 +15,95 @@
#include "efistub.h"
+#define MAX_MEMMAP_REGIONS 32 + +struct mem_vector { + unsigned long long start; + unsigned long long size; +}; + +static struct mem_vector mem_avoid[MAX_MEMMAP_REGIONS]; + +static int +efi_parse_memmap(char *p, unsigned long long *start, unsigned long long *size) +{ + char *oldp; + u64 mem_size; + + if (!p) + return -EINVAL; + + oldp = p; + mem_size = memparse(p, &p); + if (p == oldp) + return -EINVAL; + if (!mem_size) + return -EINVAL; + if (*p != '$') + return -EINVAL; + + *start = memparse(p + 1, &p); + *size = mem_size; + + return 0; +} + +void efi_parse_option_memmap(const char *str) +{ + int rc; + static int idx; + char *k, *p = (char *)str; + + while (p && (idx < MAX_MEMMAP_REGIONS)) { + k = strchr(p, ','); + if (k) + *k++ = 0; + + rc = efi_parse_memmap(p, &mem_avoid[idx].start, &mem_avoid[idx].size); + if (rc < 0) + efi_err("Failed to parse memmap cmdlines, index: %d, str: %s\n", idx, p); + + p = k; + idx++; + } +} + +void mem_avoid_memmap(void) +{ + int i; + efi_status_t status; + unsigned long nr_pages; + unsigned long long start, end; + + for (i = 0; i < MAX_MEMMAP_REGIONS; i++) { + if (!mem_avoid[i].size) + continue; + start = round_down(mem_avoid[i].start, EFI_ALLOC_ALIGN); + end = round_up(mem_avoid[i].start + mem_avoid[i].size, EFI_ALLOC_ALIGN); + nr_pages = (end - start) / EFI_PAGE_SIZE; + + mem_avoid[i].start = start; + mem_avoid[i].size = end - start; + status = efi_bs_call(allocate_pages, EFI_ALLOCATE_ADDRESS, + EFI_LOADER_DATA, nr_pages, &mem_avoid[i].start); + if (status != EFI_SUCCESS) { + efi_err("Failed to reserve memmap, index: %d, status: %lu\n", i, status); + mem_avoid[i].size = 0; + } + } +} + +void free_avoid_memmap(void) +{ + int i; + + for (i = 0; i < MAX_MEMMAP_REGIONS; i++) { + if (!mem_avoid[i].size) + continue; + efi_free(mem_avoid[i].size, mem_avoid[i].start); + } +} + efi_status_t check_platform_features(void) { u64 tg; diff --git a/drivers/firmware/efi/libstub/efi-stub-helper.c b/drivers/firmware/efi/libstub/efi-stub-helper.c index aa8da0a49829..0e0033fa7d51 100644 --- a/drivers/firmware/efi/libstub/efi-stub-helper.c +++ b/drivers/firmware/efi/libstub/efi-stub-helper.c @@ -232,6 +232,8 @@ efi_status_t efi_parse_options(char const *cmdline) } else if (!strcmp(param, "video") && val && strstarts(val, "efifb:")) { efi_parse_option_graphics(val + strlen("efifb:")); + } else if (!strcmp(param, "memmap") && val) { + efi_parse_option_memmap(val); } } efi_bs_call(free_pool, buf); diff --git a/drivers/firmware/efi/libstub/efi-stub.c b/drivers/firmware/efi/libstub/efi-stub.c index 0ab439c53eee..6840a57b8f3b 100644 --- a/drivers/firmware/efi/libstub/efi-stub.c +++ b/drivers/firmware/efi/libstub/efi-stub.c @@ -194,6 +194,8 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle,
si = setup_graphics();
+ mem_avoid_memmap(); + status = handle_kernel_image(&image_addr, &image_size, &reserve_addr, &reserve_size, @@ -311,6 +313,7 @@ efi_status_t __efiapi efi_pe_entry(efi_handle_t handle, efi_free(image_size, image_addr); efi_free(reserve_size, reserve_addr); fail_free_screeninfo: + free_avoid_memmap(); free_screen_info(si); fail_free_cmdline: efi_bs_call(free_pool, cmdline_ptr); diff --git a/drivers/firmware/efi/libstub/efistub.h b/drivers/firmware/efi/libstub/efistub.h index 2d7abcd99de9..cf59df863fa7 100644 --- a/drivers/firmware/efi/libstub/efistub.h +++ b/drivers/firmware/efi/libstub/efistub.h @@ -805,6 +805,16 @@ efi_status_t efi_parse_options(char const *cmdline);
void efi_parse_option_graphics(char *option);
+#ifdef CONFIG_ARM64 +void efi_parse_option_memmap(const char *str); +void mem_avoid_memmap(void); +void free_avoid_memmap(void); +#else +static inline void efi_parse_option_memmap(const char *str) { } +static inline void mem_avoid_memmap(void) { } +static inline void free_avoid_memmap(void) { } +#endif + efi_status_t efi_setup_gop(struct screen_info *si, efi_guid_t *proto, unsigned long size);
From: George Kennedy george.kennedy@oracle.com
mainline inclusion from mainline-v5.16-rc7 commit 158b515f703e75e7d68289bf4d98c664e1d632df category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5KIF5
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------
Avoid double free in tun_free_netdev() by moving the dev->tstats and tun->security allocs to a new ndo_init routine (tun_net_init()) that will be called by register_netdevice(). ndo_init is paired with the desctructor (tun_free_netdev()), so if there's an error in register_netdevice() the destructor will handle the frees.
BUG: KASAN: double-free or invalid-free in selinux_tun_dev_free_security+0x1a/0x20 security/selinux/hooks.c:5605
CPU: 0 PID: 25750 Comm: syz-executor416 Not tainted 5.16.0-rc2-syzk #1 Hardware name: Red Hat KVM, BIOS Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x89/0xb5 lib/dump_stack.c:106 print_address_description.constprop.9+0x28/0x160 mm/kasan/report.c:247 kasan_report_invalid_free+0x55/0x80 mm/kasan/report.c:372 ____kasan_slab_free mm/kasan/common.c:346 [inline] __kasan_slab_free+0x107/0x120 mm/kasan/common.c:374 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:1723 [inline] slab_free_freelist_hook mm/slub.c:1749 [inline] slab_free mm/slub.c:3513 [inline] kfree+0xac/0x2d0 mm/slub.c:4561 selinux_tun_dev_free_security+0x1a/0x20 security/selinux/hooks.c:5605 security_tun_dev_free_security+0x4f/0x90 security/security.c:2342 tun_free_netdev+0xe6/0x150 drivers/net/tun.c:2215 netdev_run_todo+0x4df/0x840 net/core/dev.c:10627 rtnl_unlock+0x13/0x20 net/core/rtnetlink.c:112 __tun_chr_ioctl+0x80c/0x2870 drivers/net/tun.c:3302 tun_chr_ioctl+0x2f/0x40 drivers/net/tun.c:3311 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:874 [inline] __se_sys_ioctl fs/ioctl.c:860 [inline] __x64_sys_ioctl+0x19d/0x220 fs/ioctl.c:860 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3a/0x80 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae
Reported-by: syzkaller syzkaller@googlegroups.com Signed-off-by: George Kennedy george.kennedy@oracle.com Suggested-by: Jakub Kicinski kuba@kernel.org Link: https://lore.kernel.org/r/1639679132-19884-1-git-send-email-george.kennedy@o... Signed-off-by: Jakub Kicinski kuba@kernel.org Conflict: driver/net/tun.c Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/tun.c | 115 ++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 56 deletions(-)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index aef966a9dae2..993236198242 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -220,6 +220,9 @@ struct tun_struct { struct tun_prog __rcu *steering_prog; struct tun_prog __rcu *filter_prog; struct ethtool_link_ksettings link_ksettings; + /* init args */ + struct file *file; + struct ifreq *ifr; };
struct veth { @@ -227,6 +230,9 @@ struct veth { __be16 h_vlan_TCI; };
+static void tun_flow_init(struct tun_struct *tun); +static void tun_flow_uninit(struct tun_struct *tun); + static int tun_napi_receive(struct napi_struct *napi, int budget) { struct tun_file *tfile = container_of(napi, struct tun_file, napi); @@ -964,6 +970,49 @@ static int check_filter(struct tap_filter *filter, const struct sk_buff *skb)
static const struct ethtool_ops tun_ethtool_ops;
+static int tun_net_init(struct net_device *dev) +{ + struct tun_struct *tun = netdev_priv(dev); + struct ifreq *ifr = tun->ifr; + int err; + + tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); + if (!tun->pcpu_stats) + return -ENOMEM; + + spin_lock_init(&tun->lock); + + err = security_tun_dev_alloc_security(&tun->security); + if (err < 0) { + free_percpu(tun->pcpu_stats); + return err; + } + + tun_flow_init(tun); + + dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | + TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX; + dev->features = dev->hw_features | NETIF_F_LLTX; + dev->vlan_features = dev->features & + ~(NETIF_F_HW_VLAN_CTAG_TX | + NETIF_F_HW_VLAN_STAG_TX); + + tun->flags = (tun->flags & ~TUN_FEATURES) | + (ifr->ifr_flags & TUN_FEATURES); + + INIT_LIST_HEAD(&tun->disabled); + err = tun_attach(tun, tun->file, false, ifr->ifr_flags & IFF_NAPI, + ifr->ifr_flags & IFF_NAPI_FRAGS, false); + if (err < 0) { + tun_flow_uninit(tun); + security_tun_dev_free_security(tun->security); + free_percpu(tun->pcpu_stats); + return err; + } + return 0; +} + /* Net device detach from fd. */ static void tun_net_uninit(struct net_device *dev) { @@ -1205,6 +1254,7 @@ static int tun_net_change_carrier(struct net_device *dev, bool new_carrier) }
static const struct net_device_ops tun_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1285,6 +1335,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp) }
static const struct net_device_ops tap_netdev_ops = { + .ndo_init = tun_net_init, .ndo_uninit = tun_net_uninit, .ndo_open = tun_net_open, .ndo_stop = tun_net_close, @@ -1325,7 +1376,7 @@ static void tun_flow_uninit(struct tun_struct *tun) #define MAX_MTU 65535
/* Initialize net device. */ -static void tun_net_init(struct net_device *dev) +static void tun_net_initialize(struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev);
@@ -2257,11 +2308,6 @@ static void tun_free_netdev(struct net_device *dev) BUG_ON(!(list_empty(&tun->disabled)));
free_percpu(tun->pcpu_stats); - /* We clear pcpu_stats so that tun_set_iff() can tell if - * tun_free_netdev() has been called from register_netdevice(). - */ - tun->pcpu_stats = NULL; - tun_flow_uninit(tun); security_tun_dev_free_security(tun->security); __tun_set_ebpf(tun, &tun->steering_prog, NULL); @@ -2773,41 +2819,16 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) tun->rx_batched = 0; RCU_INIT_POINTER(tun->steering_prog, NULL);
- tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); - if (!tun->pcpu_stats) { - err = -ENOMEM; - goto err_free_dev; - } - - spin_lock_init(&tun->lock); - - err = security_tun_dev_alloc_security(&tun->security); - if (err < 0) - goto err_free_stat; - - tun_net_init(dev); - tun_flow_init(tun); + tun->ifr = ifr; + tun->file = file;
- dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | - TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features | NETIF_F_LLTX; - dev->vlan_features = dev->features & - ~(NETIF_F_HW_VLAN_CTAG_TX | - NETIF_F_HW_VLAN_STAG_TX); - - tun->flags = (tun->flags & ~TUN_FEATURES) | - (ifr->ifr_flags & TUN_FEATURES); - - INIT_LIST_HEAD(&tun->disabled); - err = tun_attach(tun, file, false, ifr->ifr_flags & IFF_NAPI, - ifr->ifr_flags & IFF_NAPI_FRAGS, false); - if (err < 0) - goto err_free_flow; + tun_net_initialize(dev);
err = register_netdevice(tun->dev); - if (err < 0) - goto err_detach; + if (err < 0) { + free_netdev(dev); + return err; + } /* free_netdev() won't check refcnt, to aovid race * with dev_put() we need publish tun after registration. */ @@ -2824,24 +2845,6 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
strcpy(ifr->ifr_name, tun->dev->name); return 0; - -err_detach: - tun_detach_all(dev); - /* We are here because register_netdevice() has failed. - * If register_netdevice() already called tun_free_netdev() - * while dealing with the error, tun->pcpu_stats has been cleared. - */ - if (!tun->pcpu_stats) - goto err_free_dev; - -err_free_flow: - tun_flow_uninit(tun); - security_tun_dev_free_security(tun->security); -err_free_stat: - free_percpu(tun->pcpu_stats); -err_free_dev: - free_netdev(dev); - return err; }
static void tun_get_iff(struct tun_struct *tun, struct ifreq *ifr)
From: Luo Meng luomeng12@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5KK52 CVE: NA
--------------------------------
If dm_get_device() create dd in multipath_message(), and then call table_deps() after dm_put_table_device(), it will lead to concurrency UAF bugs.
One of the concurrency UAF can be shown as below:
(USE) | (FREE) | target_message | multipath_message | dm_put_device | dm_put_table_device # | kfree(td) # table_device *td ioctl # DM_TABLE_DEPS_CMD | ... table_deps | ... dm_get_live_or_inactive_table | ... retrieve_dep | ... list_for_each_entry | ... deps->dev[count++] = | ... huge_encode_dev | ... (dd->dm_dev->bdev->bd_dev) | list_del(&dd->list) | kfree(dd) # dm_dev_internal
The root cause of UAF bugs is that find_device() failed in dm_get_device() and will create dd and refcount set 1, kfree() in dm_put_table() is not protected. When td, which there are still pointers point to, is released, the concurrency UAF bug will happen.
This patch add a flag to determine whether to create a new dd.
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-table.c | 44 +++++++++++++++++++++-------------- include/linux/device-mapper.h | 2 ++ 3 files changed, 30 insertions(+), 18 deletions(-)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index bced42f082b0..e0bfa16aab37 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1962,7 +1962,7 @@ static int multipath_message(struct dm_target *ti, unsigned argc, char **argv, goto out; }
- r = dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev); + r = __dm_get_device(ti, argv[1], dm_table_get_mode(ti->table), &dev, false); if (r) { DMWARN("message: error getting device %s", argv[1]); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5c590895c14c..78627402b5fb 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -361,12 +361,8 @@ dev_t dm_get_dev_t(const char *path) } EXPORT_SYMBOL_GPL(dm_get_dev_t);
-/* - * Add a device to the list, or just increment the usage count if - * it's already present. - */ -int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, - struct dm_dev **result) +int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result, bool create_dd) { int r; dev_t dev; @@ -390,19 +386,22 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
dd = find_device(&t->devices, dev); if (!dd) { - dd = kmalloc(sizeof(*dd), GFP_KERNEL); - if (!dd) - return -ENOMEM; - - if ((r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev))) { - kfree(dd); - return r; - } + if (create_dd) { + dd = kmalloc(sizeof(*dd), GFP_KERNEL); + if (!dd) + return -ENOMEM;
- refcount_set(&dd->count, 1); - list_add(&dd->list, &t->devices); - goto out; + r = dm_get_table_device(t->md, dev, mode, &dd->dm_dev); + if (r) { + kfree(dd); + return r; + }
+ refcount_set(&dd->count, 1); + list_add(&dd->list, &t->devices); + goto out; + } else + return -ENODEV; } else if (dd->dm_dev->mode != (mode | dd->dm_dev->mode)) { r = upgrade_mode(dd, mode, t->md); if (r) @@ -413,6 +412,17 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, *result = dd->dm_dev; return 0; } +EXPORT_SYMBOL(__dm_get_device); + +/* + * Add a device to the list, or just increment the usage count if + * it's already present. + */ +int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result) +{ + return __dm_get_device(ti, path, mode, result, true); +} EXPORT_SYMBOL(dm_get_device);
static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 50cc070cb1f7..47db4a14c925 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -162,6 +162,8 @@ dev_t dm_get_dev_t(const char *path); int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, struct dm_dev **result); void dm_put_device(struct dm_target *ti, struct dm_dev *d); +int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, + struct dm_dev **result, bool create_dd);
/* * Information about a target type
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion from mainline-v5.14-rc1 commit e0241ae6ac59ffa318255640c047f7c90457fbe5 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58CKN CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Replace netlink_unicast() calls by nfnetlink_unicast() which already deals with translating EAGAIN to ENOBUFS as the nfnetlink core expects.
nfnetlink_unicast() calls nlmsg_unicast() which returns zero in case of success, otherwise the netlink core function netlink_rcv_skb() turns err > 0 into an acknowlegment.
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/ipset/ip_set_core.c | 44 +++++--------------- net/netfilter/nf_conntrack_netlink.c | 62 ++++++++-------------------- net/netfilter/nfnetlink_acct.c | 9 +--- net/netfilter/nfnetlink_cthelper.c | 10 ++--- net/netfilter/nfnetlink_cttimeout.c | 34 ++++----------- 5 files changed, 42 insertions(+), 117 deletions(-)
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 2b19189a930f..1db4fa0580ff 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1706,8 +1706,8 @@ static const struct nla_policy ip_set_adt_policy[IPSET_ATTR_CMD_MAX + 1] = { };
static int -call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, - struct nlattr *tb[], enum ipset_adt adt, +call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, + struct ip_set *set, struct nlattr *tb[], enum ipset_adt adt, u32 flags, bool use_lineno) { int ret; @@ -1759,8 +1759,7 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
*errline = lineno;
- netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); + nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); /* Signal netlink not to send its ACK/errmsg. */ return -EINTR; } @@ -1804,7 +1803,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, attr[IPSET_ATTR_DATA], set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, flags, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); } else { int nla_rem; @@ -1815,7 +1814,7 @@ static int ip_set_ad(struct net *net, struct sock *ctnl, nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; - ret = call_ad(ctnl, skb, set, tb, adt, + ret = call_ad(net, ctnl, skb, set, tb, adt, flags, use_lineno); if (ret < 0) return ret; @@ -1888,7 +1887,6 @@ static int ip_set_header(struct net *net, struct sock *ctnl, const struct ip_set *set; struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0;
if (unlikely(protocol_min_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -1914,11 +1912,7 @@ static int ip_set_header(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -1975,11 +1969,8 @@ static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb, nlmsg_end(skb2, nlh2);
pr_debug("Send TYPE, nlmsg_len: %u\n", nlh2->nlmsg_len); - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret;
- return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2002,7 +1993,6 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, { struct sk_buff *skb2; struct nlmsghdr *nlh2; - int ret = 0;
if (unlikely(!attr[IPSET_ATTR_PROTOCOL])) return -IPSET_ERR_PROTOCOL; @@ -2021,11 +2011,7 @@ static int ip_set_protocol(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2046,7 +2032,6 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0;
if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_SETNAME])) @@ -2070,11 +2055,7 @@ static int ip_set_byname(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); @@ -2098,7 +2079,6 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, struct nlmsghdr *nlh2; ip_set_id_t id = IPSET_INVALID_ID; const struct ip_set *set; - int ret = 0;
if (unlikely(protocol_failed(attr) || !attr[IPSET_ATTR_INDEX])) @@ -2124,11 +2104,7 @@ static int ip_set_byindex(struct net *net, struct sock *ctnl, goto nla_put_failure; nlmsg_end(skb2, nlh2);
- ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret < 0) - return ret; - - return 0; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid);
nla_put_failure: nlmsg_cancel(skb2, nlh2); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index eeeaa34b3e7b..a753762fb078 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1634,9 +1634,8 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
ct = nf_ct_tuplehash_to_ctrack(h);
- err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_put(ct); return -ENOMEM; } @@ -1644,20 +1643,12 @@ static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl, err = ctnetlink_fill_info(skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), ct, true, 0); nf_ct_put(ct); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static int ctnetlink_done_list(struct netlink_callback *cb) @@ -2613,20 +2604,12 @@ static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, NFNL_MSG_TYPE(nlh->nlmsg_type), sock_net(skb->sk)); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { @@ -3368,11 +3351,10 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, } }
- err = -ENOMEM; skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { + if (!skb2) { nf_ct_expect_put(exp); - goto out; + return -ENOMEM; }
rcu_read_lock(); @@ -3380,20 +3362,12 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp); rcu_read_unlock(); nf_ct_expect_put(exp); - if (err <= 0) - goto free; - - err = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (err < 0) - goto out; - - return 0; + if (err <= 0) { + kfree_skb(skb2); + return -ENOMEM; + }
-free: - kfree_skb(skb2); -out: - /* this avoids a loop in nfnetlink. */ - return err == -EAGAIN ? -ENOBUFS : err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static bool expect_iter_name(struct nf_conntrack_expect *exp, void *data) diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..a9091b721181 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -308,13 +308,8 @@ static int nfnl_acct_get(struct net *net, struct sock *nfnl, kfree_skb(skb2); break; } - ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } return ret; } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 91afbf8ac8cf..7e5820e19de3 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -676,14 +676,10 @@ static int nfnl_cthelper_get(struct net *net, struct sock *nfnl, break; }
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } + return ret; }
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 89a381f7f945..49d7499cd4ac 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -279,13 +279,8 @@ static int cttimeout_get_timeout(struct net *net, struct sock *ctnl, kfree_skb(skb2); break; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); + break; } return ret; } @@ -429,9 +424,9 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, const struct nf_conntrack_l4proto *l4proto; unsigned int *timeouts = NULL; struct sk_buff *skb2; - int ret, err; __u16 l3num; __u8 l4num; + int ret;
if (!cda[CTA_TIMEOUT_L3PROTO] || !cda[CTA_TIMEOUT_L4PROTO]) return -EINVAL; @@ -440,9 +435,8 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]); l4proto = nf_ct_l4proto_find(l4num);
- err = -EOPNOTSUPP; if (l4proto->l4proto != l4num) - goto err; + return -EOPNOTSUPP;
switch (l4proto->l4proto) { case IPPROTO_ICMP: @@ -482,13 +476,11 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, }
if (!timeouts) - goto err; + return -EOPNOTSUPP;
skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb2 == NULL) { - err = -ENOMEM; - goto err; - } + if (!skb2) + return -ENOMEM;
ret = cttimeout_default_fill_info(net, skb2, NETLINK_CB(skb).portid, nlh->nlmsg_seq, @@ -497,17 +489,9 @@ static int cttimeout_default_get(struct net *net, struct sock *ctnl, l3num, l4proto, timeouts); if (ret <= 0) { kfree_skb(skb2); - err = -ENOMEM; - goto err; + return -ENOMEM; } - ret = netlink_unicast(ctnl, skb2, NETLINK_CB(skb).portid, MSG_DONTWAIT); - if (ret > 0) - ret = 0; - - /* this avoids a loop in nfnetlink. */ - return ret == -EAGAIN ? -ENOBUFS : ret; -err: - return err; + return nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); }
static struct nf_ct_timeout *ctnl_timeout_find_get(struct net *net,
From: Pablo Neira Ayuso pablo@netfilter.org
mainline inclusion from mainline-v5.15-rc1 commit 241d1af4c11a75d4c17ecc0193a6ab60553efbfc category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58CKN CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Use nfnetlink_unicast() which already translates EAGAIN to ENOBUFS, since EAGAIN is reserved to report missing module dependencies to the nfnetlink core.
e0241ae6ac59 ("netfilter: use nfnetlink_unicast() forgot to update this spot.
Reported-by: Yajun Deng yajun.deng@linux.dev Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/netfilter/nft_compat.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 8e56f353ff35..6538f8968429 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -687,14 +687,12 @@ static int nfnl_compat_get_rcu(struct net *net, struct sock *nfnl, goto out_put; }
- ret = netlink_unicast(nfnl, skb2, NETLINK_CB(skb).portid, - MSG_DONTWAIT); - if (ret > 0) - ret = 0; + ret = nfnetlink_unicast(skb2, net, NETLINK_CB(skb).portid); out_put: rcu_read_lock(); module_put(THIS_MODULE); - return ret == -EAGAIN ? -ENOBUFS : ret; + + return ret; }
static const struct nla_policy nfnl_compat_policy_get[NFTA_COMPAT_MAX+1] = {
From: "Eric W. Biederman" ebiederm@xmission.com
mainline inclusion from mainline-v5.15-rc1 commit d21918e5a94a862ccb297b9f2be38574c865fda0 category: bugfix bugzilla: 187336, https://gitee.com/openeuler/kernel/issues/I5LLC6
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Replace get_nr_threads with atomic_read(¤t->signal->live) as that is a more accurate number that is decremented sooner.
Acked-by: Kees Cook keescook@chromium.org Link: https://lkml.kernel.org/r/87lf6z6qbd.fsf_-_@disp2133 Signed-off-by: "Eric W. Biederman" ebiederm@xmission.com
Conflicts: kernel/seccomp.c
Signed-off-by: GONG, Ruiqi gongruiqi1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/seccomp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b2dd045d6afe..1e0b33dd681b 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1260,7 +1260,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, seccomp_log(this_syscall, SIGSYS, action, true); /* Dump core only if this is the last remaining thread. */ if (action != SECCOMP_RET_KILL_THREAD || - get_nr_threads(current) == 1) { + (atomic_read(¤t->signal->live) == 1)) { kernel_siginfo_t info;
/* Show the original registers in the dump. */
From: Jann Horn jannh@google.com
stable inclusion from stable-v5.10.130 commit 6c32496964da0dc230cea763a0e934b2e02dabd5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5LJFH CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit eeaa345e128515135ccb864c04482180c08e3259 upstream.
The fastpath in slab_alloc_node() assumes that c->slab is stable as long as the TID stays the same. However, two places in __slab_alloc() currently don't update the TID when deactivating the CPU slab.
If multiple operations race the right way, this could lead to an object getting lost; or, in an even more unlikely situation, it could even lead to an object being freed onto the wrong slab's freelist, messing up the `inuse` counter and eventually causing a page to be freed to the page allocator while it still contains slab objects.
(I haven't actually tested these cases though, this is just based on looking at the code. Writing testcases for this stuff seems like it'd be a pain...)
The race leading to state inconsistency is (all operations on the same CPU and kmem_cache):
- task A: begin do_slab_free(): - read TID - read pcpu freelist (==NULL) - check `slab == c->slab` (true) - [PREEMPT A->B] - task B: begin slab_alloc_node(): - fastpath fails (`c->freelist` is NULL) - enter __slab_alloc() - slub_get_cpu_ptr() (disables preemption) - enter ___slab_alloc() - take local_lock_irqsave() - read c->freelist as NULL - get_freelist() returns NULL - write `c->slab = NULL` - drop local_unlock_irqrestore() - goto new_slab - slub_percpu_partial() is NULL - get_partial() returns NULL - slub_put_cpu_ptr() (enables preemption) - [PREEMPT B->A] - task A: finish do_slab_free(): - this_cpu_cmpxchg_double() succeeds() - [CORRUPT STATE: c->slab==NULL, c->freelist!=NULL]
From there, the object on c->freelist will get lost if task B is allowed to continue from here: It will proceed to the retry_load_slab label, set c->slab, then jump to load_freelist, which clobbers c->freelist.
But if we instead continue as follows, we get worse corruption:
- task A: run __slab_free() on object from other struct slab: - CPU_PARTIAL_FREE case (slab was on no list, is now on pcpu partial) - task A: run slab_alloc_node() with NUMA node constraint: - fastpath fails (c->slab is NULL) - call __slab_alloc() - slub_get_cpu_ptr() (disables preemption) - enter ___slab_alloc() - c->slab is NULL: goto new_slab - slub_percpu_partial() is non-NULL - set c->slab to slub_percpu_partial(c) - [CORRUPT STATE: c->slab points to slab-1, c->freelist has objects from slab-2] - goto redo - node_match() fails - goto deactivate_slab - existing c->freelist is passed into deactivate_slab() - inuse count of slab-1 is decremented to account for object from slab-2
At this point, the inuse count of slab-1 is 1 lower than it should be. This means that if we free all allocated objects in slab-1 except for one, SLUB will think that slab-1 is completely unused, and may free its page, leading to use-after-free.
Fixes: c17dda40a6a4e ("slub: Separate out kmem_cache_cpu processing from deactivate_slab") Fixes: 03e404af26dc2 ("slub: fast release on full slab") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn jannh@google.com Acked-by: Christoph Lameter cl@linux.com Acked-by: David Rientjes rientjes@google.com Reviewed-by: Muchun Song songmuchun@bytedance.com Tested-by: Hyeonggon Yoo 42.hyeyoo@gmail.com Signed-off-by: Vlastimil Babka vbabka@suse.cz Link: https://lore.kernel.org/r/20220608182205.2945720-1-jannh@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c index 98452815a066..ad44734dbf72 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2318,6 +2318,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
c->page = NULL; c->freelist = NULL; + c->tid = next_tid(c->tid); }
/* @@ -2451,8 +2452,6 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); deactivate_slab(s, c->page, c->freelist, c); - - c->tid = next_tid(c->tid); }
/* @@ -2738,6 +2737,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
if (!freelist) { c->page = NULL; + c->tid = next_tid(c->tid); stat(s, DEACTIVATE_BYPASS); goto new_slab; }
From: Lee Jones lee.jones@linaro.org
mainline inclusion from mainline-v5.11-rc1 commit a609c58086e381c13bdad1ba97e6510a13d465e7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5LMI9 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Fixes the following W=1 kernel build warning(s):
drivers/tty/serial/8250/8250_port.c:349:14: warning: no previous prototype for ‘au_serial_in’ [-Wmissing-prototypes] drivers/tty/serial/8250/8250_port.c:359:6: warning: no previous prototype for ‘au_serial_out’ [-Wmissing-prototypes]
Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Jiri Slaby jirislaby@kernel.org Cc: Mike Hudson Exoray@isys.ca Cc: linux-serial@vger.kernel.org Signed-off-by: Lee Jones lee.jones@linaro.org Link: https://lore.kernel.org/r/20201112105857.2078977-3-lee.jones@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yi Yang yiyang13@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/tty/serial/8250/8250_early.c | 3 --- include/linux/serial_8250.h | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_early.c b/drivers/tty/serial/8250/8250_early.c index 70d7826788f5..c171ce6db691 100644 --- a/drivers/tty/serial/8250/8250_early.c +++ b/drivers/tty/serial/8250/8250_early.c @@ -204,9 +204,6 @@ OF_EARLYCON_DECLARE(omap8250, "ti,omap4-uart", early_omap8250_setup);
#ifdef CONFIG_SERIAL_8250_RT288X
-unsigned int au_serial_in(struct uart_port *p, int offset); -void au_serial_out(struct uart_port *p, int offset, int value); - static int __init early_au_setup(struct earlycon_device *dev, const char *opt) { dev->port.serial_in = au_serial_in; diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 2b70f736b091..9e655055112d 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -187,4 +187,9 @@ extern void serial8250_set_isa_configurator(void (*v) (int port, struct uart_port *up, u32 *capabilities));
+#ifdef CONFIG_SERIAL_8250_RT288X +unsigned int au_serial_in(struct uart_port *p, int offset); +void au_serial_out(struct uart_port *p, int offset, int value); +#endif + #endif
From: Thadeu Lima de Souza Cascardo cascardo@canonical.com
mainline inclusion from mainline-v6.0-rc1 commit 9ad36309e2719a884f946678e0296be10f0bb4c1 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5LJLR CVE: CVE-2022-2588
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a route filter is replaced and the old filter has a 0 handle, the old one won't be removed from the hashtable, while it will still be freed.
The test was there since before commit 1109c00547fc ("net: sched: RCU cls_route"), when a new filter was not allocated when there was an old one. The old filter was reused and the reinserting would only be necessary if an old filter was replaced. That was still wrong for the same case where the old handle was 0.
Remove the old filter from the list independently from its handle value.
This fixes CVE-2022-2588, also reported as ZDI-CAN-17440.
Reported-by: Zhenpeng Lin zplin@u.northwestern.edu Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Reviewed-by: Kamal Mostafa kamal@canonical.com Cc: stable@vger.kernel.org Acked-by: Jamal Hadi Salim jhs@mojatatu.com Link: https://lore.kernel.org/r/20220809170518.164662-1-cascardo@canonical.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sched/cls_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 5efa3e7ace15..315ca2b7e2ed 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -526,7 +526,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, rcu_assign_pointer(f->next, f1); rcu_assign_pointer(*fp, f);
- if (fold && fold->handle && f->handle != fold->handle) { + if (fold) { th = to_hash(fold->handle); h = from_hash(fold->handle >> 16); b = rtnl_dereference(head->table[th]);
From: Thadeu Lima de Souza Cascardo cascardo@canonical.com
mainline inclusion from mainline-v6.0-rc1 commit e362359ace6f87c201531872486ff295df306d13 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5LJH2 CVE: CVE-2022-2585
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") started looking up tasks by PID when deleting a CPU timer.
When a non-leader thread calls execve, it will switch PIDs with the leader process. Then, as it calls exit_itimers, posix_cpu_timer_del cannot find the task because the timer still points out to the old PID.
That means that armed timers won't be disarmed, that is, they won't be removed from the timerqueue_list. exit_itimers will still release their memory, and when that list is later processed, it leads to a use-after-free.
Clean up the timers from the de-threaded task before freeing them. This prevents a reported use-after-free.
Fixes: 55e8c8eb2c7b ("posix-cpu-timers: Store a reference to a pid not a task") Signed-off-by: Thadeu Lima de Souza Cascardo cascardo@canonical.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Reviewed-by: Thomas Gleixner tglx@linutronix.de Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20220809170751.164716-1-cascardo@canonical.com
Conflicts: fs/exec.c
Signed-off-by: Yu Liao liaoyu15@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/exec.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/exec.c b/fs/exec.c index 4c2d18061633..97f991f79e69 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1274,6 +1274,9 @@ int begin_new_exec(struct linux_binprm * bprm) bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS + spin_lock_irq(&me->sighand->siglock); + posix_cpu_timers_exit(me); + spin_unlock_irq(&me->sighand->siglock); exit_itimers(me->signal); flush_itimer_signals(); #endif
From: GUO Zihua guozihua@huawei.com
stable inclusion from stable-v5.10.136 commit 3c77292d52b341831cb09c24ca4112a1e4f9e91f category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5MF5J CVE: NA
Reference: https://lore.kernel.org/linux-arm-kernel/20220712075031.29061-1-guozihua@hua...
--------------------------------
commit 7ae19d422c7da84b5f13bc08b98bd737a08d3a53 upstream.
A kasan error was reported during fuzzing:
BUG: KASAN: slab-out-of-bounds in neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon] Read of size 4 at addr ffff0010e293f010 by task syz-executor.5/1646715 CPU: 4 PID: 1646715 Comm: syz-executor.5 Kdump: loaded Not tainted 5.10.0.aarch64 #1 Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.59 01/31/2019 Call trace: dump_backtrace+0x0/0x394 show_stack+0x34/0x4c arch/arm64/kernel/stacktrace.c:196 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x158/0x1e4 lib/dump_stack.c:118 print_address_description.constprop.0+0x68/0x204 mm/kasan/report.c:387 __kasan_report+0xe0/0x140 mm/kasan/report.c:547 kasan_report+0x44/0xe0 mm/kasan/report.c:564 check_memory_region_inline mm/kasan/generic.c:187 [inline] __asan_load4+0x94/0xd0 mm/kasan/generic.c:252 neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon] neon_poly1305_do_update+0x6c/0x15c [poly1305_neon] neon_poly1305_update+0x9c/0x1c4 [poly1305_neon] crypto_shash_update crypto/shash.c:131 [inline] shash_finup_unaligned+0x84/0x15c crypto/shash.c:179 crypto_shash_finup+0x8c/0x140 crypto/shash.c:193 shash_digest_unaligned+0xb8/0xe4 crypto/shash.c:201 crypto_shash_digest+0xa4/0xfc crypto/shash.c:217 crypto_shash_tfm_digest+0xb4/0x150 crypto/shash.c:229 essiv_skcipher_setkey+0x164/0x200 [essiv] crypto_skcipher_setkey+0xb0/0x160 crypto/skcipher.c:612 skcipher_setkey+0x3c/0x50 crypto/algif_skcipher.c:305 alg_setkey+0x114/0x2a0 crypto/af_alg.c:220 alg_setsockopt+0x19c/0x210 crypto/af_alg.c:253 __sys_setsockopt+0x190/0x2e0 net/socket.c:2123 __do_sys_setsockopt net/socket.c:2134 [inline] __se_sys_setsockopt net/socket.c:2131 [inline] __arm64_sys_setsockopt+0x78/0x94 net/socket.c:2131 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall+0x64/0x100 arch/arm64/kernel/syscall.c:48 el0_svc_common.constprop.0+0x220/0x230 arch/arm64/kernel/syscall.c:155 do_el0_svc+0xb4/0xd4 arch/arm64/kernel/syscall.c:217 el0_svc+0x24/0x3c arch/arm64/kernel/entry-common.c:353 el0_sync_handler+0x160/0x164 arch/arm64/kernel/entry-common.c:369 el0_sync+0x160/0x180 arch/arm64/kernel/entry.S:683
This error can be reproduced by the following code compiled as ko on a system with kasan enabled:
char test_data[] = "\x00\x01\x02\x03\x04\x05\x06\x07" "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17" "\x18\x19\x1a\x1b\x1c\x1d\x1e";
int init(void) { struct crypto_shash *tfm = NULL; char *data = NULL, *out = NULL;
tfm = crypto_alloc_shash("poly1305", 0, 0); data = kmalloc(POLY1305_KEY_SIZE - 1, GFP_KERNEL); out = kmalloc(POLY1305_DIGEST_SIZE, GFP_KERNEL); memcpy(data, test_data, POLY1305_KEY_SIZE - 1); crypto_shash_tfm_digest(tfm, data, POLY1305_KEY_SIZE - 1, out);
kfree(data); kfree(out); return 0; }
void deinit(void) { }
module_init(init) module_exit(deinit) MODULE_LICENSE("GPL");
The root cause of the bug sits in neon_poly1305_blocks. The logic neon_poly1305_blocks() performed is that if it was called with both s[] and r[] uninitialized, it will first try to initialize them with the data from the first "block" that it believed to be 32 bytes in length. First 16 bytes are used as the key and the next 16 bytes for s[]. This would lead to the aforementioned read out-of-bound. However, after calling poly1305_init_arch(), only 16 bytes were deducted from the input and s[] is initialized yet again with the following 16 bytes. The second initialization of s[] is certainly redundent which indicates that the first initialization should be for r[] only.
This patch fixes the issue by calling poly1305_init_arm64() instead of poly1305_init_arch(). This is also the implementation for the same algorithm on arm platform.
Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Eric Biggers ebiggers@google.com Acked-by: Will Deacon will@kernel.org Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: GUO Zihua guozihua@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/crypto/poly1305-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 01e22fe40823..9f4599014854 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, { if (unlikely(!dctx->sset)) { if (!dctx->rset) { - poly1305_init_arch(dctx, src); + poly1305_init_arm64(&dctx->h, src); src += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; dctx->rset = 1;