From: Weilong Chen <chenweilong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
Support disabling the OOM killer, and report OOM events to the black box (bbox).

vm.enable_oom_killer:
  0: disable oom killer
  1: enable oom killer (default, compatible with mainline)
  2: disable oom killer and panic on oom
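For illustration only (not part of this patch), a black-box style consumer could hook the new notifier chain roughly as sketched below. Only register_hisi_oom_notifier() and the HISI_OOM_TYPE_* values come from this series; the bbox_* names are hypothetical.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/oom.h>
#include <linux/printk.h>

/* Hypothetical consumer: forward OOM events to a black-box recorder. */
static int bbox_oom_event(struct notifier_block *nb, unsigned long type,
			  void *data)
{
	/* 'type' is one of the HISI_OOM_TYPE_* values defined in oom.h. */
	pr_err("bbox: oom event reported, type %lu\n", type);
	return NOTIFY_OK;
}

static struct notifier_block bbox_oom_nb = {
	.notifier_call = bbox_oom_event,
};

static int __init bbox_oom_init(void)
{
	return register_hisi_oom_notifier(&bbox_oom_nb);
}
module_init(bbox_oom_init);

The policy itself is selected at run time through the new sysctl, e.g. "sysctl -w vm.enable_oom_killer=0" to disable the OOM killer.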
Signed-off-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig  | 11 +++++++++
 include/linux/oom.h | 11 +++++++++
 kernel/sysctl.c     | 12 ++++++++++
 mm/memcontrol.c     |  6 +++++
 mm/oom_kill.c       | 56 +++++++++++++++++++++++++++++++++++++++++++++
 mm/util.c           |  6 +++++
 6 files changed, 102 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 10fabb5f633d..4412f14547af 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1351,6 +1351,17 @@ config ASCEND_DVPP_MMAP special memory for DvPP processor, the new flag is only valid for Ascend platform.
+config ASCEND_OOM
+	bool "Enable support for disable oom killer"
+	default y
+	help
+	  In some cases we hope that the OOM killer will not kill the process
+	  when OOM occurs, that the black box can be notified to report the
+	  event, and that a panic can be triggered to locate the problem.
+	  vm.enable_oom_killer:
+	  0: disable oom killer
+	  1: enable oom killer (default, compatible with mainline)
+	  2: disable oom killer and panic on oom
 endif
endmenu diff --git a/include/linux/oom.h b/include/linux/oom.h index 69864a547663..689d32ab694b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -117,4 +117,15 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p); extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; extern int sysctl_panic_on_oom; + +#ifdef CONFIG_ASCEND_OOM +#define HISI_OOM_TYPE_NOMEM 0 +#define HISI_OOM_TYPE_OVERCOMMIT 1 +#define HISI_OOM_TYPE_CGROUP 2 + +extern int sysctl_enable_oom_killer; +extern int register_hisi_oom_notifier(struct notifier_block *nb); +extern int hisi_oom_notifier_call(unsigned long val, void *v); +extern int unregister_hisi_oom_notifier(struct notifier_block *nb); +#endif #endif /* _INCLUDE_LINUX_OOM_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 54ae74d3180b..665c9e2a8802 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1264,6 +1264,18 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, .extra2 = &two, }, +#ifdef CONFIG_ASCEND_OOM + { + /* 0: diasable, 1: enable, 2: disable and panic on oom */ + .procname = "enable_oom_killer", + .data = &sysctl_enable_oom_killer, + .maxlen = sizeof(sysctl_enable_oom_killer), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, +#endif { .procname = "oom_kill_allocating_task", .data = &sysctl_oom_kill_allocating_task, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e0377bae0bf6..a63bfd73da9a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1729,6 +1729,9 @@ static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int current->memcg_in_oom = memcg; current->memcg_oom_gfp_mask = mask; current->memcg_oom_order = order; +#ifdef CONFIG_ASCEND_OOM + hisi_oom_notifier_call(HISI_OOM_TYPE_CGROUP, NULL); +#endif
return OOM_ASYNC; } @@ -1802,6 +1805,9 @@ bool mem_cgroup_oom_synchronize(bool handle) mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask, current->memcg_oom_order); } else { +#ifdef CONFIG_ASCEND_OOM + hisi_oom_notifier_call(HISI_OOM_TYPE_CGROUP, NULL); +#endif schedule(); mem_cgroup_unmark_under_oom(memcg); finish_wait(&memcg_oom_waitq, &owait.wait); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8a4570c53e83..c08041ecd286 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -52,6 +52,9 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; +#ifdef CONFIG_ASCEND_OOM +int sysctl_enable_oom_killer = 1; +#endif
/* * Serializes oom killer invocations (out_of_memory()) from all contexts to @@ -1047,6 +1050,42 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier);
+#ifdef CONFIG_ASCEND_OOM +static BLOCKING_NOTIFIER_HEAD(hisi_oom_notify_list); + +int register_hisi_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&hisi_oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(register_hisi_oom_notifier); + +static unsigned long last_jiffies; +int hisi_oom_notifier_call(unsigned long val, void *v) +{ + /* when enable oom killer, just return */ + if (sysctl_enable_oom_killer == 1) + return 0; + + /* Print time interval to 10 seconds */ + if (time_after(jiffies, last_jiffies + 10 * HZ)) { + pr_err("OOM_NOTIFIER: oom type %lu\n", val); + dump_stack(); + show_mem(SHOW_MEM_FILTER_NODES, NULL); + dump_tasks(NULL, 0); + last_jiffies = jiffies; + } + + return blocking_notifier_call_chain(&hisi_oom_notify_list, val, v); +} +EXPORT_SYMBOL_GPL(hisi_oom_notifier_call); + +int unregister_hisi_oom_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&hisi_oom_notify_list, nb); +} +EXPORT_SYMBOL_GPL(unregister_hisi_oom_notifier); +#endif + /** * out_of_memory - kill the "best" process when we run out of memory * @oc: pointer to struct oom_control @@ -1060,10 +1099,27 @@ bool out_of_memory(struct oom_control *oc) { unsigned long freed = 0; enum oom_constraint constraint = CONSTRAINT_NONE; +#ifdef CONFIG_ASCEND_OOM + unsigned long oom_type; +#endif
if (oom_killer_disabled) return false;
+#ifdef CONFIG_ASCEND_OOM + if (sysctl_enable_oom_killer == 0 || sysctl_enable_oom_killer == 2) { + if (is_memcg_oom(oc)) + oom_type = HISI_OOM_TYPE_CGROUP; + else + oom_type = HISI_OOM_TYPE_NOMEM; + + hisi_oom_notifier_call(oom_type, NULL); + if (unlikely(sysctl_enable_oom_killer == 2)) + panic("Out of memory, panic by sysctl_enable_oom_killer"); + return false; + } +#endif + if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) diff --git a/mm/util.c b/mm/util.c index 5515219168e8..ed64ef1f8387 100644 --- a/mm/util.c +++ b/mm/util.c @@ -17,6 +17,9 @@
#include <asm/sections.h> #include <linux/uaccess.h> +#ifdef CONFIG_ASCEND_OOM +#include <linux/oom.h> +#endif
#include "internal.h"
@@ -744,6 +747,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) if (percpu_counter_read_positive(&vm_committed_as) < allowed) return 0; error: +#ifdef CONFIG_ASCEND_OOM + hisi_oom_notifier_call(HISI_OOM_TYPE_OVERCOMMIT, NULL); +#endif vm_unacct_memory(pages);
return -ENOMEM;
From: Weilong Chen <chenweilong@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------
Enable the Ascend OOM control feature for the hulk default config.
Signed-off-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/configs/hulk_defconfig | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 38432cf6e886..f5e57a07d1c4 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -475,6 +475,7 @@ CONFIG_RANDOMIZE_BASE=y CONFIG_RANDOMIZE_MODULE_REGION_FULL=y CONFIG_ASCEND_FEATURES=y CONFIG_ASCEND_DVPP_MMAP=y +CONFIG_ASCEND_OOM=y
# # Boot options
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

ascend inclusion
category: feature
bugzilla: 14369
CVE: NA
--------------
IOMMU drivers need a way to bind Linux processes to devices. This is used for Shared Virtual Memory (SVM), where devices support paging. In that mode, DMA can directly target virtual addresses of a process.
Introduce boilerplate code for allocating process structures and binding them to devices. Four operations are added to IOMMU drivers.
When a process exits, we need to ensure that devices attached to it stop issuing transactions with its PASID. Let device drivers register a callback to be notified on process exit.
At the moment the callback is set on the domain like the fault handler, because we don't have a structure available for IOMMU masters. This can become problematic if different devices in a domain are managed by distinct device drivers (for example multiple devices in the same group). The problem is the same for the fault handler, so we'll probably fix them all at once.
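For illustration, a device driver could use this hook roughly as follows; iommu_set_process_exit_handler() and IOMMU_PROCESS_EXIT_ALL come from this patch, while the mydev_* names are made up:

/* Hypothetical callback: stop issuing DMA tagged with the given PASID. */
static int mydev_process_exit(struct iommu_domain *domain, struct device *dev,
			      int pasid, void *token)
{
	struct mydev *md = token;			/* made-up driver state */

	if (pasid == IOMMU_PROCESS_EXIT_ALL)
		mydev_stop_all_pasid_dma(md);		/* made-up helper */
	else
		mydev_stop_pasid_dma(md, pasid);	/* made-up helper */

	return 0;
}

/* At probe time, once the device has been attached to its domain: */
iommu_set_process_exit_handler(dev, mydev_process_exit, md);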
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/iommu/Kconfig         | 11 ++++++
 drivers/iommu/Makefile        |  1 +
 drivers/iommu/iommu-process.c | 68 +++++++++++++++++++++++++++++++++++
 include/linux/iommu.h         | 32 +++++++++++++++++
 4 files changed, 112 insertions(+)
 create mode 100644 drivers/iommu/iommu-process.c
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 4f4132d0fca4..80aeff486e0c 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -112,6 +112,17 @@ config IOMMU_DMA select IOMMU_IOVA select NEED_SG_DMA_LENGTH
+config IOMMU_PROCESS + bool "Process management API for the IOMMU" + depends on MMU_NOTIFIER + select IOMMU_API + help + Enable process management for the IOMMU API. In systems that support + it, device drivers can bind processes to devices and share their page + tables using this API. + + If unsure, say N here. + config IOMMU_SVA bool select IOMMU_API diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index a6f94cc89f92..1533a9ff4777 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_IOMMU_API) += iommu.o obj-$(CONFIG_IOMMU_API) += iommu-traces.o obj-$(CONFIG_IOMMU_API) += iommu-sysfs.o obj-$(CONFIG_IOMMU_DEBUGFS) += iommu-debugfs.o +obj-$(CONFIG_IOMMU_PROCESS) += iommu-process.o obj-$(CONFIG_IOMMU_DMA) += dma-iommu.o obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o obj-$(CONFIG_IOMMU_PAGE_FAULT) += io-pgfault.o diff --git a/drivers/iommu/iommu-process.c b/drivers/iommu/iommu-process.c new file mode 100644 index 000000000000..66ee91c14094 --- /dev/null +++ b/drivers/iommu/iommu-process.c @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Track processes bound to devices + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. + * + * Copyright (C) 2017 ARM Ltd. + * + * Author: Jean-Philippe Brucker jean-philippe.brucker@arm.com + */ + +#include <linux/idr.h> +#include <linux/iommu.h> +#include <linux/slab.h> +#include <linux/spinlock.h> + +/* Link between a domain and a process */ +struct iommu_context { + struct iommu_process *process; + struct iommu_domain *domain; + + struct list_head process_head; + struct list_head domain_head; + + /* Number of devices that use this context */ + refcount_t ref; +}; + +/** + * iommu_set_process_exit_handler() - set a callback for stopping the use of + * PASID in a device. + * @dev: the device + * @handler: exit handler + * @token: user data, will be passed back to the exit handler + * + * Users of the bind/unbind API should call this function to set a + * device-specific callback telling them when a process is exiting. + * + * After the callback returns, the device must not issue any more transaction + * with the PASIDs given as argument to the handler. It can be a single PASID + * value or the special IOMMU_PROCESS_EXIT_ALL. + * + * The handler itself should return 0 on success, and an appropriate error code + * otherwise. 
+ */ +void iommu_set_process_exit_handler(struct device *dev, + iommu_process_exit_handler_t handler, + void *token) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + + if (WARN_ON(!domain)) + return; + + domain->process_exit = handler; + domain->process_exit_token = token; +} +EXPORT_SYMBOL_GPL(iommu_set_process_exit_handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index e6ffd426e267..8bec6a66e065 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -63,6 +63,11 @@ typedef int (*iommu_fault_handler_t)(struct iommu_domain *, typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault_event *, void *); typedef int (*iommu_mm_exit_handler_t)(struct device *dev, int pasid, void *);
+/* All process are being detached from this device */ +#define IOMMU_PROCESS_EXIT_ALL (-1) +typedef int (*iommu_process_exit_handler_t)(struct iommu_domain *, struct device *dev, + int pasid, void *); + #define IOMMU_SVA_FEAT_IOPF (1 << 0)
struct iommu_domain_geometry { @@ -101,12 +106,27 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ iommu_fault_handler_t handler; void *handler_token; + iommu_process_exit_handler_t process_exit; + void *process_exit_token; struct iommu_domain_geometry geometry; void *iova_cookie;
+ unsigned int min_pasid, max_pasid; + struct list_head processes; + struct list_head mm_list; };
+struct iommu_process { + struct pid *pid; + int pasid; + struct list_head domains; + struct kref kref; + + /* Release callback for this process */ + void (*release)(struct iommu_process *process); +}; + struct io_mm { int pasid; /* IOMMU_SVA_FEAT_* */ @@ -1125,4 +1145,16 @@ void iommu_debugfs_setup(void); static inline void iommu_debugfs_setup(void) {} #endif
+#ifdef CONFIG_IOMMU_PROCESS +extern void iommu_set_process_exit_handler(struct device *dev, + iommu_process_exit_handler_t cb, + void *token); +#else /* CONFIG_IOMMU_PROCESS */ +static inline void iommu_set_process_exit_handler(struct device *dev, + iommu_process_exit_handler_t cb, + void *token) +{ +} +#endif /* CONFIG_IOMMU_PROCESS */ + #endif /* __LINUX_IOMMU_H */
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

ascend inclusion
category: feature
bugzilla: 14369
CVE: NA
--------------
commit https://patchwork.ozlabs.org/patch/822422/
The fault handler will need to find a process given its PASID. This is the reason we have an IDR for storing processes, so hook it up.
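As a sketch of the intended use (the surrounding fault-handling context is hypothetical; only iommu_process_find() and iommu_process_put() come from this patch):

/* In a fault handler that received a PASID along with the fault: */
struct iommu_process *process;

process = iommu_process_find(pasid);	/* takes a reference, or returns NULL */
if (!process)
	return -ESRCH;			/* unknown or already-defunct PASID */

/* ... resolve the fault on behalf of this process ... */

iommu_process_put(process);		/* drop the reference taken by _find() */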
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/iommu/iommu-process.c | 87 +++++++++++++++++++++++++++++++++++
 include/linux/iommu.h         | 12 +++++
 2 files changed, 99 insertions(+)
diff --git a/drivers/iommu/iommu-process.c b/drivers/iommu/iommu-process.c index 66ee91c14094..626503b83354 100644 --- a/drivers/iommu/iommu-process.c +++ b/drivers/iommu/iommu-process.c @@ -36,6 +36,93 @@ struct iommu_context { refcount_t ref; };
+/* + * Because we're using an IDR, PASIDs are limited to 31 bits (the sign bit is + * used for returning errors). In practice implementations will use at most 20 + * bits, which is the PCI limit. + */ +static DEFINE_IDR(iommu_process_idr); + +/* + * For the moment this is an all-purpose lock. It serializes + * access/modifications to contexts (process-domain links), access/modifications + * to the PASID IDR, and changes to process refcount as well. + */ +static DEFINE_SPINLOCK(iommu_process_lock); + +static void iommu_process_release(struct kref *kref) +{ + struct iommu_process *process; + void (*release)(struct iommu_process *); + + assert_spin_locked(&iommu_process_lock); + + process = container_of(kref, struct iommu_process, kref); + release = process->release; + + WARN_ON(!list_empty(&process->domains)); + + idr_remove(&iommu_process_idr, process->pasid); + put_pid(process->pid); + release(process); +} + +/* + * Returns non-zero if a reference to the process was successfully taken. + * Returns zero if the process is being freed and should not be used. + */ +static int iommu_process_get_locked(struct iommu_process *process) +{ + assert_spin_locked(&iommu_process_lock); + + if (process) + return kref_get_unless_zero(&process->kref); + + return 0; +} + +static void iommu_process_put_locked(struct iommu_process *process) +{ + assert_spin_locked(&iommu_process_lock); + + kref_put(&process->kref, iommu_process_release); +} + +/** + * iommu_process_put - Put reference to process, freeing it if necessary. + */ +void iommu_process_put(struct iommu_process *process) +{ + spin_lock(&iommu_process_lock); + iommu_process_put_locked(process); + spin_unlock(&iommu_process_lock); +} +EXPORT_SYMBOL_GPL(iommu_process_put); + +/** + * iommu_process_find - Find process associated to the given PASID + * + * Returns the IOMMU process corresponding to this PASID, or NULL if not found. + * A reference to the iommu_process is kept, and must be released with + * iommu_process_put. + */ +struct iommu_process *iommu_process_find(int pasid) +{ + struct iommu_process *process; + + spin_lock(&iommu_process_lock); + process = idr_find(&iommu_process_idr, pasid); + if (process) { + if (!iommu_process_get_locked(process)) + /* kref is 0, process is defunct */ + process = NULL; + } + spin_unlock(&iommu_process_lock); + + return process; +} +EXPORT_SYMBOL_GPL(iommu_process_find); + /** * iommu_set_process_exit_handler() - set a callback for stopping the use of * PASID in a device. diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8bec6a66e065..3ca9d171620e 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1149,12 +1149,24 @@ static inline void iommu_debugfs_setup(void) {} extern void iommu_set_process_exit_handler(struct device *dev, iommu_process_exit_handler_t cb, void *token); +extern struct iommu_process *iommu_process_find(int pasid); +extern void iommu_process_put(struct iommu_process *process); + #else /* CONFIG_IOMMU_PROCESS */ static inline void iommu_set_process_exit_handler(struct device *dev, iommu_process_exit_handler_t cb, void *token) { } + +static inline struct iommu_process *iommu_process_find(int pasid) +{ + return NULL; +} + +static inline void iommu_process_put(struct iommu_process *process) +{ +} #endif /* CONFIG_IOMMU_PROCESS */
#endif /* __LINUX_IOMMU_H */
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

ascend inclusion
category: feature
bugzilla: 14369
CVE: NA
--------------
commit https://patchwork.ozlabs.org/patch/822423/
When creating an iommu_process structure, register a notifier to be informed of changes to the virtual address space and to know when the process exits.
Two new operations are added to the IOMMU driver:
* process_invalidate when a range of addresses is unmapped, to let the IOMMU driver send TLB invalidations.
* process_exit when the mm is released. It's a bit more involved in this case, as the IOMMU driver has to tell all device drivers to stop using this PASID, then clear the PASID table and invalidate TLBs.
Adding the notifier in the mix complicates process release. In one case device drivers free the process explicitly by calling unbind (or detaching the device). In the other case the process could crash before unbind, in which case the release notifier has to do all the work.
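A rough sketch of what an IOMMU driver might plug into these two operations (the smmu_* helpers are invented; only the operation signatures come from this patch):

static void mydrv_process_invalidate(struct iommu_domain *domain,
				     struct iommu_process *process,
				     unsigned long iova, size_t size)
{
	/* Send a range TLB invalidation for this process's PASID. */
	smmu_tlb_inv_range(domain, process->pasid, iova, size);	/* made up */
}

static void mydrv_process_exit(struct iommu_domain *domain,
			       struct iommu_process *process)
{
	/*
	 * Tell device drivers to stop using this PASID (for instance through
	 * the handler installed with iommu_set_process_exit_handler()), then
	 * clear the PASID table entry and invalidate the TLBs.
	 */
	smmu_clear_pasid(domain, process->pasid);		/* made up */
	smmu_tlb_inv_asid(domain, process->pasid);		/* made up */
}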
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/iommu.h | 13 +++++++++++++
 1 file changed, 13 insertions(+)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3ca9d171620e..dc78957b5544 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -122,9 +122,15 @@ struct iommu_process { int pasid; struct list_head domains; struct kref kref; +#ifdef CONFIG_MMU_NOTIFIER + struct mmu_notifier notifier; +#endif + struct mm_struct *mm;
/* Release callback for this process */ void (*release)(struct iommu_process *process); + /* For postponed release */ + struct rcu_head rcu; };
struct io_mm { @@ -278,6 +284,9 @@ struct iommu_sva_param { * @domain_free: free iommu domain * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain + * @process_invalidate: Invalidate a range of mappings for a process. + * @process_exit: A process is exiting. Stop using the PASID, remove PASID entry + * and flush associated TLB entries. * @sva_device_init: initialize Shared Virtual Adressing for a device * @sva_device_shutdown: shutdown Shared Virtual Adressing for a device * @mm_alloc: allocate io_mm @@ -321,6 +330,10 @@ struct iommu_ops {
int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + void (*process_invalidate)(struct iommu_domain *domain, + struct iommu_process *process, + unsigned long iova, size_t size); + void (*process_exit)(struct iommu_domain *domain, struct iommu_process *process); int (*sva_device_init)(struct device *dev, struct iommu_sva_param *param); void (*sva_device_shutdown)(struct device *dev,
From: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>

ascend inclusion
category: feature
bugzilla: 14369
CVE: NA
--------------
commit https://patchwork.ozlabs.org/patch/822424/
Add bind and unbind operations to the IOMMU API. Device drivers can use them to share process page tables with their device. iommu_process_bind_group is provided for VFIO's convenience, as it needs to provide a coherent interface on containers. Device drivers will most likely want to use iommu_process_bind_device, which doesn't bind the whole group.
PASIDs are de facto shared between all devices in a group (because of hardware weaknesses), but we don't do anything about it at the API level. Making bind_device call bind_group is probably the wrong way around, because it requires more work on our side for no benefit. We'd have to replay all binds each time a device is hotplugged into a group. But when a device is hotplugged into a group, the device driver will have to do a bind before using its PASID anyway and we can reject inconsistencies at that point.
Concurrent calls to iommu_process_bind_device for the same process are not supported at the moment (they'll race on process_alloc which will only succeed for the first one; the others will have to retry the bind). I also don't support calling bind() on a dying process, not sure if it matters.
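For reference, a device driver would typically use the new API along these lines (hedged sketch; only iommu_process_bind_device() and iommu_process_unbind_device() come from this patch):

int pasid, ret;

/* Bind the current task's address space to the device. */
ret = iommu_process_bind_device(dev, current, &pasid, 0);
if (ret)
	return ret;

/* Program 'pasid' into the device: its DMA can now target current->mm. */

/* ... and on teardown: */
iommu_process_unbind_device(dev, pasid);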
Signed-off-by: Jean-Philippe Brucker <jean-philippe.brucker@arm.com>
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/iommu/iommu-process.c | 445 +++++++++++++++++++++++++++++++++-
 drivers/iommu/iommu.c         |  64 +++++
 include/linux/iommu.h         |  53 ++++
 3 files changed, 558 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/iommu-process.c b/drivers/iommu/iommu-process.c index 626503b83354..483821a9e93a 100644 --- a/drivers/iommu/iommu-process.c +++ b/drivers/iommu/iommu-process.c @@ -21,7 +21,9 @@
#include <linux/idr.h> #include <linux/iommu.h> +#include <linux/mmu_notifier.h> #include <linux/slab.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h>
/* Link between a domain and a process */ @@ -50,21 +52,144 @@ static DEFINE_IDR(iommu_process_idr); */ static DEFINE_SPINLOCK(iommu_process_lock);
-static void iommu_process_release(struct kref *kref) +static struct mmu_notifier_ops iommu_process_mmu_notfier; + +/* + * Allocate a iommu_process structure for the given task. + * + * Ideally we shouldn't need the domain parameter, since iommu_process is + * system-wide, but we use it to retrieve the driver's allocation ops and a + * PASID range. + */ +static struct iommu_process * +iommu_process_alloc(struct iommu_domain *domain, struct task_struct *task) +{ + int err; + int pasid; + struct iommu_process *process; + + if (WARN_ON(!domain->ops->process_alloc || !domain->ops->process_free)) + return ERR_PTR(-ENODEV); + + process = domain->ops->process_alloc(task); + if (IS_ERR(process)) + return process; + if (!process) + return ERR_PTR(-ENOMEM); + + process->pid = get_task_pid(task, PIDTYPE_PID); + process->mm = get_task_mm(task); + process->notifier.ops = &iommu_process_mmu_notfier; + process->release = domain->ops->process_free; + INIT_LIST_HEAD(&process->domains); + + if (!process->pid) { + err = -EINVAL; + goto err_free_process; + } + + if (!process->mm) { + err = -EINVAL; + goto err_put_pid; + } + + idr_preload(GFP_KERNEL); + spin_lock(&iommu_process_lock); + pasid = idr_alloc_cyclic(&iommu_process_idr, process, domain->min_pasid, + domain->max_pasid + 1, GFP_ATOMIC); + process->pasid = pasid; + spin_unlock(&iommu_process_lock); + idr_preload_end(); + + if (pasid < 0) { + err = pasid; + goto err_put_mm; + } + + err = mmu_notifier_register(&process->notifier, process->mm); + if (err) + goto err_free_pasid; + + /* + * Now that the MMU notifier is valid, we can allow users to grab this + * process by setting a valid refcount. Before that it was accessible in + * the IDR but invalid. + * + * Users of the process structure obtain it with inc_not_zero, which + * provides a control dependency to ensure that they don't modify the + * structure if they didn't acquire the ref. So I think we need a write + * barrier here to pair with that control dependency (XXX probably + * nonsense.) + */ + smp_wmb(); + kref_init(&process->kref); + + /* A mm_count reference is kept by the notifier */ + mmput(process->mm); + + return process; + +err_free_pasid: + /* + * Even if the process is accessible from the IDR at this point, kref is + * 0 so no user could get a reference to it. Free it manually. + */ + spin_lock(&iommu_process_lock); + idr_remove(&iommu_process_idr, process->pasid); + spin_unlock(&iommu_process_lock); + +err_put_mm: + mmput(process->mm); + +err_put_pid: + put_pid(process->pid); + +err_free_process: + domain->ops->process_free(process); + + return ERR_PTR(err); +} + +static void iommu_process_free(struct rcu_head *rcu) { struct iommu_process *process; void (*release)(struct iommu_process *);
+ process = container_of(rcu, struct iommu_process, rcu); + release = process->release; + + release(process); +} + +static void iommu_process_release(struct kref *kref) +{ + struct iommu_process *process; + assert_spin_locked(&iommu_process_lock);
process = container_of(kref, struct iommu_process, kref); - release = process->release; - WARN_ON(!list_empty(&process->domains));
idr_remove(&iommu_process_idr, process->pasid); put_pid(process->pid); - release(process); + + /* + * If we're being released from process exit, the notifier callback + * ->release has already been called. Otherwise we don't need to go + * through there, the process isn't attached to anything anymore. Hence + * no_release. + */ + mmu_notifier_unregister_no_release(&process->notifier, process->mm); + + /* + * We can't free the structure here, because ->release might be + * attempting to grab it concurrently. And in the other case, if the + * structure is being released from within ->release, then + * __mmu_notifier_release expects to still have a valid mn when + * returning. So free the structure when it's safe, after the RCU grace + * period elapsed. + */ + mmu_notifier_call_srcu(&process->rcu, iommu_process_free); }
/* @@ -123,6 +248,318 @@ struct iommu_process *iommu_process_find(int pasid) } EXPORT_SYMBOL_GPL(iommu_process_find);
+static int iommu_process_attach(struct iommu_domain *domain, struct device *dev, + struct iommu_process *process) +{ + int err; + int pasid = process->pasid; + struct iommu_context *context; + + if (WARN_ON(!domain->ops->process_attach || !domain->ops->process_detach || + !domain->ops->process_exit || !domain->ops->process_invalidate)) + return -ENODEV; + + if (pasid > domain->max_pasid || pasid < domain->min_pasid) + return -ENOSPC; + + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) + return -ENOMEM; + + context->process = process; + context->domain = domain; + refcount_set(&context->ref, 1); + + spin_lock(&iommu_process_lock); + err = domain->ops->process_attach(domain, dev, process, true); + if (err) { + kfree(context); + spin_unlock(&iommu_process_lock); + return err; + } + + list_add(&context->process_head, &process->domains); + list_add(&context->domain_head, &domain->processes); + spin_unlock(&iommu_process_lock); + + return 0; +} + +static void iommu_context_free(struct iommu_context *context) +{ + assert_spin_locked(&iommu_process_lock); + + if (WARN_ON(!context->process || !context->domain)) + return; + + list_del(&context->process_head); + list_del(&context->domain_head); + iommu_process_put_locked(context->process); + + kfree(context); +} + +/* Attach an existing context to the device */ +static int iommu_process_attach_locked(struct iommu_context *context, + struct device *dev) +{ + assert_spin_locked(&iommu_process_lock); + + refcount_inc(&context->ref); + return context->domain->ops->process_attach(context->domain, dev, + context->process, false); +} + +/* Detach device from context and release it if necessary */ +static void iommu_process_detach_locked(struct iommu_context *context, + struct device *dev) +{ + bool last = false; + struct iommu_domain *domain = context->domain; + + assert_spin_locked(&iommu_process_lock); + + if (refcount_dec_and_test(&context->ref)) + last = true; + + domain->ops->process_detach(domain, dev, context->process, last); + + if (last) + iommu_context_free(context); +} + +/* + * Called when the process exits. Might race with unbind or any other function + * dropping the last reference to the process. As the mmu notifier doesn't hold + * any reference to the process when calling ->release, try to take a reference. + */ +static void iommu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) +{ + struct iommu_context *context, *next; + struct iommu_process *process = container_of(mn, struct iommu_process, notifier); + + /* + * If the process is exiting then domains are still attached to the + * process. A few things need to be done before it is safe to release + * + * 1) Tell the IOMMU driver to stop using this PASID (and forward the + * message to attached device drivers. It can then clear the PASID + * table and invalidate relevant TLBs. + * + * 2) Drop all references to this process, by freeing the contexts. + */ + spin_lock(&iommu_process_lock); + if (!iommu_process_get_locked(process)) { + /* Someone's already taking care of it. */ + spin_unlock(&iommu_process_lock); + return; + } + + list_for_each_entry_safe(context, next, &process->domains, process_head) { + context->domain->ops->process_exit(context->domain, process); + iommu_context_free(context); + } + spin_unlock(&iommu_process_lock); + + /* + * We're now reasonably certain that no more fault is being handled for + * this process, since we just flushed them all out of the fault queue. + * Release the last reference to free the process. 
+ */ + iommu_process_put(process); +} + +static void iommu_notifier_invalidate_range(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct iommu_context *context; + struct iommu_process *process = container_of(mn, struct iommu_process, notifier); + + spin_lock(&iommu_process_lock); + list_for_each_entry(context, &process->domains, process_head) { + context->domain->ops->process_invalidate(context->domain, + process, start, end - start); + } + spin_unlock(&iommu_process_lock); +} + +static int iommu_notifier_clear_flush_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + iommu_notifier_invalidate_range(mn, mm, start, end); + return 0; +} + +static void iommu_notifier_change_pte(struct mmu_notifier *mn, struct mm_struct *mm, + unsigned long address, pte_t pte) +{ + iommu_notifier_invalidate_range(mn, mm, address, address + PAGE_SIZE); +} + +static struct mmu_notifier_ops iommu_process_mmu_notfier = { + .release = iommu_notifier_release, + .clear_flush_young = iommu_notifier_clear_flush_young, + .change_pte = iommu_notifier_change_pte, + .invalidate_range = iommu_notifier_invalidate_range, +}; + +/** + * iommu_process_bind_device - Bind a process address space to a device + * @dev: the device + * @task: the process to bind + * @pasid: valid address where the PASID will be stored + * @flags: bond properties (IOMMU_PROCESS_BIND_*) + * + * Create a bond between device and task, allowing the device to access the + * process address space using the returned PASID. + * + * On success, 0 is returned and @pasid contains a valid ID. Otherwise, an error + * is returned. + */ +int iommu_process_bind_device(struct device *dev, struct task_struct *task, + int *pasid, int flags) +{ + int err, i; + int nesting; + struct pid *pid; + struct iommu_domain *domain; + struct iommu_process *process; + struct iommu_context *cur_context; + struct iommu_context *context = NULL; + + domain = iommu_get_domain_for_dev(dev); + if (WARN_ON(!domain)) + return -EINVAL; + + if (!iommu_domain_get_attr(domain, DOMAIN_ATTR_NESTING, &nesting) && + nesting) + return -EINVAL; + + pid = get_task_pid(task, PIDTYPE_PID); + if (!pid) + return -EINVAL; + + /* If an iommu_process already exists, use it */ + spin_lock(&iommu_process_lock); + idr_for_each_entry(&iommu_process_idr, process, i) { + if (process->pid != pid) + continue; + + if (!iommu_process_get_locked(process)) { + /* Process is defunct, create a new one */ + process = NULL; + break; + } + + /* Great, is it also bound to this domain? */ + list_for_each_entry(cur_context, &process->domains, + process_head) { + if (cur_context->domain != domain) + continue; + + context = cur_context; + *pasid = process->pasid; + + /* Splendid, tell the driver and increase the ref */ + err = iommu_process_attach_locked(context, dev); + if (err) + iommu_process_put_locked(process); + + break; + } + break; + } + spin_unlock(&iommu_process_lock); + put_pid(pid); + + if (context) + return err; + + if (!process) { + process = iommu_process_alloc(domain, task); + if (IS_ERR(process)) + return PTR_ERR(process); + } + + err = iommu_process_attach(domain, dev, process); + if (err) { + iommu_process_put(process); + return err; + } + + *pasid = process->pasid; + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_process_bind_device); + +/** + * iommu_process_unbind_device - Remove a bond created with + * iommu_process_bind_device. 
+ * + * @dev: the device + * @pasid: the pasid returned by bind + */ +int iommu_process_unbind_device(struct device *dev, int pasid) +{ + struct iommu_domain *domain; + struct iommu_process *process; + struct iommu_context *cur_context; + struct iommu_context *context = NULL; + + domain = iommu_get_domain_for_dev(dev); + if (WARN_ON(!domain)) + return -EINVAL; + + spin_lock(&iommu_process_lock); + process = idr_find(&iommu_process_idr, pasid); + if (!process) { + spin_unlock(&iommu_process_lock); + return -ESRCH; + } + + list_for_each_entry(cur_context, &process->domains, process_head) { + if (cur_context->domain == domain) { + context = cur_context; + break; + } + } + + if (context) + iommu_process_detach_locked(context, dev); + spin_unlock(&iommu_process_lock); + + return context ? 0 : -ESRCH; +} +EXPORT_SYMBOL_GPL(iommu_process_unbind_device); + +/* + * __iommu_process_unbind_dev_all - Detach all processes attached to this + * device. + * + * When detaching @device from @domain, IOMMU drivers have to use this function. + */ +void __iommu_process_unbind_dev_all(struct iommu_domain *domain, struct device *dev) +{ + struct iommu_context *context, *next; + + /* Ask device driver to stop using all PASIDs */ + spin_lock(&iommu_process_lock); + if (domain->process_exit) { + list_for_each_entry(context, &domain->processes, domain_head) + domain->process_exit(domain, dev, + context->process->pasid, + domain->process_exit_token); + } + + list_for_each_entry_safe(context, next, &domain->processes, domain_head) + iommu_process_detach_locked(context, dev); + spin_unlock(&iommu_process_lock); +} +EXPORT_SYMBOL_GPL(__iommu_process_unbind_dev_all); + /** * iommu_set_process_exit_handler() - set a callback for stopping the use of * PASID in a device. diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index dac8aeab16c9..831c5065f7f8 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1852,6 +1852,70 @@ void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_detach_group);
+/* + * iommu_process_bind_group - Share process address space with all devices in + * the group. + * @group: the iommu group + * @task: the process to bind + * @pasid: valid address where the PASID will be stored + * @flags: bond properties (IOMMU_PROCESS_BIND_*) + * + * Create a bond between group and process, allowing devices in the group to + * access the process address space using @pasid. + * + * On success, 0 is returned and @pasid contains a valid ID. Otherwise, an error + * is returned. + */ +int iommu_process_bind_group(struct iommu_group *group, + struct task_struct *task, int *pasid, int flags) +{ + struct group_device *device; + int ret = -ENODEV; + + if (!pasid) + return -EINVAL; + + if (!group->domain) + return -EINVAL; + + mutex_lock(&group->mutex); + list_for_each_entry(device, &group->devices, list) { + ret = iommu_process_bind_device(device->dev, task, pasid, + flags); + if (ret) + break; + } + + if (ret) { + list_for_each_entry_continue_reverse(device, &group->devices, list) + iommu_process_unbind_device(device->dev, *pasid); + } + mutex_unlock(&group->mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_process_bind_group); + +/** + * iommu_process_unbind_group - Remove a bond created with + * iommu_process_bind_group + * + * @group: the group + * @pasid: the pasid returned by bind + */ +int iommu_process_unbind_group(struct iommu_group *group, int pasid) +{ + struct group_device *device; + + mutex_lock(&group->mutex); + list_for_each_entry(device, &group->devices, list) + iommu_process_unbind_device(device->dev, pasid); + mutex_unlock(&group->mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(iommu_process_unbind_group); + phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { if (unlikely(domain->ops->iova_to_phys == NULL)) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index dc78957b5544..0af49bd0a6b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -284,6 +284,11 @@ struct iommu_sva_param { * @domain_free: free iommu domain * @attach_dev: attach device to an iommu domain * @detach_dev: detach device from an iommu domain + * @process_alloc: allocate iommu process + * @process_free: free iommu process + * @process_attach: attach iommu process to a domain + * @process_detach: detach iommu process from a domain. Remove PASID entry and + * flush associated TLB entries. * @process_invalidate: Invalidate a range of mappings for a process. * @process_exit: A process is exiting. Stop using the PASID, remove PASID entry * and flush associated TLB entries. @@ -330,6 +335,12 @@ struct iommu_ops {
int (*attach_dev)(struct iommu_domain *domain, struct device *dev); void (*detach_dev)(struct iommu_domain *domain, struct device *dev); + struct iommu_process *(*process_alloc)(struct task_struct *task); + void (*process_free)(struct iommu_process *process); + int (*process_attach)(struct iommu_domain *domain, struct device *dev, + struct iommu_process *process, bool first); + void (*process_detach)(struct iommu_domain *domain, struct device *dev, + struct iommu_process *process, bool last); void (*process_invalidate)(struct iommu_domain *domain, struct iommu_process *process, unsigned long iova, size_t size); @@ -703,6 +714,10 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); +extern int iommu_process_bind_group(struct iommu_group *group, + struct task_struct *task, int *pasid, + int flags); +extern int iommu_process_unbind_group(struct iommu_group *group, int pasid);
extern int iommu_sva_bind_device(struct device *dev, struct mm_struct *mm, int *pasid, unsigned long flags, void *drvdata); @@ -1059,6 +1074,19 @@ static inline int iommu_sva_unbind_device(struct device *dev, int pasid) return -ENODEV; }
+static inline int iommu_process_bind_group(struct iommu_group *group, + struct task_struct *task, int *pasid, + int flags) +{ + return -ENODEV; +} + +static inline int iommu_process_unbind_group(struct iommu_group *group, + int pasid) +{ + return -ENODEV; +} + #endif /* CONFIG_IOMMU_API */
#ifdef CONFIG_IOMMU_SVA @@ -1165,6 +1193,13 @@ extern void iommu_set_process_exit_handler(struct device *dev, extern struct iommu_process *iommu_process_find(int pasid); extern void iommu_process_put(struct iommu_process *process);
+extern int iommu_process_bind_device(struct device *dev, + struct task_struct *task, int *pasid, + int flags); +extern int iommu_process_unbind_device(struct device *dev, int pasid); +extern void __iommu_process_unbind_dev_all(struct iommu_domain *domain, + struct device *dev); + #else /* CONFIG_IOMMU_PROCESS */ static inline void iommu_set_process_exit_handler(struct device *dev, iommu_process_exit_handler_t cb, @@ -1180,6 +1215,24 @@ static inline struct iommu_process *iommu_process_find(int pasid) static inline void iommu_process_put(struct iommu_process *process) { } + +static inline int iommu_process_bind_device(struct device *dev, + struct task_struct *task, + int *pasid, int flags) +{ + return -ENODEV; +} + +static inline int iommu_process_unbind_device(struct device *dev, int pasid) +{ + return -ENODEV; +} + +static inline void __iommu_process_unbind_dev_all(struct iommu_domain *domain, + struct device *dev) +{ +} + #endif /* CONFIG_IOMMU_PROCESS */
#endif /* __LINUX_IOMMU_H */
From: Fang Lijun <fanglijun3@huawei.com>

ascend inclusion
category: feature
bugzilla: 14369
CVE: NA
--------------
Enable config IOMMU_PROCESS by default.
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/configs/euleros_defconfig   | 1 +
 arch/arm64/configs/hulk_defconfig      | 1 +
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/arm64/configs/syzkaller_defconfig | 1 +
 4 files changed, 4 insertions(+)
diff --git a/arch/arm64/configs/euleros_defconfig b/arch/arm64/configs/euleros_defconfig index 6446fd374f6a..65e898e77973 100644 --- a/arch/arm64/configs/euleros_defconfig +++ b/arch/arm64/configs/euleros_defconfig @@ -4634,6 +4634,7 @@ CONFIG_IOMMU_IO_PGTABLE_LPAE=y # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set CONFIG_IOMMU_IOVA=y CONFIG_OF_IOMMU=y +# CONFIG_IOMMU_PROCESS is not set CONFIG_IOMMU_DMA=y CONFIG_ARM_SMMU=y CONFIG_ARM_SMMU_V3=y diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index f5e57a07d1c4..68e0b0495949 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -4596,6 +4596,7 @@ CONFIG_ARM_SMMU_V3_CONTEXT=y # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set CONFIG_IOMMU_IOVA=y CONFIG_OF_IOMMU=y +CONFIG_IOMMU_PROCESS=y CONFIG_IOMMU_DMA=y CONFIG_IOMMU_SVA=y CONFIG_IOMMU_PAGE_FAULT=y diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 88c9895b5337..44a7d1661576 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -4934,6 +4934,7 @@ CONFIG_ARM_SMMU_V3_CONTEXT=y # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set CONFIG_IOMMU_IOVA=y CONFIG_OF_IOMMU=y +# CONFIG_IOMMU_PROCESS is not set CONFIG_IOMMU_DMA=y CONFIG_IOMMU_SVA=y CONFIG_IOMMU_PAGE_FAULT=y diff --git a/arch/arm64/configs/syzkaller_defconfig b/arch/arm64/configs/syzkaller_defconfig index 781bfd1c366f..05407a0de7df 100644 --- a/arch/arm64/configs/syzkaller_defconfig +++ b/arch/arm64/configs/syzkaller_defconfig @@ -4563,6 +4563,7 @@ CONFIG_ARM_SMMU_V3_CONTEXT=y # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set CONFIG_IOMMU_IOVA=y CONFIG_OF_IOMMU=y +CONFIG_IOMMU_PROCESS=y CONFIG_IOMMU_DMA=y CONFIG_IOMMU_SVA=y CONFIG_IOMMU_PAGE_FAULT=y
From: Fang Lijun <fanglijun3@huawei.com>

ascend inclusion
category: Bugfix
bugzilla: NA
CVE: NA
--------------
The system can't use the CDM nodes' memory, but it can mmap huge pages from all nodes, so a Bus error is raised when the mmap succeeds but there are not enough huge pages.

When cdmmask is set, users pass the NUMA node id through the mmap flags to map huge pages from a specific node; if there are not enough huge pages on that node, return -ENOMEM.

v2: Fix compile error when CONFIG_COHERENT_DEVICE is disabled
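A hedged user-space sketch of the resulting interface (assuming an fd on a hugetlbfs file and a kernel running with a non-empty cdmmask; the helper name is made up):

#include <sys/mman.h>

#ifndef MAP_HUGE_SHIFT
#define MAP_HUGE_SHIFT 26
#endif

/*
 * With cdmmask set, the kernel reads the target NUMA node out of the
 * MAP_HUGE_SHIFT bits and fails the mmap() with ENOMEM when that node
 * does not have enough free huge pages.
 */
static void *map_hugetlb_on_node(int fd, size_t len, unsigned int nid)
{
	return mmap(NULL, len, PROT_READ | PROT_WRITE,
		    MAP_SHARED | (nid << MAP_HUGE_SHIFT), fd, 0);
}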
Signed-off-by: Fang Lijun <fanglijun3@huawei.com>
Reviewed-by: Ding Tianhong <dingtianhong@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/hugetlbfs/inode.c    | 45 +++++++++++++++++++++++++++++++++++++++++
 include/linux/hugetlb.h |  4 ++++
 include/linux/mm.h      |  9 +++++++++
 mm/hugetlb.c            |  2 ++
 mm/mmap.c               | 13 ++++++++++++
 5 files changed, 73 insertions(+)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ef2c25b71736..2ac900f02280 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -120,6 +120,45 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); }
+/* + * Check current numa node has enough free huge pages to mmap hugetlb. + * resv_huge_pages_node: mmap hugepages but haven't used in current + * numa node. + */ +static int hugetlb_checknode(struct vm_area_struct *vma, long nr) +{ + int nid; + int ret = 0; + struct hstate *h = &default_hstate; + + spin_lock(&hugetlb_lock); + + nid = vma->vm_flags >> CHECKNODE_BITS; + + if (nid >= MAX_NUMNODES) { + ret = -EINVAL; + goto err; + } + + if (h->free_huge_pages_node[nid] < nr) { + ret = -ENOMEM; + goto err; + } else { + if (h->resv_huge_pages_node[nid] + nr > + h->free_huge_pages_node[nid]) { + ret = -ENOMEM; + goto err; + } else { + h->resv_huge_pages_node[nid] += nr; + ret = 0; + } + } + +err: + spin_unlock(&hugetlb_lock); + return ret; +} + /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -172,6 +211,12 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) inode_lock(inode); file_accessed(file);
+ if (is_set_cdmmask()) { + ret = hugetlb_checknode(vma, len >> huge_page_shift(h)); + if (ret < 0) + goto out; + } + ret = -ENOMEM; if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f9ac17a4d368..32f2837a6075 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -15,6 +15,9 @@ struct ctl_table; struct user_struct; struct mmu_gather;
+#define CHECKNODE_BITS 48 +#define CHECKNODE_MASK (~((_AC(1, UL) << CHECKNODE_BITS) - 1)) + #ifndef is_hugepd /* * Some architectures requires a hugepage directory format that is @@ -350,6 +353,7 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; + unsigned int resv_huge_pages_node[MAX_NUMNODES]; #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ struct cftype cgroup_files[5]; diff --git a/include/linux/mm.h b/include/linux/mm.h index b985af8ea7df..794d21255bfc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -70,6 +70,15 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif
+#ifdef CONFIG_COHERENT_DEVICE +static inline bool is_set_cdmmask(void) +{ + return !nodes_empty(cdmmask); +} +#else +#define is_set_cdmmask() (0) +#endif + #include <asm/page.h> #include <asm/pgtable.h> #include <asm/processor.h> diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2938b5bb7a49..0eb0c943397f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -969,6 +969,8 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, if (page && !avoid_reserve && vma_has_reserves(vma, chg)) { SetPagePrivate(page); h->resv_huge_pages--; + if (is_set_cdmmask()) + h->resv_huge_pages_node[vma->vm_flags >> CHECKNODE_BITS]--; }
mpol_cond_put(mpol); diff --git a/mm/mmap.c b/mm/mmap.c index c1034012aeaa..e1a4d3fa713e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -69,6 +69,7 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX; int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; #endif
+static unsigned long numanode; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
@@ -1531,6 +1532,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags |= VM_NORESERVE; }
+ /* set numa node id into vm_flags, + * hugetlbfs file mmap will use it to check node + */ + if (is_set_cdmmask()) + vm_flags |= ((numanode << CHECKNODE_BITS) & CHECKNODE_MASK); + addr = mmap_region(file, addr, len, vm_flags, pgoff, uf); if (!IS_ERR_VALUE(addr) && ((vm_flags & VM_LOCKED) || @@ -1546,6 +1553,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, struct file *file = NULL; unsigned long retval;
+ /* get mmap numa node id */ + if (is_set_cdmmask()) { + numanode = (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK; + flags &= ~(MAP_HUGE_MASK << MAP_HUGE_SHIFT); + } + if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); file = fget(fd);
From: Zhou Guanghui <zhouguanghui1@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
-------------------------------------------------------------
The iopf (I/O Page Fault) workqueue is used to handle device page faults.

While processing iopf work, handle_mm_fault() is invoked to allocate pages and create mappings. The CPU that allocates the memory and the device that triggered the fault are expected to have affinity to the same NUMA node, so that the allocated memory is local to both the CPU and the device. This also improves performance.
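Note that, as implemented below, the behaviour is opt-in at boot time: the iopf workqueue is only created with WQ_HIGHPRI when "enable_iopf_hipri" is passed on the kernel command line (the __setup() handler added in io-pgfault.c sets the flag); CONFIG_ASCEND_IOPF_HIPRI merely makes that parameter available.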
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/Kconfig         | 13 +++++++++++++
 drivers/iommu/io-pgfault.c | 22 +++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 4412f14547af..8842402e302d 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1362,6 +1362,19 @@ config ASCEND_OOM 0: disable oom killer 1: enable oom killer (default,compatible with mainline) 2: disable oom killer and panic on oom + +config ASCEND_IOPF_HIPRI + bool "Enable support for highpri iopf workqueue" + default y + depends on IOMMU_PAGE_FAULT + help + The iopf workqueue which means IO Page Fault workqueue is used for device + page fault. + + This option enable the high priority for iopf workqueue. If enabled, the + CPU which processes IOPF work is the same as that which processes IOPF + event interrupts. + endif
endmenu diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 9025e50dff66..271400633dae 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -48,6 +48,8 @@ struct iopf_group { struct work_struct work; };
+static int enable_iopf_hipri __read_mostly; + static int iopf_complete(struct device *dev, struct iommu_fault_event *evt, enum page_response_code status) { @@ -399,6 +401,7 @@ struct iopf_queue * iopf_queue_alloc(const char *name, iopf_queue_flush_t flush, void *cookie) { struct iopf_queue *queue; + unsigned int type = WQ_UNBOUND;
queue = kzalloc(sizeof(*queue), GFP_KERNEL); if (!queue) @@ -410,7 +413,10 @@ iopf_queue_alloc(const char *name, iopf_queue_flush_t flush, void *cookie) * that's dealt with, the high-level function can handle groups out of * order. */ - queue->wq = alloc_workqueue("iopf_queue/%s", WQ_UNBOUND, 0, name); + if (enable_iopf_hipri) + type = WQ_HIGHPRI; + + queue->wq = alloc_workqueue("iopf_queue/%s", type, 0, name); if (!queue->wq) { kfree(queue); return NULL; @@ -442,3 +448,17 @@ void iopf_queue_free(struct iopf_queue *queue) kfree(queue); } EXPORT_SYMBOL_GPL(iopf_queue_free); + +#ifdef CONFIG_ASCEND_IOPF_HIPRI + +static int __init ascend_enable_iopf_hipri(char *s) +{ + enable_iopf_hipri = 1; + + pr_info("Ascend enable iopf workqueue highpri\n"); + + return 1; +} +__setup("enable_iopf_hipri", ascend_enable_iopf_hipri); + +#endif
From: Zhou Guanghui <zhouguanghui1@huawei.com>

ascend inclusion
category: feature
bugzilla: NA
CVE: NA
------------------------------------------------------------
Enable the high-priority iopf workqueue for the hulk default config.
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Reviewed-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/arm64/configs/hulk_defconfig | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 68e0b0495949..a8365a27fb99 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -476,6 +476,7 @@ CONFIG_RANDOMIZE_MODULE_REGION_FULL=y CONFIG_ASCEND_FEATURES=y CONFIG_ASCEND_DVPP_MMAP=y CONFIG_ASCEND_OOM=y +CONFIG_ASCEND_IOPF_HIPRI=y
# # Boot options
From: Jan Kara <jack@suse.cz>

mainline inclusion
from mainline-5.9-rc1
commit 82e9a0c81aac04484e0f44be50e62ccca442084f
category: bugfix
bugzilla: 39268
CVE: NA

---------------------------
ext4_setup_system_zone() can fail. Handle the failure in ext4_remount().
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Luo Meng <luomeng12@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/ext4/super.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a09d52b864c..3647e6a8b2b6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5473,7 +5473,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); }
- ext4_setup_system_zone(sb); + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); if (err)
From: Jan Kara <jack@suse.cz>

mainline inclusion
from mainline-5.9-rc1
commit adb2de050fd5662242f1988541693add29580e82
category: bugfix
bugzilla: 39268
CVE: NA

---------------------------
Currently, add_system_zone() just silently merges two added system zones that overlap. However, the overlap should not happen and it generally suggests that some unrelated metadata overlaps, which indicates the fs is corrupted. We should have caught such problems earlier (e.g. in ext4_check_descriptors()) but add this check as another line of defense. In a later patch we also use this for stricter checking of the journal inode extent tree.
Reviewed-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Luo Meng <luomeng12@huawei.com>
Reviewed-by: zhangyi (F) <yi.zhang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/ext4/block_validity.c | 36 +++++++++++++-----------------------
 1 file changed, 13 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index ff8e1205127e..ceb54ccc937e 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -68,7 +68,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, unsigned int count) { - struct ext4_system_zone *new_entry = NULL, *entry; + struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; struct rb_node *parent = NULL, *new_node = NULL;
@@ -79,30 +79,20 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, n = &(*n)->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; - else { - if (start_blk + count > (entry->start_blk + - entry->count)) - entry->count = (start_blk + count - - entry->start_blk); - new_node = *n; - new_entry = rb_entry(new_node, struct ext4_system_zone, - node); - break; - } + else /* Unexpected overlap of system zones. */ + return -EFSCORRUPTED; }
- if (!new_entry) { - new_entry = kmem_cache_alloc(ext4_system_zone_cachep, - GFP_KERNEL); - if (!new_entry) - return -ENOMEM; - new_entry->start_blk = start_blk; - new_entry->count = count; - new_node = &new_entry->node; - - rb_link_node(new_node, parent, n); - rb_insert_color(new_node, &system_blks->root); - } + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &system_blks->root);
/* Can we merge to the left? */ node = rb_prev(new_node);
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.9-rc1 commit 80681a7fa4bf45cf04e082c45055ba490eb49935 category: bugfix bugzilla: 39268 CVE: NA ---------------------------
Currently, system zones just track ranges of blocks that are "important" fs metadata (bitmaps, group descriptors, journal blocks, etc.). This however complicates how the extent tree (or indirect blocks) can be checked for inodes that actually track such metadata - currently the journal inode, but arguably we should be treating quota files or the resize inode similarly. We cannot run __ext4_ext_check() on such metadata inodes when loading their extents, as that would immediately trigger the validity checks, and so we just hack around that and special-case the journal inode. This however leads to a situation where a journal inode whose extent tree has a depth of at least one can have an invalid extent tree that goes unnoticed until ext4_cache_extents() crashes.
To overcome this limitation, track the inode number each system zone belongs to (0 is used for zones not belonging to any inode). We can then verify that the inode number matches the expected one when verifying the extent tree and thus avoid the false errors. With this there's no need to special-case the journal inode during extent tree checking anymore, so remove it.
Fixes: 0a944e8a6c66 ("ext4: don't perform block validity checks on the journal inode") Reported-by: Wolfgang Frisch wolfgang.frisch@suse.com Reviewed-by: Lukas Czerner lczerner@redhat.com Signed-off-by: Jan Kara jack@suse.cz
Conflict: fs/ext4/block_validity.c 54d3adbc29f0c ("ext4: save all error info in save_error_info() ...")
Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/block_validity.c | 46 ++++++++++++++++++++-------------------- fs/ext4/ext4.h | 6 +++--- fs/ext4/extents.c | 16 ++++++-------- fs/ext4/indirect.c | 6 ++---- fs/ext4/inode.c | 5 ++--- fs/ext4/mballoc.c | 4 ++-- 6 files changed, 38 insertions(+), 45 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index ceb54ccc937e..9b96716c46b6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -24,6 +24,7 @@ struct ext4_system_zone { struct rb_node node; ext4_fsblk_t start_blk; unsigned int count; + u32 ino; };
static struct kmem_cache *ext4_system_zone_cachep; @@ -45,7 +46,8 @@ void ext4_exit_system_zone(void) static inline int can_merge(struct ext4_system_zone *entry1, struct ext4_system_zone *entry2) { - if ((entry1->start_blk + entry1->count) == entry2->start_blk) + if ((entry1->start_blk + entry1->count) == entry2->start_blk && + entry1->ino == entry2->ino) return 1; return 0; } @@ -66,7 +68,7 @@ static void release_system_zone(struct ext4_system_blocks *system_blks) */ static int add_system_zone(struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, u32 ino) { struct ext4_system_zone *new_entry, *entry; struct rb_node **n = &system_blks->root.rb_node, *node; @@ -89,6 +91,7 @@ static int add_system_zone(struct ext4_system_blocks *system_blks, return -ENOMEM; new_entry->start_blk = start_blk; new_entry->count = count; + new_entry->ino = ino; new_node = &new_entry->node;
rb_link_node(new_node, parent, n); @@ -145,7 +148,7 @@ static void debug_print_tree(struct ext4_sb_info *sbi) static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, struct ext4_system_blocks *system_blks, ext4_fsblk_t start_blk, - unsigned int count) + unsigned int count, ino_t ino) { struct ext4_system_zone *entry; struct rb_node *n; @@ -169,7 +172,7 @@ static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, n = n->rb_right; else { sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; + return entry->ino == ino; } } return 1; @@ -204,17 +207,15 @@ static int ext4_protect_reserved_inode(struct super_block *sb, if (n == 0) { i++; } else { - if (!ext4_data_block_valid_rcu(sbi, system_blks, - map.m_pblk, n)) { - ext4_error(sb, "blocks %llu-%llu from inode %u " - "overlap system zone", map.m_pblk, - map.m_pblk + map.m_len - 1, ino); - err = -EFSCORRUPTED; + err = add_system_zone(system_blks, map.m_pblk, n, ino); + if (err < 0) { + if (err == -EFSCORRUPTED) { + ext4_error(sb, "blocks %llu-%llu from inode %u " + "overlap system zone", map.m_pblk, + map.m_pblk + map.m_len - 1, ino); + } break; } - err = add_system_zone(system_blks, map.m_pblk, n); - if (err < 0) - break; i += n; } } @@ -268,19 +269,19 @@ int ext4_setup_system_zone(struct super_block *sb) ((i < 5) || ((i % flex_size) == 0))) add_system_zone(system_blks, ext4_group_first_block_no(sb, i), - ext4_bg_num_gdb(sb, i) + 1); + ext4_bg_num_gdb(sb, i) + 1, 0); gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, - ext4_block_bitmap(sb, gdp), 1); + ext4_block_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, - ext4_inode_bitmap(sb, gdp), 1); + ext4_inode_bitmap(sb, gdp), 1, 0); if (ret) goto err; ret = add_system_zone(system_blks, ext4_inode_table(sb, gdp), - sbi->s_itb_per_group); + sbi->s_itb_per_group, 0); if (ret) goto err; } @@ -329,7 +330,7 @@ void ext4_release_system_zone(struct super_block *sb) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); }
-int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, +int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { struct ext4_system_blocks *system_blks; @@ -341,9 +342,9 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, * mount option. */ rcu_read_lock(); - system_blks = rcu_dereference(sbi->system_blks); - ret = ext4_data_block_valid_rcu(sbi, system_blks, start_blk, - count); + system_blks = rcu_dereference(EXT4_SB(inode->i_sb)->system_blks); + ret = ext4_data_block_valid_rcu(EXT4_SB(inode->i_sb), system_blks, + start_blk, count, inode->i_ino); rcu_read_unlock(); return ret; } @@ -363,8 +364,7 @@ int ext4_check_blockref(const char *function, unsigned int line, while (bref < p+max) { blk = le32_to_cpu(*bref++); if (blk && - unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), - blk, 1))) { + unlikely(!ext4_inode_block_valid(inode, blk, 1))) { es->s_last_error_block = cpu_to_le64(blk); ext4_error_inode(inode, function, line, blk, "invalid block"); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 83bec07f5014..4491b9911807 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -3157,9 +3157,9 @@ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); -extern int ext4_data_block_valid(struct ext4_sb_info *sbi, - ext4_fsblk_t start_blk, - unsigned int count); +extern int ext4_inode_block_valid(struct inode *inode, + ext4_fsblk_t start_blk, + unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74144a79a506..56f1ff5eae78 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -389,7 +389,7 @@ static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) */ if (lblock + len <= lblock) return 0; - return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); + return ext4_inode_block_valid(inode, block, len); }
static int ext4_valid_extent_idx(struct inode *inode, @@ -397,7 +397,7 @@ static int ext4_valid_extent_idx(struct inode *inode, { ext4_fsblk_t block = ext4_idx_pblock(ext_idx);
- return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); + return ext4_inode_block_valid(inode, block, 1); }
static int ext4_valid_extent_entries(struct inode *inode, @@ -554,14 +554,10 @@ __read_extent_tree_block(const char *function, unsigned int line, } if (buffer_verified(bh) && !(flags & EXT4_EX_FORCE_CACHE)) return bh; - if (!ext4_has_feature_journal(inode->i_sb) || - (inode->i_ino != - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) { - err = __ext4_ext_check(function, line, inode, - ext_block_hdr(bh), depth, pblk); - if (err) - goto errout; - } + err = __ext4_ext_check(function, line, inode, + ext_block_hdr(bh), depth, pblk); + if (err) + goto errout; set_buffer_verified(bh); /* * If this is a leaf block, cache all of its entries diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 2625dabd142b..0385e94a2120 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -888,8 +888,7 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode, else if (ext4_should_journal_data(inode)) flags |= EXT4_FREE_BLOCKS_FORGET;
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, - count)) { + if (!ext4_inode_block_valid(inode, block_to_free, count)) { EXT4_ERROR_INODE(inode, "attempt to clear invalid " "blocks %llu len %lu", (unsigned long long) block_to_free, count); @@ -1034,8 +1033,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, if (!nr) continue; /* A hole */
- if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), - nr, 1)) { + if (!ext4_inode_block_valid(inode, nr, 1)) { EXT4_ERROR_INODE(inode, "invalid indirect mapped " "block %lu (level %d)", diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5a82fff11651..89a363a2e0bf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -386,8 +386,7 @@ static int __check_block_validity(struct inode *inode, const char *func, (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; - if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, - map->m_len)) { + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, @@ -5065,7 +5064,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = 0; if (ei->i_file_acl && - !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { + !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 392aa10efda7..ca402e3f014d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2992,7 +2992,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); - if (!ext4_data_block_valid(sbi, block, len)) { + if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error @@ -4755,7 +4755,7 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && - !ext4_data_block_valid(sbi, block, count)) { + !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); goto error_return;
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.9-rc1 commit f97ad45a0d856d71bb0c37aa9a862c19f0c161eb category: bugfix bugzilla: 39268 CVE: NA ---------------------------
After the previous patch, ext4_data_block_valid_rcu() has a single caller. Fold it into that caller.
Reviewed-by: Lukas Czerner lczerner@redhat.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/block_validity.c | 76 ++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 42 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 9b96716c46b6..117226d9c70f 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -140,44 +140,6 @@ static void debug_print_tree(struct ext4_sb_info *sbi) printk(KERN_CONT "\n"); }
-/* - * Returns 1 if the passed-in block region (start_blk, - * start_blk+count) is valid; 0 if some part of the block region - * overlaps with filesystem metadata blocks. - */ -static int ext4_data_block_valid_rcu(struct ext4_sb_info *sbi, - struct ext4_system_blocks *system_blks, - ext4_fsblk_t start_blk, - unsigned int count, ino_t ino) -{ - struct ext4_system_zone *entry; - struct rb_node *n; - - if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || - (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return 0; - } - - if (system_blks == NULL) - return 1; - - n = system_blks->root.rb_node; - while (n) { - entry = rb_entry(n, struct ext4_system_zone, node); - if (start_blk + count - 1 < entry->start_blk) - n = n->rb_left; - else if (start_blk >= (entry->start_blk + entry->count)) - n = n->rb_right; - else { - sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); - return entry->ino == ino; - } - } - return 1; -} - static int ext4_protect_reserved_inode(struct super_block *sb, struct ext4_system_blocks *system_blks, u32 ino) @@ -330,11 +292,26 @@ void ext4_release_system_zone(struct super_block *sb) call_rcu(&system_blks->rcu, ext4_destroy_system_zone); }
+/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with some other filesystem metadata blocks. + */ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_system_blocks *system_blks; - int ret; + struct ext4_system_zone *entry; + struct rb_node *n; + int ret = 1; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count < start_blk) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + return 0; + }
/* * Lock the system zone to prevent it being released concurrently @@ -342,9 +319,24 @@ int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, * mount option. */ rcu_read_lock(); - system_blks = rcu_dereference(EXT4_SB(inode->i_sb)->system_blks); - ret = ext4_data_block_valid_rcu(EXT4_SB(inode->i_sb), system_blks, - start_blk, count, inode->i_ino); + system_blks = rcu_dereference(sbi->system_blks); + if (system_blks == NULL) + goto out_rcu; + + n = system_blks->root.rb_node; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); + ret = (entry->ino == inode->i_ino); + break; + } + } +out_rcu: rcu_read_unlock(); return ret; }
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.9-rc1 commit 20c8c96f0113ec4b19d25ec1131971efd9313967 category: bugfix bugzilla: 39268 CVE: NA ---------------------------
There's one place that fails to handle an error from the add_system_zone() call, and thus we can fail to protect superblock and group-descriptor blocks properly in case of ENOMEM. Fix it.
Reported-by: Lukas Czerner lczerner@redhat.com Reviewed-by: Lukas Czerner lczerner@redhat.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Luo Meng luomeng12@huawei.com Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/block_validity.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 117226d9c70f..2c68f68560c6 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -228,10 +228,13 @@ int ext4_setup_system_zone(struct super_block *sb) for (i=0; i < ngroups; i++) { cond_resched(); if (ext4_bg_has_super(sb, i) && - ((i < 5) || ((i % flex_size) == 0))) - add_system_zone(system_blks, + ((i < 5) || ((i % flex_size) == 0))) { + ret = add_system_zone(system_blks, ext4_group_first_block_no(sb, i), ext4_bg_num_gdb(sb, i) + 1, 0); + if (ret) + goto err; + } gdp = ext4_get_group_desc(sb, i, NULL); ret = add_system_zone(system_blks, ext4_block_bitmap(sb, gdp), 1, 0);
From: Jan Kara jack@suse.cz
mainline inclusion from mainline-5.9-rc1 commit 0f9be45c08148d3c9686671acaf08579b49ba2f0 category: bugfix bugzilla: 39268 CVE: NA ---------------------------
When remounting the filesystem fails late during remount handling and the block_validity mount option was also changed during the remount, we fail to restore the system zone information to a state matching the mount option. This is mostly harmless; the block validity checking just will not match the situation described by the mount option. Make sure these two are always consistent.
Reported-by: Lukas Czerner lczerner@redhat.com Reviewed-by: Lukas Czerner lczerner@redhat.com Signed-off-by: Jan Kara jack@suse.cz Reviewed-by: zhangyi (F) yi.zhang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/block_validity.c | 8 -------- fs/ext4/super.c | 29 +++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 2c68f68560c6..817e3896462f 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -213,14 +213,6 @@ int ext4_setup_system_zone(struct super_block *sb) int flex_size = ext4_flex_bg_size(sbi); int ret;
- if (!test_opt(sb, BLOCK_VALIDITY)) { - if (sbi->system_blks) - ext4_release_system_zone(sb); - return 0; - } - if (sbi->system_blks) - return 0; - system_blks = kzalloc(sizeof(*system_blks), GFP_KERNEL); if (!system_blks) return -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3647e6a8b2b6..b6b6a25cce08 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -4522,11 +4522,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_set_resv_clusters(sb);
- err = ext4_setup_system_zone(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize system " - "zone (%d)", err); - goto failed_mount4a; + if (test_opt(sb, BLOCK_VALIDITY)) { + err = ext4_setup_system_zone(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize system " + "zone (%d)", err); + goto failed_mount4a; + } }
ext4_ext_init(sb); @@ -5473,9 +5475,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_register_li_request(sb, first_not_zeroed); }
- err = ext4_setup_system_zone(sb); - if (err) - goto restore_opts; + /* + * Handle creation of system zone data early because it can fail. + * Releasing of existing data is done when we are sure remount will + * succeed. + */ + if (test_opt(sb, BLOCK_VALIDITY) && !sbi->system_blks) { + err = ext4_setup_system_zone(sb); + if (err) + goto restore_opts; + }
if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { err = ext4_commit_super(sb, 1); @@ -5497,6 +5506,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } #endif + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb);
*flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); @@ -5512,6 +5523,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) + ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < EXT4_MAXQUOTAS; i++) {
hulk inclusion category: bugfix bugzilla: CVE: NA
---------------------------
Add the skcd->no_refcnt check which was missed when backporting ad0f75e5f57c ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()").
This patch is needed in stable-4.9, stable-4.14 and stable-4.19.
Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/cgroup/cgroup.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 4e3653400aa4..4aa4821a9c6a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5955,6 +5955,8 @@ void cgroup_sk_clone(struct sock_cgroup_data *skcd) { /* Socket clone path */ if (skcd->val) { + if (skcd->no_refcnt) + return; /* * We might be cloning a socket which is left in an empty * cgroup and the cgroup might have already been rmdir'd.
From: Calvin Walton calvin.walton@kepstin.ca
mainline inclusion from mainline-v5.1-rc1 commit 9392bd98bba760be96ee category: feature bugzilla: NA CVE: NA ---------------------------
Based on the Open-Source Register Reference for AMD Family 17h Processors Models 00h-2Fh: https://support.amd.com/TechDocs/56255_OSRR.pdf
These processors report RAPL support in bit 14 of CPUID 0x80000007 EDX, and the following MSRs are present:
0xc0010299 (RAPL_PWR_UNIT), like Intel's RAPL_POWER_UNIT
0xc001029a (CORE_ENERGY_STAT), kind of like Intel's PP0_ENERGY_STATUS
0xc001029b (PKG_ENERGY_STAT), like Intel's PKG_ENERGY_STATUS
A notable difference from the Intel implementation is that AMD reports the "Cores" energy usage separately for each core, rather than a per-package total. The code has been adjusted to handle either case in a generic way.
I haven't yet enabled collection of package power, due to being unable to test it on multi-node systems (TR, EPYC).
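For illustration, here is a minimal user-space sketch (not part of the patch) that reads these MSRs through the msr driver and converts the core energy counter using the unit encoding described above; the device path, CPU number and error handling are assumptions for the example only:

#include <fcntl.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_RAPL_PWR_UNIT	0xc0010299
#define MSR_CORE_ENERGY_STAT	0xc001029a

/* Read one MSR via the msr driver; the register number is the file offset. */
static uint64_t rdmsr(int fd, uint32_t reg)
{
	uint64_t val = 0;

	pread(fd, &val, sizeof(val), reg);
	return val;
}

int main(void)
{
	int fd = open("/dev/cpu/0/msr", O_RDONLY);
	double units;

	if (fd < 0)
		return 1;
	/* Bits 12:8 of RAPL_PWR_UNIT encode the energy unit as 1/2^ESU Joules. */
	units = ldexp(1.0, -(int)((rdmsr(fd, MSR_RAPL_PWR_UNIT) >> 8) & 0x1f));
	printf("core energy: %.6f J\n",
	       (double)(rdmsr(fd, MSR_CORE_ENERGY_STAT) & 0xffffffff) * units);
	close(fd);
	return 0;
}

Build with something like "gcc rapl.c -lm" and run as root with the msr module loaded.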
Signed-off-by: Calvin Walton calvin.walton@kepstin.ca Signed-off-by: Len Brown len.brown@intel.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/power/x86/turbostat/turbostat.c | 159 +++++++++++++++++++++----- 1 file changed, 130 insertions(+), 29 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 2233cf722c69..f9439fc42a02 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -44,6 +44,7 @@ #include <cpuid.h> #include <linux/capability.h> #include <errno.h> +#include <math.h>
char *proc_stat = "/proc/stat"; FILE *outf; @@ -141,9 +142,21 @@ unsigned int first_counter_read = 1;
#define RAPL_CORES_ENERGY_STATUS (1 << 9) /* 0x639 MSR_PP0_ENERGY_STATUS */ +#define RAPL_PER_CORE_ENERGY (1 << 10) + /* Indicates cores energy collection is per-core, + * not per-package. */ +#define RAPL_AMD_F17H (1 << 11) + /* 0xc0010299 MSR_RAPL_PWR_UNIT */ + /* 0xc001029a MSR_CORE_ENERGY_STAT */ + /* 0xc001029b MSR_PKG_ENERGY_STAT */ #define RAPL_CORES (RAPL_CORES_ENERGY_STATUS | RAPL_CORES_POWER_LIMIT) #define TJMAX_DEFAULT 100
+/* MSRs that are not yet in the kernel-provided header. */ +#define MSR_RAPL_PWR_UNIT 0xc0010299 +#define MSR_CORE_ENERGY_STAT 0xc001029a +#define MSR_PKG_ENERGY_STAT 0xc001029b + #define MAX(a, b) ((a) > (b) ? (a) : (b))
/* @@ -187,6 +200,7 @@ struct core_data { unsigned long long c7; unsigned long long mc6_us; /* duplicate as per-core for now, even though per module */ unsigned int core_temp_c; + unsigned int core_energy; /* MSR_CORE_ENERGY_STAT */ unsigned int core_id; unsigned long long counter[MAX_ADDED_COUNTERS]; } *core_even, *core_odd; @@ -684,6 +698,14 @@ void print_header(char *delim) if (DO_BIC(BIC_CoreTmp)) outp += sprintf(outp, "%sCoreTmp", (printed++ ? delim : ""));
+ if (do_rapl && !rapl_joules) { + if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); + } else if (do_rapl && rapl_joules) { + if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); + } + for (mp = sys.cp; mp; mp = mp->next) { if (mp->format == FORMAT_RAW) { if (mp->width == 64) @@ -738,7 +760,7 @@ void print_header(char *delim) if (do_rapl && !rapl_joules) { if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, "%sPkgWatt", (printed++ ? delim : "")); - if (DO_BIC(BIC_CorWatt)) + if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCorWatt", (printed++ ? delim : "")); if (DO_BIC(BIC_GFXWatt)) outp += sprintf(outp, "%sGFXWatt", (printed++ ? delim : "")); @@ -751,7 +773,7 @@ void print_header(char *delim) } else if (do_rapl && rapl_joules) { if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, "%sPkg_J", (printed++ ? delim : "")); - if (DO_BIC(BIC_Cor_J)) + if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, "%sCor_J", (printed++ ? delim : "")); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, "%sGFX_J", (printed++ ? delim : "")); @@ -812,6 +834,7 @@ int dump_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, "c6: %016llX\n", c->c6); outp += sprintf(outp, "c7: %016llX\n", c->c7); + outp += sprintf(outp, "Joules: %0X\n", c->core_energy);
for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { outp += sprintf(outp, "cADDED [%d] msr0x%x: %08llX\n", @@ -1037,6 +1060,20 @@ int format_counters(struct thread_data *t, struct core_data *c, } }
+ /* + * If measurement interval exceeds minimum RAPL Joule Counter range, + * indicate that results are suspect by printing "**" in fraction place. + */ + if (interval_float < rapl_joule_counter_range) + fmt8 = "%s%.2f"; + else + fmt8 = "%6.0f**"; + + if (DO_BIC(BIC_CorWatt) && (do_rapl & RAPL_PER_CORE_ENERGY)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units / interval_float); + if (DO_BIC(BIC_Cor_J) && (do_rapl & RAPL_PER_CORE_ENERGY)) + outp += sprintf(outp, fmt8, (printed++ ? delim : ""), c->core_energy * rapl_energy_units); + /* print per-package data only for 1st core in package */ if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE)) goto done; @@ -1089,18 +1126,9 @@ int format_counters(struct thread_data *t, struct core_data *c, if (DO_BIC(BIC_SYS_LPI)) outp += sprintf(outp, "%s%.2f", (printed++ ? delim : ""), 100.0 * p->sys_lpi / 1000000.0 / interval_float);
- /* - * If measurement interval exceeds minimum RAPL Joule Counter range, - * indicate that results are suspect by printing "**" in fraction place. - */ - if (interval_float < rapl_joule_counter_range) - fmt8 = "%s%.2f"; - else - fmt8 = "%6.0f**"; - if (DO_BIC(BIC_PkgWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units / interval_float); - if (DO_BIC(BIC_CorWatt)) + if (DO_BIC(BIC_CorWatt) && !(do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units / interval_float); if (DO_BIC(BIC_GFXWatt)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units / interval_float); @@ -1108,7 +1136,7 @@ int format_counters(struct thread_data *t, struct core_data *c, outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_dram * rapl_dram_energy_units / interval_float); if (DO_BIC(BIC_Pkg_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_pkg * rapl_energy_units); - if (DO_BIC(BIC_Cor_J)) + if (DO_BIC(BIC_Cor_J) && !(do_rapl & RAPL_PER_CORE_ENERGY)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_cores * rapl_energy_units); if (DO_BIC(BIC_GFX_J)) outp += sprintf(outp, fmt8, (printed++ ? delim : ""), p->energy_gfx * rapl_energy_units); @@ -1253,6 +1281,8 @@ delta_core(struct core_data *new, struct core_data *old) old->core_temp_c = new->core_temp_c; old->mc6_us = new->mc6_us - old->mc6_us;
+ DELTA_WRAP32(new->core_energy, old->core_energy); + for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) old->counter[i] = new->counter[i]; @@ -1395,6 +1425,7 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data c->c7 = 0; c->mc6_us = 0; c->core_temp_c = 0; + c->core_energy = 0;
p->pkg_wtd_core_c0 = 0; p->pkg_any_core_c0 = 0; @@ -1477,6 +1508,8 @@ int sum_counters(struct thread_data *t, struct core_data *c,
average.cores.core_temp_c = MAX(average.cores.core_temp_c, c->core_temp_c);
+ average.cores.core_energy += c->core_energy; + for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (mp->format == FORMAT_RAW) continue; @@ -1849,6 +1882,12 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) c->core_temp_c = tcc_activation_temp - ((msr >> 16) & 0x7F); }
+ if (do_rapl & RAPL_AMD_F17H) { + if (get_msr(cpu, MSR_CORE_ENERGY_STAT, &msr)) + return -14; + c->core_energy = msr & 0xFFFFFFFF; + } + for (i = 0, mp = sys.cp; mp; i++, mp = mp->next) { if (get_mp(cpu, mp, &c->counter[i])) return -10; @@ -3738,7 +3777,7 @@ int print_perf_limit(struct thread_data *t, struct core_data *c, struct pkg_data #define RAPL_POWER_GRANULARITY 0x7FFF /* 15 bit power granularity */ #define RAPL_TIME_GRANULARITY 0x3F /* 6 bit time granularity */
-double get_tdp(unsigned int model) +double get_tdp_intel(unsigned int model) { unsigned long long msr;
@@ -3755,6 +3794,16 @@ double get_tdp(unsigned int model) } }
+double get_tdp_amd(unsigned int family) +{ + switch (family) { + case 0x17: + default: + /* This is the max stock TDP of HEDT/Server Fam17h chips */ + return 250.0; + } +} + /* * rapl_dram_energy_units_probe() * Energy units are either hard-coded, or come from RAPL Energy Unit MSR. @@ -3776,21 +3825,12 @@ rapl_dram_energy_units_probe(int model, double rapl_energy_units) } }
- -/* - * rapl_probe() - * - * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units - */ -void rapl_probe(unsigned int family, unsigned int model) +void rapl_probe_intel(unsigned int family, unsigned int model) { unsigned long long msr; unsigned int time_unit; double tdp;
- if (!genuine_intel) - return; - if (family != 6) return;
@@ -3920,13 +3960,66 @@ void rapl_probe(unsigned int family, unsigned int model)
rapl_time_units = 1.0 / (1 << (time_unit));
- tdp = get_tdp(model); + tdp = get_tdp_intel(model); + + rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; + if (!quiet) + fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); +} + +void rapl_probe_amd(unsigned int family, unsigned int model) +{ + unsigned long long msr; + unsigned int eax, ebx, ecx, edx; + unsigned int has_rapl = 0; + double tdp; + + if (max_extended_level >= 0x80000007) { + __cpuid(0x80000007, eax, ebx, ecx, edx); + /* RAPL (Fam 17h) */ + has_rapl = edx & (1 << 14); + } + + if (!has_rapl) + return; + + switch (family) { + case 0x17: /* Zen, Zen+ */ + do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; + if (rapl_joules) + BIC_PRESENT(BIC_Cor_J); + else + BIC_PRESENT(BIC_CorWatt); + break; + default: + return; + } + + if (get_msr(base_cpu, MSR_RAPL_PWR_UNIT, &msr)) + return; + + rapl_time_units = ldexp(1.0, -(msr >> 16 & 0xf)); + rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f)); + rapl_power_units = ldexp(1.0, -(msr & 0xf)); + + tdp = get_tdp_amd(model);
rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet) fprintf(outf, "RAPL: %.0f sec. Joule Counter Range, at %.0f Watts\n", rapl_joule_counter_range, tdp); +}
- return; +/* + * rapl_probe() + * + * sets do_rapl, rapl_power_units, rapl_energy_units, rapl_time_units + */ +void rapl_probe(unsigned int family, unsigned int model) +{ + if (genuine_intel) + rapl_probe_intel(family, model); + if (authentic_amd) + rapl_probe_amd(family, model); }
void perf_limit_reasons_probe(unsigned int family, unsigned int model) @@ -4032,6 +4125,7 @@ void print_power_limit_msr(int cpu, unsigned long long msr, char *label) int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) { unsigned long long msr; + const char *msr_name; int cpu;
if (!do_rapl) @@ -4047,10 +4141,17 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -1; }
- if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) - return -1; + if (do_rapl & RAPL_AMD_F17H) { + msr_name = "MSR_RAPL_PWR_UNIT"; + if (get_msr(cpu, MSR_RAPL_PWR_UNIT, &msr)) + return -1; + } else { + msr_name = "MSR_RAPL_POWER_UNIT"; + if (get_msr(cpu, MSR_RAPL_POWER_UNIT, &msr)) + return -1; + }
- fprintf(outf, "cpu%d: MSR_RAPL_POWER_UNIT: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr, + fprintf(outf, "cpu%d: %s: 0x%08llx (%f Watts, %f Joules, %f sec.)\n", cpu, msr_name, msr, rapl_power_units, rapl_energy_units, rapl_time_units);
if (do_rapl & RAPL_PKG_POWER_INFO) {
From: Calvin Walton calvin.walton@kepstin.ca
mainline inclusion from mainline-v5.1-rc1 commit 3316f99a9f1b68c578c5 category: feature bugzilla: NA CVE: NA ---------------------------
The package power can also be read from an MSR. It's not clear exactly what is included, and whether it's aggregated over all nodes or reported separately.
It does look like this is reported separately per CCX (I get a single value on the Ryzen R7 1700), but it might be reported separately per-die (node?) on larger processors. If that's the case, it would have to be recorded per node and aggregated for the socket.
Note that although Zen has these MSRs reporting power, it looks like the actual RAPL configuration (power limits, configured TDP) is done through PCI configuration space. I have not yet found any public documentation for this.
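As a rough illustration of how such a 32-bit energy counter is turned into power (mirroring the wrap-around handling turbostat's DELTA_WRAP32() applies to these counters; the function name here is illustrative):

#include <stdint.h>

/* Average package power from two PKG_ENERGY_STAT samples taken
 * interval_sec apart; unsigned 32-bit subtraction absorbs one wrap. */
static double pkg_watts(uint32_t before, uint32_t after,
			double energy_units, double interval_sec)
{
	uint32_t delta = after - before;

	return delta * energy_units / interval_sec;
}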
Signed-off-by: Calvin Walton calvin.walton@kepstin.ca Signed-off-by: Len Brown len.brown@intel.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/power/x86/turbostat/turbostat.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index f9439fc42a02..712a204a7d4c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -1977,6 +1977,11 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p) return -16; p->rapl_dram_perf_status = msr & 0xFFFFFFFF; } + if (do_rapl & RAPL_AMD_F17H) { + if (get_msr(cpu, MSR_PKG_ENERGY_STAT, &msr)) + return -13; + p->energy_pkg = msr & 0xFFFFFFFF; + } if (DO_BIC(BIC_PkgTmp)) { if (get_msr(cpu, MSR_IA32_PACKAGE_THERM_STATUS, &msr)) return -17; @@ -3986,10 +3991,13 @@ void rapl_probe_amd(unsigned int family, unsigned int model) switch (family) { case 0x17: /* Zen, Zen+ */ do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; - if (rapl_joules) + if (rapl_joules) { + BIC_PRESENT(BIC_Pkg_J); BIC_PRESENT(BIC_Cor_J); - else + } else { + BIC_PRESENT(BIC_PkgWatt); BIC_PRESENT(BIC_CorWatt); + } break; default: return;
From: Pu Wen puwen@hygon.cn
mainline inclusion from mainline-v5.3-rc7 commit 9cfa8e0 category: feature bugzilla: NA CVE: NA ---------------------------
Commit 9392bd98bba760be96ee ("tools/power turbostat: Add support for AMD Fam 17h (Zen) RAPL") added the function get_tdp_amd(), whose parameter is the CPU family. But rapl_probe_amd() calls it with the wrong argument, the model. Fix the caller of get_tdp_amd() to pass the family.
Cc: stable@vger.kernel.org # v5.1+ Signed-off-by: Pu Wen puwen@hygon.cn Reviewed-by: Calvin Walton calvin.walton@kepstin.ca Signed-off-by: Len Brown len.brown@intel.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/power/x86/turbostat/turbostat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 712a204a7d4c..07cda2743104 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -4010,7 +4010,7 @@ void rapl_probe_amd(unsigned int family, unsigned int model) rapl_energy_units = ldexp(1.0, -(msr >> 8 & 0x1f)); rapl_power_units = ldexp(1.0, -(msr & 0xf));
- tdp = get_tdp_amd(model); + tdp = get_tdp_amd(family);
rapl_joule_counter_range = 0xFFFFFFFF * rapl_energy_units / tdp; if (!quiet)
From: Pu Wen puwen@hygon.cn
mainline inclusion from mainline-v5.3-rc7 commit c1c10cc category: feature bugzilla: NA CVE: NA ---------------------------
Commit 9392bd98bba760be96ee ("tools/power turbostat: Add support for AMD Fam 17h (Zen) RAPL") and commit 3316f99a9f1b68c578c5 ("tools/power turbostat: Also read package power on AMD F17h (Zen)") added AMD Fam 17h RAPL support.
Hygon Family 18h (Dhyana) reports RAPL support in bit 14 of CPUID 0x80000007 EDX, and has the MSRs RAPL_PWR_UNIT/CORE_ENERGY_STAT/PKG_ENERGY_STAT. So add Hygon Dhyana Family 18h support for RAPL.
This has already been tested on Hygon multi-node systems, and it shows correct per-core energy usage and the total package power.
Signed-off-by: Pu Wen puwen@hygon.cn Reviewed-by: Calvin Walton calvin.walton@kepstin.ca Signed-off-by: Len Brown len.brown@intel.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/power/x86/turbostat/turbostat.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c index 07cda2743104..1c36b07bbc9c 100644 --- a/tools/power/x86/turbostat/turbostat.c +++ b/tools/power/x86/turbostat/turbostat.c @@ -73,6 +73,7 @@ unsigned int do_irtl_hsw; unsigned int units = 1000000; /* MHz etc */ unsigned int genuine_intel; unsigned int authentic_amd; +unsigned int hygon_genuine; unsigned int max_level, max_extended_level; unsigned int has_invariant_tsc; unsigned int do_nhm_platform_info; @@ -1718,7 +1719,7 @@ void get_apic_id(struct thread_data *t) if (!DO_BIC(BIC_X2APIC)) return;
- if (authentic_amd) { + if (authentic_amd || hygon_genuine) { unsigned int topology_extensions;
if (max_extended_level < 0x8000001e) @@ -3803,6 +3804,7 @@ double get_tdp_amd(unsigned int family) { switch (family) { case 0x17: + case 0x18: default: /* This is the max stock TDP of HEDT/Server Fam17h chips */ return 250.0; @@ -3990,6 +3992,7 @@ void rapl_probe_amd(unsigned int family, unsigned int model)
switch (family) { case 0x17: /* Zen, Zen+ */ + case 0x18: /* Hygon Dhyana */ do_rapl = RAPL_AMD_F17H | RAPL_PER_CORE_ENERGY; if (rapl_joules) { BIC_PRESENT(BIC_Pkg_J); @@ -4026,7 +4029,7 @@ void rapl_probe(unsigned int family, unsigned int model) { if (genuine_intel) rapl_probe_intel(family, model); - if (authentic_amd) + if (authentic_amd || hygon_genuine) rapl_probe_amd(family, model); }
@@ -4582,6 +4585,8 @@ void process_cpuid() genuine_intel = 1; else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) authentic_amd = 1; + else if (ebx == 0x6f677948 && ecx == 0x656e6975 && edx == 0x6e65476e) + hygon_genuine = 1;
if (!quiet) fprintf(outf, "CPUID(0): %.4s%.4s%.4s ",
From: Jinke Fan fanjinke@hygon.cn
mainline inclusion from mainline-v5.5-rc1 commit 7ad295d5196a58c22abecef62dd4f99e2f86e831 category: bugfix bugzilla: NA CVE: NA ---------------------------
When using the following operations:
  date -s "21190910 19:20:00"
  hwclock -w
to change the date from 2019 to 2119 for testing, it fails on Hygon Dhyana and AMD Zen CPUs, while the same operations run OK on an Intel i7 platform.
The MC146818 driver uses mc146818_set_time() to set the RTC_FREQ_SELECT (RTC_REG_A) bit4-bit6 field, which on Intel platforms is the divider stage reset value, to 0x7.
On AMD/Hygon, however, RTC_REG_A (0Ah) bit4 is defined as DV0 [Reference]: DV0 = 0 selects Bank 0, DV0 = 1 selects Bank 1. Bit5-bit6 are reserved.
When DV0 is set to 1, Bank 1 is selected, which disables access to the AltCentury register (0x32). As UEFI passes acpi_gbl_FADT.century = 0x32 (AltCentury), the CMOS write fails in the code: CMOS_WRITE(century, acpi_gbl_FADT.century).
Correct the RTC_REG_A bank select bit (DV0) to 0 on AMD/Hygon CPUs; this enables writing the AltCentury (0x32) register and finally sets up the century as expected.
Test results on an Intel i7, AMD EPYC (17h) and a Hygon machine show that it works as expected. Compilation for the sparc64 and alpha architectures passes.
Reference: https://www.amd.com/system/files/TechDocs/51192_Bolton_FCH_RRG.pdf section: 3.13 Real Time Clock (RTC)
Reported-by: kbuild test robot lkp@intel.com Signed-off-by: Jinke Fan fanjinke@hygon.cn Link: https://lore.kernel.org/r/20191105083943.115320-1-fanjinke@hygon.cn Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/rtc/rtc-mc146818-lib.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-)
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 18a6f15e313d..24dad215540a 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -171,7 +171,20 @@ int mc146818_set_time(struct rtc_time *time) save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT); + +#ifdef CONFIG_X86 + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 == 0x17) || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + CMOS_WRITE((save_freq_select & (~RTC_DIV_RESET2)), + RTC_FREQ_SELECT); + save_freq_select &= ~RTC_DIV_RESET2; + } else + CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), + RTC_FREQ_SELECT); +#else + CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), RTC_FREQ_SELECT); +#endif
#ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
From: Jiasen Lin linjiasen@hygon.cn
mainline inclusion from mainline-v5.7-rc1 commit bb81bf62151031df004864eabee0431c8b8e9064 category: bugfix bugzilla: NA CVE: NA ---------------------------
The offset of the PCIe Capability Header for AMD and HYGON NTB is 0x64, but the macro named "AMD_LINK_STATUS_OFFSET" is defined as 0x68, which is the offset of the Device Capabilities register rather than the Link Control register.
This code triggers an error when getting the link status:
cat /sys/kernel/debug/ntb_hw_amd/0000:43:00.1/info
LNK STA -       0x8fa1
Link Status -   Up
Link Speed -    PCI-E Gen 0
Link Width -    x0
This patch uses pcie_capability_read_dword() to get the link status. After fixing this issue, we can get the link status accurately:
cat /sys/kernel/debug/ntb_hw_amd/0000:43:00.1/info
LNK STA -       0x11030042
Link Status -   Up
Link Speed -    PCI-E Gen 3
Link Width -    x16
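For reference, a small sketch (not part of the patch) of how the dword read at PCI_EXP_LNKCTL decodes into the values shown above: the Link Status register sits in the upper 16 bits of that dword, and the masks below mirror NTB_LNK_STA_SPEED_MASK/NTB_LNK_STA_WIDTH_MASK from ntb_hw_amd.h.

#include <stdint.h>
#include <stdio.h>

#define NTB_LNK_STA_SPEED_MASK	0x000F0000
#define NTB_LNK_STA_WIDTH_MASK	0x03F00000

int main(void)
{
	uint32_t lnk_sta = 0x11030042;	/* value from the debugfs dump above */

	/* Prints "PCI-E Gen 3, x16" for the fixed read shown above. */
	printf("PCI-E Gen %u, x%u\n",
	       (lnk_sta & NTB_LNK_STA_SPEED_MASK) >> 16,
	       (lnk_sta & NTB_LNK_STA_WIDTH_MASK) >> 20);
	return 0;
}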
Fixes: a1b3695820aa4 ("NTB: Add support for AMD PCI-Express Non-Transparent Bridge") Signed-off-by: Jiasen Lin linjiasen@hygon.cn Signed-off-by: Jon Mason jdmason@kudzu.us Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/ntb/hw/amd/ntb_hw_amd.c | 4 ++-- drivers/ntb/hw/amd/ntb_hw_amd.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-)
diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c index 33245b1e86d4..ffc5a963e574 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.c +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -855,8 +855,8 @@ static int amd_poll_link(struct amd_ntb_dev *ndev)
ndev->cntl_sta = reg;
- rc = pci_read_config_dword(ndev->ntb.pdev, - AMD_LINK_STATUS_OFFSET, &stat); + rc = pcie_capability_read_dword(ndev->ntb.pdev, + PCI_EXP_LNKCTL, &stat); if (rc) return 0; ndev->lnk_sta = stat; diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h index 8f3617a46292..81d634bb72c8 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.h +++ b/drivers/ntb/hw/amd/ntb_hw_amd.h @@ -54,7 +54,6 @@
#define PCI_DEVICE_ID_AMD_NTB 0x145B #define AMD_LINK_HB_TIMEOUT msecs_to_jiffies(1000) -#define AMD_LINK_STATUS_OFFSET 0x68 #define NTB_LIN_STA_ACTIVE_BIT 0x00000002 #define NTB_LNK_STA_SPEED_MASK 0x000F0000 #define NTB_LNK_STA_WIDTH_MASK 0x03F00000
From: Jiasen Lin linjiasen@hygon.cn
mainline inclusion from mainline-v5.7-rc1 commit 99a06056124dcf5cfc4c95278b86c6ff96aaa1ec category: bugfix bugzilla: NA CVE: NA ---------------------------
peer->outbuf is a virtual address obtained by ioremap(); it cannot be converted to a physical address by virt_to_page() and page_to_phys(). This conversion results in a DMA error, because the destination address produced by page_to_phys() is invalid.
This patch saves the MMIO address of the NTB BARx in perf_setup_peer_mw(), and maps the BAR space to a DMA address after we assign the DMA channel. It then fills the destination address of the DMA descriptor with this DMA address, to guarantee that the addresses of memory write requests fall into the memory window of the NTB BARx with the IOMMU both enabled and disabled.
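A minimal sketch of the mapping pattern described here (function names are illustrative, not the driver's): map the peer's BAR once with dma_map_resource(), then derive each chunk's DMA destination from its offset into the window.

#include <linux/dma-mapping.h>

/* Map the peer's MMIO window for device writes; returns 0 on failure. */
static dma_addr_t map_peer_window(struct device *dma_dev,
				  phys_addr_t bar_phys, size_t bar_size)
{
	dma_addr_t dst = dma_map_resource(dma_dev, bar_phys, bar_size,
					  DMA_FROM_DEVICE, 0);

	return dma_mapping_error(dma_dev, dst) ? 0 : dst;
}

/* DMA destination for a chunk at byte offset 'off' inside the window. */
static inline dma_addr_t chunk_dst(dma_addr_t window_dst, size_t off)
{
	return window_dst + off;
}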
Fixes: 5648e56d03fa ("NTB: ntb_perf: Add full multi-port NTB API support") Signed-off-by: Jiasen Lin linjiasen@hygon.cn Reviewed-by: Logan Gunthorpe logang@deltatee.com Signed-off-by: Jon Mason jdmason@kudzu.us Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/ntb/test/ntb_perf.c | 57 ++++++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 10 deletions(-)
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index ad5d3919435c..2fd736f0cff3 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -149,7 +149,8 @@ struct perf_peer { u64 outbuf_xlat; resource_size_t outbuf_size; void __iomem *outbuf; - + phys_addr_t out_phys_addr; + dma_addr_t dma_dst_addr; /* Inbound MW params */ dma_addr_t inbuf_xlat; resource_size_t inbuf_size; @@ -782,6 +783,10 @@ static int perf_copy_chunk(struct perf_thread *pthr, struct dmaengine_unmap_data *unmap; struct device *dma_dev; int try = 0, ret = 0; + struct perf_peer *peer = pthr->perf->test_peer; + void __iomem *vbase; + void __iomem *dst_vaddr; + dma_addr_t dst_dma_addr;
if (!use_dma) { memcpy_toio(dst, src, len); @@ -794,6 +799,10 @@ static int perf_copy_chunk(struct perf_thread *pthr, offset_in_page(dst), len)) return -EIO;
+ vbase = peer->outbuf; + dst_vaddr = dst; + dst_dma_addr = peer->dma_dst_addr + (dst_vaddr - vbase); + unmap = dmaengine_get_unmap_data(dma_dev, 2, GFP_NOWAIT); if (!unmap) return -ENOMEM; @@ -807,8 +816,7 @@ static int perf_copy_chunk(struct perf_thread *pthr, } unmap->to_cnt = 1;
- unmap->addr[1] = dma_map_page(dma_dev, virt_to_page(dst), - offset_in_page(dst), len, DMA_FROM_DEVICE); + unmap->addr[1] = dst_dma_addr; if (dma_mapping_error(dma_dev, unmap->addr[1])) { ret = -EIO; goto err_free_resource; @@ -865,6 +873,7 @@ static int perf_init_test(struct perf_thread *pthr) { struct perf_ctx *perf = pthr->perf; dma_cap_mask_t dma_mask; + struct perf_peer *peer = pthr->perf->test_peer;
pthr->src = kmalloc_node(perf->test_peer->outbuf_size, GFP_KERNEL, dev_to_node(&perf->ntb->dev)); @@ -882,15 +891,33 @@ static int perf_init_test(struct perf_thread *pthr) if (!pthr->dma_chan) { dev_err(&perf->ntb->dev, "%d: Failed to get DMA channel\n", pthr->tidx); - atomic_dec(&perf->tsync); - wake_up(&perf->twait); - kfree(pthr->src); - return -ENODEV; + goto err_free; } + peer->dma_dst_addr = + dma_map_resource(pthr->dma_chan->device->dev, + peer->out_phys_addr, peer->outbuf_size, + DMA_FROM_DEVICE, 0); + if (dma_mapping_error(pthr->dma_chan->device->dev, + peer->dma_dst_addr)) { + dev_err(pthr->dma_chan->device->dev, "%d: Failed to map DMA addr\n", + pthr->tidx); + peer->dma_dst_addr = 0; + dma_release_channel(pthr->dma_chan); + goto err_free; + } + dev_dbg(pthr->dma_chan->device->dev, "%d: Map MMIO %pa to DMA addr %pad\n", + pthr->tidx, + &peer->out_phys_addr, + &peer->dma_dst_addr);
atomic_set(&pthr->dma_sync, 0); - return 0; + +err_free: + atomic_dec(&perf->tsync); + wake_up(&perf->twait); + kfree(pthr->src); + return -ENODEV; }
static int perf_run_test(struct perf_thread *pthr) @@ -978,8 +1005,13 @@ static void perf_clear_test(struct perf_thread *pthr) * We call it anyway just to be sure of the transfers completion. */ (void)dmaengine_terminate_sync(pthr->dma_chan); - - dma_release_channel(pthr->dma_chan); + if (pthr->perf->test_peer->dma_dst_addr) + dma_unmap_resource(pthr->dma_chan->device->dev, + pthr->perf->test_peer->dma_dst_addr, + pthr->perf->test_peer->outbuf_size, + DMA_FROM_DEVICE, 0); + if (pthr->dma_chan) + dma_release_channel(pthr->dma_chan);
no_dma_notify: atomic_dec(&perf->tsync); @@ -1195,6 +1227,9 @@ static ssize_t perf_dbgfs_read_info(struct file *filep, char __user *ubuf, pos += scnprintf(buf + pos, buf_size - pos, "\tOut buffer addr 0x%pK\n", peer->outbuf);
+ pos += scnprintf(buf + pos, buf_size - pos, + "\tOut buff phys addr %pa[p]\n", &peer->out_phys_addr); + pos += scnprintf(buf + pos, buf_size - pos, "\tOut buffer size %pa\n", &peer->outbuf_size);
@@ -1389,6 +1424,8 @@ static int perf_setup_peer_mw(struct perf_peer *peer) if (!peer->outbuf) return -ENOMEM;
+ peer->out_phys_addr = phys_addr; + if (max_mw_size && peer->outbuf_size > max_mw_size) { peer->outbuf_size = max_mw_size; dev_warn(&peer->perf->ntb->dev,
From: Jiasen Lin linjiasen@hygon.cn
mainline inclusion from mainline-v5.8-rc1 commit a0348a4da1a7073fa14065f370f507073b857782 category: bugfix bugzilla: NA CVE: NA ---------------------------
As pthr->dma_chan can't be NULL in this context, there is no need to check pthr->dma_chan.
Fixes: 99a06056124d ("NTB: ntb_perf: Fix address err in perf_copy_chunk") Reported-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Jiasen Lin linjiasen@hygon.cn Signed-off-by: Jon Mason jdmason@kudzu.us Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/ntb/test/ntb_perf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 2fd736f0cff3..e335e34205f9 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -1010,8 +1010,8 @@ static void perf_clear_test(struct perf_thread *pthr) pthr->perf->test_peer->dma_dst_addr, pthr->perf->test_peer->outbuf_size, DMA_FROM_DEVICE, 0); - if (pthr->dma_chan) - dma_release_channel(pthr->dma_chan); + + dma_release_channel(pthr->dma_chan);
no_dma_notify: atomic_dec(&perf->tsync);
From: Alexandre Belloni alexandre.belloni@bootlin.com
mainline inclusion from mainline-v5.5-rc6 commit f01f4ffdfb703694035870f94b10f6ef2523f8de category: bugfix bugzilla: NA CVE: NA ---------------------------
There are multiple reports of this patch breaking RTC time setting for AMD platforms.
This reverts commit 7ad295d5196a58c22abecef62dd4f99e2f86e831.
Cc: Jinke Fan fanjinke@hygon.cn Link: https://lore.kernel.org/r/CABXGCsMLob0DC25JS8wwAYydnDoHBSoMh2_YLPfqm3TTvDE-Z... Fixes: 7ad295d5196a ("rtc: Fix the AltCentury value on AMD/Hygon platform") Link: https://lore.kernel.org/r/20200104043110.707810-1-alexandre.belloni@bootlin.... Signed-off-by: Alexandre Belloni alexandre.belloni@bootlin.com Signed-off-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/rtc/rtc-mc146818-lib.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-)
diff --git a/drivers/rtc/rtc-mc146818-lib.c b/drivers/rtc/rtc-mc146818-lib.c index 24dad215540a..18a6f15e313d 100644 --- a/drivers/rtc/rtc-mc146818-lib.c +++ b/drivers/rtc/rtc-mc146818-lib.c @@ -171,20 +171,7 @@ int mc146818_set_time(struct rtc_time *time) save_control = CMOS_READ(RTC_CONTROL); CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL); save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - -#ifdef CONFIG_X86 - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 == 0x17) || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - CMOS_WRITE((save_freq_select & (~RTC_DIV_RESET2)), - RTC_FREQ_SELECT); - save_freq_select &= ~RTC_DIV_RESET2; - } else - CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), - RTC_FREQ_SELECT); -#else - CMOS_WRITE((save_freq_select | RTC_DIV_RESET2), RTC_FREQ_SELECT); -#endif + CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
#ifdef CONFIG_MACH_DECSTATION CMOS_WRITE(real_yrs, RTC_DEC_YEAR);