Recently we met a problem when running an FIO test. On our Kunpeng server with 320 cores, about 80% of the CPUs reach 100% usage, and soft lockups appear in the kernel log, showing that the CPUs are stuck in __alloc_and_insert_iova_range(). Both the call trace and the high CPU usage imply that the iova_rcache performs poorly when allocating IOVAs.
A similar problem was addressed earlier this year; the solution then was simply to enlarge MAX_GLOBAL_MAGS to 128, but that value needs to grow with the number of CPU cores, and it is hard to pick an accurate value for a specific machine. So it is better to adopt the solution from the Linux community, which replaces the iova_rcache->depot array with a list. An iova_magazine is pushed onto the depot list when the local cpu_rcache is full, and schedule_delayed_work() is used to free surplus magazines after a 100ms delay. Background reclaim only trims the depot list while it holds more than num_online_cpus() magazines, so it will not shrink the list below that length.
The performance with this patch set is good. The patch set uses many timers (via delayed work) to manage the iova_magazines, and timer latency in the system may increase under heavy workload, but this should not cause problems, since timers inherently tolerate some inaccuracy in their delay.
We need to merge "iommu/iova: change IOVA_MAG_SIZE to 127 to save memory" first to resolve a compile error:
error: static assertion failed: "!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1))"
Feng Tang (1): iommu/iova: change IOVA_MAG_SIZE to 127 to save memory
Zhang Zekun (5): Revert "iommu/iova: move IOVA_MAX_GLOBAL_MAGS outside of IOMMU_SUPPORT" Revert "config: enable set the max iova mag size to 128" Revert "iommu/iova: increase the iova_rcache depot max size to 128" iommu/iova: Make the rcache depot scale better iommu/iova: Manage the depot list size
arch/arm64/configs/openeuler_defconfig | 1 - drivers/iommu/Kconfig | 10 ---- drivers/iommu/iova.c | 79 +++++++++++++++++++------- include/linux/iova.h | 7 ++- 4 files changed, 63 insertions(+), 34 deletions(-)
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
-------------------------------------------
This reverts commit b86a67670a8c3e5ce85237807f21490eba495b6b.
Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/iommu/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index b630e58c49b6..54d4a8cc3876 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -437,9 +437,6 @@ config SMMU_BYPASS_DEV
This feature will be replaced by ACPI IORT RMR node, which will be upstreamed in mainline. - -endif # IOMMU_SUPPORT - config IOVA_MAX_GLOBAL_MAGS int "Set the max iova global magzines in iova rcache" range 16 2048 @@ -449,3 +446,6 @@ config IOVA_MAX_GLOBAL_MAGS it can be a bottle neck when lots of cpus are contending to use it. If you are suffering from the speed of allocing iova with more than 128 cpus, try to tune this config larger. + + +endif # IOMMU_SUPPORT
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
---------------------------------------------
This reverts commit ae2040f5837dacf454ed427109adc9123a3235ae.
Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- arch/arm64/configs/openeuler_defconfig | 1 - 1 file changed, 1 deletion(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index edcb7911b3ce..36dda61d5a5a 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -5994,7 +5994,6 @@ CONFIG_ARM_SMMU_V3_PM=y # CONFIG_QCOM_IOMMU is not set # CONFIG_VIRTIO_IOMMU is not set CONFIG_SMMU_BYPASS_DEV=y -CONFIG_IOVA_MAX_GLOBAL_MAGS=128
# # Remoteproc drivers
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
---------------------------------------
This reverts commit 0227a749c263db30ef5d49fc009a5161ae70cfab.
Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/iommu/Kconfig | 10 ---------- include/linux/iova.h | 2 +- 2 files changed, 1 insertion(+), 11 deletions(-)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 54d4a8cc3876..f04a2bde0018 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -437,15 +437,5 @@ config SMMU_BYPASS_DEV
This feature will be replaced by ACPI IORT RMR node, which will be upstreamed in mainline. -config IOVA_MAX_GLOBAL_MAGS - int "Set the max iova global magzines in iova rcache" - range 16 2048 - default "32" - help - Iova rcache global magizine is shared among every cpu. The size of - it can be a bottle neck when lots of cpus are contending to use it. - If you are suffering from the speed of allocing iova with more than - 128 cpus, try to tune this config larger. -
endif # IOMMU_SUPPORT diff --git a/include/linux/iova.h b/include/linux/iova.h index 3cb469b366d7..dfa51ae49666 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -26,7 +26,7 @@ struct iova_magazine; struct iova_cpu_rcache;
#define IOVA_RANGE_CACHE_MAX_SIZE 6 /* log of max cached IOVA range size (in pages) */ -#define MAX_GLOBAL_MAGS CONFIG_IOVA_MAX_GLOBAL_MAGS /* magazines per bin */ +#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
struct iova_rcache { spinlock_t lock;
From: Feng Tang feng.tang@intel.com
mainline inclusion from mainline-v5.19-rc5 commit b4c9bf178ace26c52c9cac36f265ba95132a221d bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------
kmalloc will round up the request size to power of 2, and current iova_magazine's size is 1032 (1024+8) bytes, so each instance allocated will get 2048 bytes from kmalloc, causing around 1KB waste.
Change IOVA_MAG_SIZE from 128 to 127 to make size of 'iova_magazine' 1024 bytes so that no memory will be wasted.
Signed-off-by: Feng Tang feng.tang@intel.com Acked-by: Robin Murphy robin.murphy@arm.com Link: https://lore.kernel.org/r/20220703114450.15184-1-feng.tang@intel.com Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/iommu/iova.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 203331b54970..2caf2aec9f56 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -792,7 +792,12 @@ split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, * dynamic size tuning described in the paper. */
-#define IOVA_MAG_SIZE 128 +/* + * As kmalloc's buffer size is fixed to power of 2, 127 is chosen to + * assure size of 'iova_magazine' to be 1024 bytes, so that no memory + * will be wasted. + */ +#define IOVA_MAG_SIZE 127
struct iova_magazine { unsigned long size;
mainline inclusion from mainline-v6.7-rc1 commit 911aa1245da83ff5e76d33bb612d8b5a3f2ec4a5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------
The algorithm in the original paper specifies the storage of full magazines in the depot as an unbounded list rather than a fixed-size array. It turns out to be pretty straightforward to do this in our implementation with no significant loss of efficiency. This allows the depot to scale up to the working set sizes of larger systems, while also potentially saving some memory on smaller ones too.
Since this involves touching struct iova_magazine with the requisite care, we may as well reinforce the comment with a proper assertion too.
Reviewed-by: John Garry john.g.garry@oracle.com Reviewed-by: Jerry Snitselaar jsnitsel@redhat.com Signed-off-by: Robin Murphy robin.murphy@arm.com Link: https://lore.kernel.org/r/f597aa72fc3e1d315bc4574af0ce0ebe5c31cd22.169453558... Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/iommu/iova.c | 45 +++++++++++++++++++++++++------------------- include/linux/iova.h | 4 +--- 2 files changed, 27 insertions(+), 22 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 2caf2aec9f56..68be2756903d 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -800,9 +800,13 @@ split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, #define IOVA_MAG_SIZE 127
struct iova_magazine { - unsigned long size; + union { + unsigned long size; + struct iova_magazine *next; + }; unsigned long pfns[IOVA_MAG_SIZE]; }; +static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1)));
struct iova_cpu_rcache { spinlock_t lock; @@ -882,6 +886,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn) mag->pfns[mag->size++] = pfn; }
+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache) +{ + struct iova_magazine *mag = rcache->depot; + + rcache->depot = mag->next; + mag->size = IOVA_MAG_SIZE; + return mag; +} + +static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag) +{ + mag->next = rcache->depot; + rcache->depot = mag; +} + static void init_iova_rcaches(struct iova_domain *iovad) { struct iova_cpu_rcache *cpu_rcache; @@ -892,7 +911,6 @@ static void init_iova_rcaches(struct iova_domain *iovad) for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; spin_lock_init(&rcache->lock); - rcache->depot_size = 0; rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size()); if (WARN_ON(!rcache->cpu_rcaches)) continue; @@ -915,7 +933,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, struct iova_rcache *rcache, unsigned long iova_pfn) { - struct iova_magazine *mag_to_free = NULL; struct iova_cpu_rcache *cpu_rcache; bool can_insert = false; unsigned long flags; @@ -933,12 +950,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
if (new_mag) { spin_lock(&rcache->lock); - if (rcache->depot_size < MAX_GLOBAL_MAGS) { - rcache->depot[rcache->depot_size++] = - cpu_rcache->loaded; - } else { - mag_to_free = cpu_rcache->loaded; - } + iova_depot_push(rcache, cpu_rcache->loaded); spin_unlock(&rcache->lock);
cpu_rcache->loaded = new_mag; @@ -951,11 +963,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
spin_unlock_irqrestore(&cpu_rcache->lock, flags);
- if (mag_to_free) { - iova_magazine_free_pfns(mag_to_free, iovad); - iova_magazine_free(mag_to_free); - } - return can_insert; }
@@ -993,9 +1000,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache, has_pfn = true; } else { spin_lock(&rcache->lock); - if (rcache->depot_size > 0) { + if (rcache->depot) { iova_magazine_free(cpu_rcache->loaded); - cpu_rcache->loaded = rcache->depot[--rcache->depot_size]; + cpu_rcache->loaded = iova_depot_pop(rcache); has_pfn = true; } spin_unlock(&rcache->lock); @@ -1034,7 +1041,7 @@ static void free_iova_rcaches(struct iova_domain *iovad) struct iova_rcache *rcache; struct iova_cpu_rcache *cpu_rcache; unsigned int cpu; - int i, j; + int i;
for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; @@ -1044,8 +1051,8 @@ static void free_iova_rcaches(struct iova_domain *iovad) iova_magazine_free(cpu_rcache->prev); } free_percpu(rcache->cpu_rcaches); - for (j = 0; j < rcache->depot_size; ++j) - iova_magazine_free(rcache->depot[j]); + while (rcache->depot) + iova_magazine_free(iova_depot_pop(rcache)); } }
diff --git a/include/linux/iova.h b/include/linux/iova.h index dfa51ae49666..f1622ba17a27 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -26,12 +26,10 @@ struct iova_magazine; struct iova_cpu_rcache;
#define IOVA_RANGE_CACHE_MAX_SIZE 6 /* log of max cached IOVA range size (in pages) */ -#define MAX_GLOBAL_MAGS 32 /* magazines per bin */
struct iova_rcache { spinlock_t lock; - unsigned long depot_size; - struct iova_magazine *depot[MAX_GLOBAL_MAGS]; + struct iova_magazine *depot; struct iova_cpu_rcache __percpu *cpu_rcaches; };
mainline inclusion from mainline-v6.7-rc1 commit 233045378dbbc0a7346127d19a54d4f91e0bd855 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I8KS9A CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
Automatically scaling the depot up to suit the peak capacity of a workload is all well and good, but it would be nice to have a way to scale it back down again if the workload changes. To that end, add background reclaim that will gradually free surplus magazines if the depot size remains above a reasonable threshold for long enough.
Reviewed-by: Jerry Snitselaar jsnitsel@redhat.com Signed-off-by: Robin Murphy robin.murphy@arm.com Link: https://lore.kernel.org/r/03170665c56d89c6ce6081246b47f68d4e483308.169453558... Signed-off-by: Joerg Roedel jroedel@suse.de Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- drivers/iommu/iova.c | 27 +++++++++++++++++++++++++++ include/linux/iova.h | 3 +++ 2 files changed, 30 insertions(+)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 68be2756903d..d5429249ac84 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -11,6 +11,7 @@ #include <linux/smp.h> #include <linux/bitops.h> #include <linux/cpu.h> +#include <linux/workqueue.h>
/* The anchor node sits above the top of the usable address space */ #define IOVA_ANCHOR ~0UL @@ -799,6 +800,8 @@ split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, */ #define IOVA_MAG_SIZE 127
+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100) + struct iova_magazine { union { unsigned long size; @@ -892,6 +895,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
rcache->depot = mag->next; mag->size = IOVA_MAG_SIZE; + rcache->depot_size--; return mag; }
@@ -899,6 +903,25 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma { mag->next = rcache->depot; rcache->depot = mag; + rcache->depot_size++; +} + +static void iova_depot_work_func(struct work_struct *work) +{ + struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work); + struct iova_magazine *mag = NULL; + unsigned long flags; + + spin_lock_irqsave(&rcache->lock, flags); + if (rcache->depot_size > num_online_cpus()) + mag = iova_depot_pop(rcache); + spin_unlock_irqrestore(&rcache->lock, flags); + + if (mag) { + iova_magazine_free_pfns(mag, rcache->iovad); + iova_magazine_free(mag); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY); + } }
static void init_iova_rcaches(struct iova_domain *iovad) @@ -911,6 +934,8 @@ static void init_iova_rcaches(struct iova_domain *iovad) for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) { rcache = &iovad->rcaches[i]; spin_lock_init(&rcache->lock); + rcache->iovad = iovad; + INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func); rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size()); if (WARN_ON(!rcache->cpu_rcaches)) continue; @@ -952,6 +977,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad, spin_lock(&rcache->lock); iova_depot_push(rcache, cpu_rcache->loaded); spin_unlock(&rcache->lock); + schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
cpu_rcache->loaded = new_mag; can_insert = true; @@ -1051,6 +1077,7 @@ static void free_iova_rcaches(struct iova_domain *iovad) iova_magazine_free(cpu_rcache->prev); } free_percpu(rcache->cpu_rcaches); + cancel_delayed_work_sync(&rcache->work); while (rcache->depot) iova_magazine_free(iova_depot_pop(rcache)); } diff --git a/include/linux/iova.h b/include/linux/iova.h index f1622ba17a27..25c447124638 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -29,8 +29,11 @@ struct iova_cpu_rcache;
struct iova_rcache { spinlock_t lock; + unsigned int depot_size; struct iova_magazine *depot; struct iova_cpu_rcache __percpu *cpu_rcaches; + struct iova_domain *iovad; + struct delayed_work work; };
struct iova_domain;
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3266 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/3266 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/U...