hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I7ASVH
CVE: NA
---------------------------------------
The per-CPU logic of the iova rcache can cause a severe performance problem: each CPU gets its own rcache that only the local CPU ever uses, so free iovas sitting in one CPU's rcache are never shared with other CPUs. For example, a platform with 256 CPUs running 256 threads will allocate 417792 entries (6 * (256 * 2 + 32) * 128) to cache iovas, yet every thread running on a given CPU can only hit 1/256 of the total cache size. Limit the number of cpu rcaches to at most 64 to fix this problem; as a side effect, each iovad also saves about 2MB of memory.
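For reference, the arithmetic above can be checked with a minimal user-space sketch (not part of the patch; IOVA_MAG_SIZE = 128, MAX_GLOBAL_MAGS = 32 and 8-byte pfn entries are assumptions taken from the kernel tree this patch targets):

#include <stdio.h>

#define IOVA_RANGE_CACHE_MAX_SIZE 6   /* cached range-size bins */
#define MAX_GLOBAL_MAGS           32  /* depot magazines per bin */
#define IOVA_MAG_SIZE             128 /* pfns per magazine */

/* Total cached iova entries for a domain with ncpus CPUs:
 * two magazines (loaded + prev) per CPU per bin, plus the depot. */
static unsigned long rcache_entries(unsigned long ncpus)
{
	return IOVA_RANGE_CACHE_MAX_SIZE *
	       (ncpus * 2 + MAX_GLOBAL_MAGS) * IOVA_MAG_SIZE;
}

int main(void)
{
	/* 417792 entries with 256 CPUs, as quoted above */
	printf("%lu entries\n", rcache_entries(256));
	/* Memory saved by capping at 64 rcaches: 6 bins * 2 magazines *
	 * (256 - 64) CPUs * ~1KB per magazine ~= 2.3MB, i.e. "about 2MB" */
	printf("%lu bytes saved\n", 6UL * 2 * (256 - 64) *
	       (unsigned long)(sizeof(unsigned long) * (IOVA_MAG_SIZE + 1)));
	return 0;
}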
In our FIO test with 256 threads:
Jobs: 12 (f=2): [/(1),R(2),/(9)][16.7%][r=2KiB/s][r=0 IOPS][eta 00m:30s]
Jobs: 12 (f=12): [R(12)][20.0%][r=1091MiB/s][r=279k IOPS][eta 00m:28s]
Jobs: 12 (f=12): [R(12)][22.2%][r=1426MiB/s][r=365k IOPS][eta 00m:28s]
Jobs: 12 (f=12): [R(12)][25.0%][r=1607MiB/s][r=411k IOPS][eta 00m:27s]
Jobs: 12 (f=12): [R(12)][27.8%][r=1501MiB/s][r=384k IOPS][eta 00m:26s]
Jobs: 12 (f=12): [R(12)][30.6%][r=1486MiB/s][r=380k IOPS][eta 00m:25s]
Jobs: 12 (f=12): [R(12)][33.3%][r=1393MiB/s][r=357k IOPS][eta 00m:24s]
Jobs: 12 (f=12): [R(12)][36.1%][r=1550MiB/s][r=397k IOPS][eta 00m:23s]
Jobs: 12 (f=12): [R(12)][38.9%][r=1485MiB/s][r=380k IOPS][eta 00m:22s]
After this patch:
Jobs: 10 (f=10): [R(10)][98.4%][r=7414MiB/s][r=1898k IOPS][eta 00m:15s]
Jobs: 10 (f=10): [R(10)][98.5%][r=7495MiB/s][r=1919k IOPS][eta 00m:14s]
Jobs: 10 (f=10): [R(10)][98.6%][r=7497MiB/s][r=1919k IOPS][eta 00m:13s]
Jobs: 10 (f=10): [R(10)][98.7%][r=7497MiB/s][r=1919k IOPS][eta 00m:12s]
Jobs: 10 (f=10): [R(10)][98.8%][r=7471MiB/s][r=1913k IOPS][eta 00m:11s]
Jobs: 10 (f=10): [R(10)][98.9%][r=7483MiB/s][r=1916k IOPS][eta 00m:10s]
Jobs: 10 (f=10): [R(10)][99.0%][r=7491MiB/s][r=1918k IOPS][eta 00m:09s]
Jobs: 10 (f=10): [R(10)][99.1%][r=7436MiB/s][r=1904k IOPS][eta 00m:08s]
Jobs: 10 (f=10): [R(10)][99.2%][r=7462MiB/s][r=1910k IOPS][eta 00m:07s]
This increases IOPS by about 500%.
Signed-off-by: Zhang Zekun <zhangzekun11@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
---
v2:
- add hulk inclusion
 drivers/iommu/Kconfig |  10 ++++++++++
 drivers/iommu/iova.c  |  33 +++++++++++++++++++++++++++++++++
 include/linux/iova.h  |   5 +++++
 3 files changed, 48 insertions(+)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f04a2bde0018..c71eb20bc702 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -343,6 +343,16 @@ config ARM_SMMU_V3_PM
 	help
 	  Add support for suspend and resume support for arm smmu v3.
 
+config ARM_SMMU_V3_REDUCE_CPURCACHE
+	bool "Add support for cpu rcache"
+	depends on ARM_SMMU_V3
+	default n
+	help
+	  Add support for reducing the total amount of cpu rcache. When
+	  the number of CPUs is large, the iova rcache can have severe
+	  performance problems. Reduce the number of cpu rcaches to
+	  improve the cache hit rate.
+
 config S390_IOMMU
 	def_bool y if S390 && PCI
 	depends on S390 && PCI
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 1246e8f8bf08..ed86bb09db4f 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -881,11 +881,19 @@ static void init_iova_rcaches(struct iova_domain *iovad)
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
 		rcache->depot_size = 0;
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+		for (cpu = 0; cpu < MAX_CPU_SIZE; cpu++) {
+			rcache->cpu_rcaches[cpu] = kmalloc(sizeof(*cpu_rcache), GFP_KERNEL);
+			if (WARN_ON(!rcache->cpu_rcaches[cpu]))
+				continue;
+			cpu_rcache = rcache->cpu_rcaches[cpu];
+#else
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache), cache_line_size());
 		if (WARN_ON(!rcache->cpu_rcaches))
 			continue;
 		for_each_possible_cpu(cpu) {
 			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+#endif
 			spin_lock_init(&cpu_rcache->lock);
 			cpu_rcache->loaded = iova_magazine_alloc(GFP_KERNEL);
 			cpu_rcache->prev = iova_magazine_alloc(GFP_KERNEL);
@@ -907,8 +915,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 	struct iova_cpu_rcache *cpu_rcache;
 	bool can_insert = false;
 	unsigned long flags;
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+	int cpu;
 
+	cpu = raw_smp_processor_id();
+	cpu_rcache = rcache->cpu_rcaches[cpu % MAX_CPU_SIZE];
+#else
 	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+#endif
 	spin_lock_irqsave(&cpu_rcache->lock, flags);
 
 	if (!iova_magazine_full(cpu_rcache->loaded)) {
@@ -970,8 +984,14 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
 	unsigned long iova_pfn = 0;
 	bool has_pfn = false;
 	unsigned long flags;
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+	int cpu;
 
+	cpu = raw_smp_processor_id();
+	cpu_rcache = rcache->cpu_rcaches[cpu % MAX_CPU_SIZE];
+#else
 	cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+#endif
 	spin_lock_irqsave(&cpu_rcache->lock, flags);
 
 	if (!iova_magazine_empty(cpu_rcache->loaded)) {
@@ -1026,12 +1046,21 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 
 	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+		for (cpu = 0; cpu < MAX_CPU_SIZE; cpu++) {
+			cpu_rcache = rcache->cpu_rcaches[cpu];
+			iova_magazine_free(cpu_rcache->loaded);
+			iova_magazine_free(cpu_rcache->prev);
+			kfree(cpu_rcache);
+		}
+#else
 		for_each_possible_cpu(cpu) {
 			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
 			iova_magazine_free(cpu_rcache->loaded);
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
+#endif
 		for (j = 0; j < rcache->depot_size; ++j)
 			iova_magazine_free(rcache->depot[j]);
 	}
@@ -1049,7 +1078,11 @@ void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
 
 	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+		cpu_rcache = rcache->cpu_rcaches[cpu % MAX_CPU_SIZE];
+#else
 		cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
+#endif
 		spin_lock_irqsave(&cpu_rcache->lock, flags);
 		iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
 		iova_magazine_free_pfns(cpu_rcache->prev, iovad);
diff --git a/include/linux/iova.h b/include/linux/iova.h
index dfa51ae49666..06897fc970eb 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -27,12 +27,17 @@ struct iova_cpu_rcache;
 
 #define IOVA_RANGE_CACHE_MAX_SIZE 6	/* log of max cached IOVA range size (in pages) */
 #define MAX_GLOBAL_MAGS 32	/* magazines per bin */
+#define MAX_CPU_SIZE (NR_CPUS > 64 ? 64 : NR_CPUS)	/* max cpu rcaches */
 
 struct iova_rcache {
 	spinlock_t lock;
 	unsigned long depot_size;
 	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+#ifdef CONFIG_ARM_SMMU_V3_REDUCE_CPURCACHE
+	struct iova_cpu_rcache *cpu_rcaches[MAX_CPU_SIZE];
+#else
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
+#endif
 };
 
 struct iova_domain;
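Outside the diff, the core idea is small enough to sketch in user-space C (a minimal sketch; the names struct slot, NSLOTS and slot_for_cpu are hypothetical, not from the patch): every CPU above the cap shares a slot with CPU (n mod 64), so iovas freed by one CPU become visible to the other, while a per-slot lock keeps the shared slot safe, just as the per-rcache spinlock does in the patch.

#include <pthread.h>
#include <stdio.h>

#define NSLOTS 64	/* cap, analogous to MAX_CPU_SIZE */

/* Hypothetical bounded cache: one lock-protected slot per index,
 * shared by all CPUs that map onto it. */
struct slot {
	pthread_mutex_t lock;
	long cached;	/* stand-in for the iova magazines */
};

static struct slot slots[NSLOTS];

/* Map a CPU id to its slot, mirroring the patch's
 * rcache->cpu_rcaches[cpu % MAX_CPU_SIZE] lookup. */
static struct slot *slot_for_cpu(int cpu)
{
	return &slots[cpu % NSLOTS];
}

int main(void)
{
	for (int i = 0; i < NSLOTS; i++)
		pthread_mutex_init(&slots[i].lock, NULL);

	/* CPUs 0 and 64 share slot 0, so what one frees the other can reuse */
	printf("cpu 0  -> slot %ld\n", (long)(slot_for_cpu(0) - slots));
	printf("cpu 64 -> slot %ld\n", (long)(slot_for_cpu(64) - slots));
	return 0;
}

The trade-off is contention: two CPUs that map to the same slot can now contend on its lock, which the commit message argues is a good exchange for a much higher cache hit rate.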
On 2023/6/13 10:46, Zhang Zekun wrote:
> [...]
Too hacky for the IOMMU core code,

NACK.

Thanks,
Hanjun