From: Mateusz Guzik <mjguzik@gmail.com>
mainline inclusion
from mainline-v6.11-rc1
commit 3577dbb192419e37b6f54aced8777b6c81cd03d4
category: performance
bugzilla: https://gitee.com/src-openeuler/kernel/issues/IB1S01
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Execs of dynamically linked binaries at 20-ish cores are bottlenecked on the i_mmap_rwsem semaphore, while the biggest singular contributor is free_pgd_range inducing the lock acquire back-to-back for all consecutive mappings of a given file.
Tracing the count of said acquires while building the kernel shows:

[1, 2)     799579 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@|
[2, 3)          0 |                                                    |
[3, 4)       3009 |                                                    |
[4, 5)       3009 |                                                    |
[5, 6)     326442 |@@@@@@@@@@@@@@@@@@@@@                               |
So in particular there were 326442 opportunities to coalesce 5 acquires into 1.
Doing so increases execs per second by 4% (~50k to ~52k) when running the benchmark linked below.
The lock remains the main bottleneck; I have not looked at other spots yet.
Bench can be found here: http://apollo.backplane.com/DFlyMisc/doexec.c
$ cc -O2 -o shared-doexec doexec.c
$ ./shared-doexec $(nproc)
Note this particular test makes sure binaries are separate, but the loader is shared.
Stats collected on the patched kernel (+ "noinline") with:
bpftrace -e 'kprobe:unlink_file_vma_batch_process
{ @ = lhist(((struct unlink_vma_file_batch *)arg0)->count, 0, 8, 1); }'
Link: https://lkml.kernel.org/r/20240521234321.359501-1-mjguzik@gmail.com
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Conflicts:
	mm/internal.h
[Context conflict]
Signed-off-by: Jinjie Ruan <ruanjinjie@huawei.com>
---
 mm/internal.h | 10 ++++++++++
 mm/memory.c   | 10 ++++++++--
 mm/mmap.c     | 41 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 37c17f921dae..0478e5dab55b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1446,4 +1446,14 @@ void __meminit __init_single_page(struct page *page, unsigned long pfn,
 #ifdef CONFIG_PAGE_CACHE_LIMIT
 unsigned long shrink_memory(unsigned long nr_to_reclaim, bool may_swap);
 #endif /* CONFIG_PAGE_CACHE_LIMIT */
+
+struct unlink_vma_file_batch {
+	int count;
+	struct vm_area_struct *vmas[8];
+};
+
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index e248b8338417..a4f7066d1e68 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -368,6 +368,8 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		   struct vm_area_struct *vma, unsigned long floor,
 		   unsigned long ceiling, bool mm_wr_locked)
 {
+	struct unlink_vma_file_batch vb;
+
 	do {
 		unsigned long addr = vma->vm_start;
 		struct vm_area_struct *next;
@@ -387,12 +389,15 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 		if (mm_wr_locked)
 			vma_start_write(vma);
 		unlink_anon_vmas(vma);
-		unlink_file_vma(vma);
 
 		if (is_vm_hugetlb_page(vma)) {
+			unlink_file_vma(vma);
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next ? next->vm_start : ceiling);
 		} else {
+			unlink_file_vma_batch_init(&vb);
+			unlink_file_vma_batch_add(&vb, vma);
+
 			/*
 			 * Optimization: gather nearby vmas into one call down
 			 */
@@ -405,8 +410,9 @@ void free_pgtables(struct mmu_gather *tlb, struct ma_state *mas,
 			if (mm_wr_locked)
 				vma_start_write(vma);
 			unlink_anon_vmas(vma);
-			unlink_file_vma(vma);
+			unlink_file_vma_batch_add(&vb, vma);
 		}
+		unlink_file_vma_batch_final(&vb);
 		free_pgd_range(tlb, addr, vma->vm_end,
 			floor, next ? next->vm_start : ceiling);
 	}
diff --git a/mm/mmap.c b/mm/mmap.c
index 07ffb6c37b96..c898103c6d72 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -132,6 +132,47 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
+{
+	vb->count = 0;
+}
+
+static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
+{
+	struct address_space *mapping;
+	int i;
+
+	mapping = vb->vmas[0]->vm_file->f_mapping;
+	i_mmap_lock_write(mapping);
+	for (i = 0; i < vb->count; i++) {
+		VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
+		__remove_shared_vm_struct(vb->vmas[i], mapping);
+	}
+	i_mmap_unlock_write(mapping);
+
+	unlink_file_vma_batch_init(vb);
+}
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+			       struct vm_area_struct *vma)
+{
+	if (vma->vm_file == NULL)
+		return;
+
+	if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
+	    vb->count == ARRAY_SIZE(vb->vmas))
+		unlink_file_vma_batch_process(vb);
+
+	vb->vmas[vb->count] = vma;
+	vb->count++;
+}
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
+{
+	if (vb->count > 0)
+		unlink_file_vma_batch_process(vb);
+}
+
 /*
  * Close a vm structure and free it.
  */
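[Editorial note] The intended calling pattern of the new helpers is init, then
add for each VMA in address order, then final; unlink_file_vma_batch_process()
flushes the batch whenever the backing file changes or the 8-entry array fills,
so i_mmap_rwsem of a given mapping is taken once per batch instead of once per
VMA. Below is a minimal, self-contained userspace sketch of that batching
pattern, not kernel code: the vma/mapping structs, the printf "lock"/"unlink"
stand-ins and the demo mappings (ld.so, a.out) are illustrative assumptions.

/*
 * Userspace sketch of the batching pattern introduced above.
 * Consecutive "vmas" backed by the same "mapping" are unlinked under a
 * single lock acquisition of that mapping.
 */
#include <stdio.h>

struct mapping { const char *name; };
struct vma { struct mapping *mapping; int id; };

struct unlink_batch {
	int count;
	struct vma *vmas[8];
};

static void batch_init(struct unlink_batch *vb)
{
	vb->count = 0;
}

static void batch_process(struct unlink_batch *vb)
{
	struct mapping *mapping = vb->vmas[0]->mapping;

	printf("lock   %s\n", mapping->name);	/* one lock acquire...      */
	for (int i = 0; i < vb->count; i++)	/* ...covers the whole batch */
		printf("  unlink vma %d\n", vb->vmas[i]->id);
	printf("unlock %s\n", mapping->name);

	batch_init(vb);
}

static void batch_add(struct unlink_batch *vb, struct vma *vma)
{
	/* Flush when the backing mapping changes or the batch is full. */
	if ((vb->count > 0 && vb->vmas[0]->mapping != vma->mapping) ||
	    vb->count == (int)(sizeof(vb->vmas) / sizeof(vb->vmas[0])))
		batch_process(vb);

	vb->vmas[vb->count++] = vma;
}

static void batch_final(struct unlink_batch *vb)
{
	if (vb->count > 0)
		batch_process(vb);
}

int main(void)
{
	struct mapping ld = { "ld.so" }, bin = { "a.out" };
	struct vma vmas[] = {
		{ &bin, 0 }, { &ld, 1 }, { &ld, 2 }, { &ld, 3 }, { &bin, 4 },
	};
	struct unlink_batch vb;

	batch_init(&vb);
	for (unsigned i = 0; i < sizeof(vmas) / sizeof(vmas[0]); i++)
		batch_add(&vb, &vmas[i]);
	batch_final(&vb);	/* 3 lock acquisitions instead of 5 */
	return 0;
}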