From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-v5.18-rc1 commit d8d55f5616cf3b900a23a72dd24e7b07211e7859 category: feature bugzilla: 187198, https://gitee.com/openeuler/kernel/issues/I5GVFO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The init_mm.page_table_lock is used to protect kernel page tables, we can use it to serialize splitting vmemmap PMD mappings instead of mmap write lock, which can increase the concurrency of vmemmap_remap_free().
Actually, It increase the concurrency between allocations of HugeTLB pages. But it is not the only benefit. There are a lot of users of mmap read lock of init_mm. The mmap write lock is holding through vmemmap_remap_free(), removing mmap write lock usage to make it does not affect other users of mmap read lock. It is not making anything worse and always a win to move.
Now the kernel page table walker does not hold the page_table_lock when walking pmd entries. There may be consistency issue of a pmd entry, because pmd entry might change from a huge pmd entry to a PTE page table. There is only one user of kernel page table walker, namely ptdump. The ptdump already considers the consistency, which use a local variable to cache the value of pmd entry. But we also need to update ->action to ACTION_CONTINUE to make sure the walker does not walk every pte entry again when concurrent thread has split the huge pmd.
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Cc: Barry Song song.bao.hua@hisilicon.com Cc: Bodeddula Balasubramaniam bodeddub@amazon.com Cc: Chen Huang chenhuang5@huawei.com Cc: David Hildenbrand david@redhat.com Cc: Fam Zheng fam.zheng@bytedance.com Cc: Jonathan Corbet corbet@lwn.net Cc: Matthew Wilcox willy@infradead.org Cc: Michal Hocko mhocko@suse.com Cc: Mike Kravetz mike.kravetz@oracle.com Cc: Oscar Salvador osalvador@suse.de Cc: Qi Zheng zhengqi.arch@bytedance.com Cc: Xiongchun Duan duanxiongchun@bytedance.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: mm/sparse-vmemmap.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/ptdump.c | 16 +++++++++++---- mm/sparse-vmemmap.c | 47 ++++++++++++++++++++++++++++++--------------- 2 files changed, 43 insertions(+), 20 deletions(-)
diff --git a/mm/ptdump.c b/mm/ptdump.c index 93f2f63dc52d..43661863096b 100644 --- a/mm/ptdump.c +++ b/mm/ptdump.c @@ -39,8 +39,10 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val)) + if (pgd_leaf(val)) { st->note_page(st, addr, 0, pgd_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -59,8 +61,10 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val)) + if (p4d_leaf(val)) { st->note_page(st, addr, 1, p4d_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -79,8 +83,10 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr, if (st->effective_prot) st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val)) + if (pud_leaf(val)) { st->note_page(st, addr, 2, pud_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } @@ -98,8 +104,10 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
if (st->effective_prot) st->effective_prot(st, 3, pmd_val(val)); - if (pmd_leaf(val)) + if (pmd_leaf(val)) { st->note_page(st, addr, 3, pmd_val(val)); + walk->action = ACTION_CONTINUE; + }
return 0; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 39bdbf0b28dd..eef9053f948f 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,8 +53,7 @@ struct vmemmap_remap_walk { struct list_head *vmemmap_pages; };
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, - struct vmemmap_remap_walk *walk) +static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) { pmd_t __pmd; int i; @@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start, set_pte_at(&init_mm, addr, pte, entry); }
- /* Make pte visible before pmd. See comment in __pte_alloc(). */ - smp_wmb(); - pmd_populate_kernel(&init_mm, pmd, pgtable); - - flush_tlb_kernel_range(start, start + PMD_SIZE); + spin_lock(&init_mm.page_table_lock); + if (likely(pmd_leaf(*pmd))) { + /* Make pte visible before pmd. See comment in __pte_alloc(). */ + smp_wmb(); + pmd_populate_kernel(&init_mm, pmd, pgtable); + flush_tlb_kernel_range(start, start + PMD_SIZE); + } else { + pte_free_kernel(&init_mm, pgtable); + } + spin_unlock(&init_mm.page_table_lock);
return 0; }
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start) +{ + int leaf; + + spin_lock(&init_mm.page_table_lock); + leaf = pmd_leaf(*pmd); + spin_unlock(&init_mm.page_table_lock); + + if (!leaf) + return 0; + + return __split_vmemmap_huge_pmd(pmd, start); +} + static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct vmemmap_remap_walk *walk) @@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud, unsigned long addr,
pmd = pmd_offset(pud, addr); do { - if (pmd_leaf(*pmd)) { - int ret; + int ret; + + ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK); + if (ret) + return ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk); - if (ret) - return ret; - } next = pmd_addr_end(addr, end); vmemmap_pte_range(pmd, addr, next, walk); } while (pmd++, addr = next, addr != end); @@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long start, unsigned long end, */ BUG_ON(start - reuse != PAGE_SIZE);
- mmap_write_lock(&init_mm); + mmap_read_lock(&init_mm); ret = vmemmap_remap_range(reuse, end, &walk); - mmap_write_downgrade(&init_mm); - if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /*