From: Mike Kravetz <mike.kravetz@oracle.com>
stable inclusion
from stable-v6.0.13
commit bb8f66f6afbbc822f61d8bfc01ecefe2a437256c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GVYW
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 04ada095dcfc4ae359418053c0be94453bdf1e84 upstream.
madvise(MADV_DONTNEED) ends up calling zap_page_range() to clear page tables associated with the address range. For hugetlb vmas, zap_page_range will call __unmap_hugepage_range_final. However, __unmap_hugepage_range_final assumes the passed vma is about to be removed and deletes the vma_lock to prevent pmd sharing as the vma is on the way out. In the case of madvise(MADV_DONTNEED) the vma remains, but the missing vma_lock prevents pmd sharing and could potentially lead to issues with truncation/fault races.
This issue was originally reported here [1] as a BUG triggered in page_try_dup_anon_rmap. Prior to the introduction of the hugetlb vma_lock, __unmap_hugepage_range_final cleared the VM_MAYSHARE flag to prevent pmd sharing. Subsequent faults on this vma were then confused: VM_MAYSHARE indicates a sharable vma, but it was no longer set, so page_mapping was not set in new pages added to the page table. This resulted in pages that appeared anonymous in a VM_SHARED vma and triggered the BUG.
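For illustration only, the reported sequence corresponds roughly to the following userspace sketch (a minimal approximation, not the original syzkaller reproducer; it assumes a 2MB default hugepage size and hugetlb pages preallocated via /proc/sys/vm/nr_hugepages):

	#define _GNU_SOURCE
	#include <sys/mman.h>
	#include <unistd.h>

	#define LEN (2UL * 1024 * 1024)		/* one default-size hugepage */

	int main(void)
	{
		/* Shared hugetlb mapping, i.e. a VM_MAYSHARE hugetlb vma. */
		char *p = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

		if (p == MAP_FAILED)
			return 1;

		p[0] = 1;			/* populate the page table */
		madvise(p, LEN, MADV_DONTNEED);	/* zap; the vma itself remains */
		p[0] = 2;			/* refault into the surviving vma */

		/*
		 * fork() copies hugetlb page tables via copy_hugetlb_page_range();
		 * on affected kernels the refaulted page could appear anonymous
		 * in this VM_SHARED vma and trigger the BUG described above.
		 */
		if (fork() == 0)
			_exit(0);

		munmap(p, LEN);
		return 0;
	}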
Address the issue by adding a new zap flag, ZAP_FLAG_UNMAP, to indicate an unmap call from unmap_vmas(). This is used to indicate the 'final' unmapping of a hugetlb vma. When called via MADV_DONTNEED, this flag is not set and the vma_lock is not deleted.
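As context for the new type used below: zap_flags_t is a sparse __bitwise typedef, which is why ZAP_FLAG_UNMAP is defined with a __force cast. The pattern can be approximated in userspace as follows (an illustrative sketch only; in the kernel the __bitwise/__force annotations come from <linux/types.h>):

	#include <stdio.h>

	/*
	 * Under sparse (__CHECKER__), __bitwise makes zap_flags_t a distinct
	 * type so mixing it with plain integers is flagged, and __force marks
	 * the one deliberate cast when defining a flag value. Outside sparse,
	 * both annotations compile away.
	 */
	#ifdef __CHECKER__
	#define __bitwise	__attribute__((bitwise))
	#define __force		__attribute__((force))
	#else
	#define __bitwise
	#define __force
	#endif

	#define BIT(nr)		(1U << (nr))

	typedef unsigned int __bitwise zap_flags_t;

	#define ZAP_FLAG_UNMAP	((__force zap_flags_t) BIT(1))

	int main(void)
	{
		zap_flags_t zap_flags = ZAP_FLAG_UNMAP;

		if (zap_flags & ZAP_FLAG_UNMAP)	/* callers test bits as usual */
			printf("final unmap\n");
		return 0;
	}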
NOTE - Prior to the introduction of the hugetlb vma_lock in v6.1, this issue is addressed by not clearing the VM_MAYSHARE flag when __unmap_hugepage_range_final is called in the MADV_DONTNEED case.
[1] https://lore.kernel.org/lkml/CAO4mrfdLMXsao9RF4fUE8-Wfde8xmjsKrTNMNC9wjUb6Ju...
Link: https://lkml.kernel.org/r/20221114235507.294320-3-mike.kravetz@oracle.com
Fixes: 90e7e7f5ef3f ("mm: enable MADV_DONTNEED for hugetlb mappings")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Wei Chen <harperchen1110@gmail.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mina Almasry <almasrymina@google.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Naoya Horiguchi <naoya.horiguchi@linux.dev>
Cc: Peter Xu <peterx@redhat.com>
Cc: Rik van Riel <riel@surriel.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Conflicts:
	include/linux/mm.h
	mm/memory.c
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 include/linux/hugetlb.h  |  5 +++--
 include/linux/mm.h       |  4 ++++
 include/linux/mm_types.h |  2 ++
 mm/hugetlb.c             | 28 ++++++++++++++++------------
 mm/memory.c              |  9 +++++++--
 5 files changed, 32 insertions(+), 16 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ce7c1f9d7961..b25c809af612 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -142,7 +142,7 @@ void unmap_hugepage_range(struct vm_area_struct *,
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
-			  struct page *ref_page);
+			  struct page *ref_page, zap_flags_t zap_flags);
 void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			  unsigned long start, unsigned long end,
 			  struct page *ref_page);
@@ -371,7 +371,8 @@ static inline unsigned long hugetlb_change_protection(
 
 static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			struct vm_area_struct *vma, unsigned long start,
-			unsigned long end, struct page *ref_page)
+			unsigned long end, struct page *ref_page,
+			zap_flags_t zap_flags)
 {
 	BUG();
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c43f59ac87a9..e0d269b83c8f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1706,8 +1706,12 @@ struct zap_details {
 	pgoff_t first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
 	struct page *single_page;		/* Locked page to be unmapped */
+	zap_flags_t zap_flags;			/* Extra flags for zapping */
 };
 
+/* Set in unmap_vmas() to indicate a final unmap call. Only used by hugetlb */
+#define ZAP_FLAG_UNMAP	((__force zap_flags_t) BIT(1))
+
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 			     pte_t pte);
 struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 940b19d4a531..d1c5946ad402 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -898,4 +898,6 @@ typedef struct {
 	unsigned long val;
 } swp_entry_t;
 
+typedef unsigned int __bitwise zap_flags_t;
+
 #endif /* _LINUX_MM_TYPES_H */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4a6f683cd7e4..a333caf66d15 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4512,21 +4512,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 void __unmap_hugepage_range_final(struct mmu_gather *tlb,
 			  struct vm_area_struct *vma, unsigned long start,
-			  unsigned long end, struct page *ref_page)
+			  unsigned long end, struct page *ref_page,
+			  zap_flags_t zap_flags)
 {
 	__unmap_hugepage_range(tlb, vma, start, end, ref_page);
 
-	/*
-	 * Clear this flag so that x86's huge_pmd_share page_table_shareable
-	 * test will fail on a vma being torn down, and not grab a page table
-	 * on its way out. We're lucky that the flag has such an appropriate
-	 * name, and can in fact be safely cleared here. We could clear it
-	 * before the __unmap_hugepage_range above, but all that's necessary
-	 * is to clear it before releasing the i_mmap_rwsem. This works
-	 * because in the context this is called, the VMA is about to be
-	 * destroyed and the i_mmap_rwsem is held.
-	 */
-	vma->vm_flags &= ~VM_MAYSHARE;
+	if (zap_flags & ZAP_FLAG_UNMAP) {	/* final unmap */
+		/*
+		 * Clear this flag so that x86's huge_pmd_share
+		 * page_table_shareable test will fail on a vma being torn
+		 * down, and not grab a page table on its way out. We're lucky
+		 * that the flag has such an appropriate name, and can in fact
+		 * be safely cleared here. We could clear it before the
+		 * __unmap_hugepage_range above, but all that's necessary
+		 * is to clear it before releasing the i_mmap_rwsem. This works
+		 * because in the context this is called, the VMA is about to
+		 * be destroyed and the i_mmap_rwsem is held.
+		 */
+		vma->vm_flags &= ~VM_MAYSHARE;
+	}
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
diff --git a/mm/memory.c b/mm/memory.c
index bc3f6408236b..494f40362174 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1518,8 +1518,10 @@ static void unmap_single_vma(struct mmu_gather *tlb,
 				 * safe to do nothing in this case.
 				 */
 				if (vma->vm_file) {
+					zap_flags_t zap_flags = details ?
+							details->zap_flags : 0;
 					i_mmap_lock_write(vma->vm_file->f_mapping);
-					__unmap_hugepage_range_final(tlb, vma, start, end, NULL);
+					__unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags);
 					i_mmap_unlock_write(vma->vm_file->f_mapping);
 				}
 			} else
@@ -1550,12 +1552,15 @@ void unmap_vmas(struct mmu_gather *tlb,
 		unsigned long end_addr)
 {
 	struct mmu_notifier_range range;
+	struct zap_details details = {
+		.zap_flags = ZAP_FLAG_UNMAP,
+	};
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
 				start_addr, end_addr);
 	mmu_notifier_invalidate_range_start(&range);
 	for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
-		unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
+		unmap_single_vma(tlb, vma, start_addr, end_addr, &details);
 	mmu_notifier_invalidate_range_end(&range);
 }