From: Hugh Dickins <hughd@google.com>
mainline inclusion
from mainline-v6.7-rc1
commit 72e315f7a750281b4410ac30d8930f735459e72d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OHHN
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
mbind(2) holds down_write of current task's mmap_lock throughout (exclusive because it needs to set the new mempolicy on the vmas); migrate_pages(2) holds down_read of pid's mmap_lock throughout.
They both hold mmap_lock across the internal migrate_pages(), under which all new page allocations (huge or small) are made. I'm nervous about it; and migrate_pages() certainly does not need mmap_lock itself. It's done this way for mbind(2), because its page allocator is vma_alloc_folio() or alloc_hugetlb_folio_vma(), both of which depend on vma and address.
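For orientation, a minimal userspace sketch of the two syscalls discussed above (illustrative only, not part of this patch): it assumes libnuma's <numaif.h> wrappers and a machine with nodes 0 and 1, and omits error handling.

	#include <numaif.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned long len = 16UL << 20;
		unsigned long node0 = 1UL << 0;	/* nodemask holding only node 0 */
		unsigned long node1 = 1UL << 1;	/* nodemask holding only node 1 */
		void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
				  MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);

		/* mbind(2): set MPOL_BIND on the range and migrate its pages to node 1 */
		mbind(addr, len, MPOL_BIND, &node1, 8 * sizeof(node1), MPOL_MF_MOVE);

		/* migrate_pages(2): move this task's pages from node 0 to node 1 */
		migrate_pages(getpid(), 8 * sizeof(node0), &node0, &node1);
		return 0;
	}

Both calls funnel into the kernel's internal migrate_pages(), which is where the mmap_lock coverage described above (and relaxed below) applies.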
Now that we have alloc_pages_mpol(), depending on (refcounted) memory policy and interleave index, mbind(2) can be modified to use that or alloc_hugetlb_folio_nodemask(), and then not need mmap_lock across the internal migrate_pages() at all: add alloc_migration_target_by_mpol() to replace mbind's new_page().
(After that change, alloc_hugetlb_folio_vma() is used by nothing but a userfaultfd function: move it out of hugetlb.h and into the #ifdef.)
migrate_pages(2) has chosen its target node before migrating, so can continue to use the standard alloc_migration_target(); but let it take and drop mmap_lock just around migrate_to_node()'s queue_pages_range(): neither the node-to-node calculations nor the page migrations need it.
It seems unlikely, but it is conceivable that some userspace depends on the kernel's mmap_lock exclusion here, instead of doing its own locking: more likely in a testsuite than in real life. It is also possible, of course, that some pages on the list will be munmapped by another thread before they are migrated, or a newer memory policy applied to the range by that time: but such races could happen before, as soon as mmap_lock was dropped, so it does not appear to be a concern.
Link: https://lkml.kernel.org/r/21e564e8-269f-6a89-7ee2-fd612831c289@google.com
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Sidhartha Kumar <sidhartha.kumar@oracle.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Tejun heo <tj@kernel.org>
Cc: Vishal Moola (Oracle) <vishal.moola@gmail.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 include/linux/hugetlb.h |  9 -----
 mm/hugetlb.c            | 38 ++++++++++---------
 mm/mempolicy.c          | 83 +++++++++++++++++++++--------------------
 3 files changed, 63 insertions(+), 67 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index b9a78a93859a..616eab7d2d9e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -800,8 +800,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 				unsigned long addr, int avoid_reserve);
 struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 				nodemask_t *nmask, gfp_t gfp_mask);
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-				unsigned long address);
 int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping,
 			pgoff_t idx);
 void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
@@ -1129,13 +1127,6 @@ alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return NULL;
 }
 
-static inline struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
-					struct vm_area_struct *vma,
-					unsigned long address)
-{
-	return NULL;
-}
-
 static inline int __alloc_bootmem_huge_page(struct hstate *h)
 {
 	return 0;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 625e01f57c39..fcaf8eeba1c4 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2590,24 +2590,6 @@ struct folio *alloc_hugetlb_folio_nodemask(struct hstate *h, int preferred_nid,
 	return alloc_migrate_hugetlb_folio(h, gfp_mask, preferred_nid, nmask);
 }
 
-/* mempolicy aware migration callback */
-struct folio *alloc_hugetlb_folio_vma(struct hstate *h, struct vm_area_struct *vma,
-		unsigned long address)
-{
-	struct mempolicy *mpol;
-	nodemask_t *nodemask;
-	struct folio *folio;
-	gfp_t gfp_mask;
-	int node;
-
-	gfp_mask = htlb_alloc_mask(h);
-	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
-	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
-	mpol_cond_put(mpol);
-
-	return folio;
-}
-
 /*
  * Increase the hugetlb pool such that it can accommodate a reservation
  * of size 'delta'.
@@ -6472,6 +6454,26 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #ifdef CONFIG_USERFAULTFD
+/*
+ * Can probably be eliminated, but still used by hugetlb_mfill_atomic_pte().
+ */
+static struct folio *alloc_hugetlb_folio_vma(struct hstate *h,
+		struct vm_area_struct *vma, unsigned long address)
+{
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	struct folio *folio;
+	gfp_t gfp_mask;
+	int node;
+
+	gfp_mask = htlb_alloc_mask(h);
+	node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
+	folio = alloc_hugetlb_folio_nodemask(h, node, nodemask, gfp_mask);
+	mpol_cond_put(mpol);
+
+	return folio;
+}
+
 /*
  * Used by userfaultfd UFFDIO_* ioctls. Based on userfaultfd's mfill_atomic_pte
  * with modifications for hugetlb pages.
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4831f3221b5f..02df4aed44af 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -420,6 +420,8 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 
 static bool migrate_folio_add(struct folio *folio, struct list_head *foliolist,
 				unsigned long flags);
+static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol,
+				pgoff_t ilx, int *nid);
 
 static bool strictly_unmovable(unsigned long flags)
 {
@@ -1046,6 +1048,8 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	node_set(source, nmask);
 
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
+
+	mmap_read_lock(mm);
 	vma = find_vma(mm, 0);
 
 	/*
@@ -1056,6 +1060,7 @@ static long migrate_to_node(struct mm_struct *mm, int source, int dest,
 	 */
 	nr_failed = queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
 			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	mmap_read_unlock(mm);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1084,8 +1089,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 
 	lru_cache_disable();
 
-	mmap_read_lock(mm);
-
 	/*
 	 * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 	 * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -1165,7 +1168,6 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 		if (err < 0)
 			break;
 	}
-	mmap_read_unlock(mm);
 
 	lru_cache_enable();
 	if (err < 0)
@@ -1174,44 +1176,38 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 }
 
 /*
- * Allocate a new page for page migration based on vma policy.
- * Start by assuming the page is mapped by the same vma as contains @start.
- * Search forward from there, if not.  N.B., this assumes that the
- * list of pages handed to migrate_pages()--which is how we get here--
- * is in virtual address order.
+ * Allocate a new folio for page migration, according to NUMA mempolicy.
  */
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
-	struct vm_area_struct *vma;
-	unsigned long address;
-	VMA_ITERATOR(vmi, current->mm, start);
-	gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL;
-
-	for_each_vma(vmi, vma) {
-		address = page_address_in_vma(&src->page, vma);
-		if (address != -EFAULT)
-			break;
-	}
+	struct mempolicy *pol = (struct mempolicy *)private;
+	pgoff_t ilx = 0;	/* improve on this later */
+	struct page *page;
+	unsigned int order;
+	int nid = numa_node_id();
+	gfp_t gfp;
 
-	/*
-	 * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL
-	 * when the page can no longer be located in a vma: that is not ideal
-	 * (migrate_pages() will give up early, presuming ENOMEM), but good
-	 * enough to avoid a crash by syzkaller or concurrent holepunch.
-	 */
-	if (!vma)
-		return NULL;
+	order = folio_order(src);
+	ilx += src->index >> order;
 
 	if (folio_test_hugetlb(src)) {
-		return alloc_hugetlb_folio_vma(folio_hstate(src),
-				vma, address);
+		nodemask_t *nodemask;
+		struct hstate *h;
+
+		h = folio_hstate(src);
+		gfp = htlb_alloc_mask(h);
+		nodemask = policy_nodemask(gfp, pol, ilx, &nid);
+		return alloc_hugetlb_folio_nodemask(h, nid, nodemask, gfp);
 	}
 
 	if (folio_test_large(src))
 		gfp = GFP_TRANSHUGE;
+	else
+		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
-	return vma_alloc_folio(gfp, folio_order(src), vma, address,
-			folio_test_large(src));
+	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
+	return page_rmappable_folio(page);
 }
 #else
 
@@ -1227,7 +1223,8 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct folio *new_folio(struct folio *src, unsigned long start)
+static struct folio *alloc_migration_target_by_mpol(struct folio *src,
+						    unsigned long private)
 {
 	return NULL;
 }
@@ -1300,6 +1297,7 @@ long __do_mbind(unsigned long start, unsigned long len,
 
 		if (nr_failed < 0) {
 			err = nr_failed;
+			nr_failed = 0;
 		} else {
 			vma_iter_init(&vmi, mm, start);
 			prev = vma_prev(&vmi);
@@ -1310,19 +1308,24 @@ long __do_mbind(unsigned long start, unsigned long len,
 		}
 	}
 
-	if (!err) {
-		if (!list_empty(&pagelist)) {
-			nr_failed |= migrate_pages(&pagelist, new_folio, NULL,
-				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND, NULL);
+	mmap_write_unlock(mm);
+
+	if (!err && !list_empty(&pagelist)) {
+		/* Convert MPOL_DEFAULT's NULL to task or default policy */
+		if (!new) {
+			new = get_task_policy(current);
+			mpol_get(new);
 		}
-		if (nr_failed && (flags & MPOL_MF_STRICT))
-			err = -EIO;
+		nr_failed |= migrate_pages(&pagelist,
+				alloc_migration_target_by_mpol, NULL,
+				(unsigned long)new, MIGRATE_SYNC,
+				MR_MEMPOLICY_MBIND, NULL);
 	}
 
+	if (nr_failed && (flags & MPOL_MF_STRICT))
+		err = -EIO;
 	if (!list_empty(&pagelist))
 		putback_movable_pages(&pagelist);
-
-	mmap_write_unlock(mm);
 mpol_out:
 	mpol_put(new);
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))