Backport four stable patches which have conflicts that need to be fixed.
Kirill A. Shutemov (1):
  mm, treewide: introduce NR_PAGE_ORDERS

Matthew Wilcox (Oracle) (1):
  mm: turn folio_test_hugetlb into a PageType

Miaohe Lin (1):
  fork: defer linking file vma until vma is fully initialized

Peter Xu (1):
  mm/hugetlb: fix missing hugetlb_lock for resv uncharge
 .../admin-guide/kdump/vmcoreinfo.rst        |  6 +-
 arch/arm64/kvm/hyp/include/nvhe/gfp.h       |  2 +-
 arch/sparc/kernel/traps_64.c                |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_device_test.c |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c              | 20 +++---
 include/drm/ttm/ttm_pool.h                  |  2 +-
 include/linux/mmzone.h                      |  6 +-
 include/linux/page-flags.h                  | 72 +++++++++----------
 include/trace/events/mmflags.h              |  1 +
 kernel/crash_core.c                         |  7 +-
 kernel/fork.c                               | 33 ++++-----
 lib/test_meminit.c                          |  2 +-
 mm/hugetlb.c                                | 27 ++-----
 mm/internal.h                               |  1 -
 mm/kmsan/init.c                             |  2 +-
 mm/page_alloc.c                             | 13 ++--
 mm/page_reporting.c                         |  2 +-
 mm/show_mem.c                               |  8 +--
 mm/vmstat.c                                 | 12 ++--
 19 files changed, 102 insertions(+), 118 deletions(-)
From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
stable inclusion
from stable-v6.6.30
commit ded1ffea52132e58eaaa7d4ea39477f911796a40
category: cleanup
bugzilla: https://gitee.com/openeuler/kernel/issues/I9NYY7
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit fd37721803c6e73619108f76ad2e12a9aa5fafaf ]
NR_PAGE_ORDERS defines the number of page orders supported by the page allocator, ranging from 0 to MAX_ORDER, MAX_ORDER + 1 in total.
NR_PAGE_ORDERS assists in defining arrays of page orders and allows for more natural iteration over them.
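For reference, a minimal sketch (not part of the patch) of the sizing and iteration idiom the series converts to; count_free() here is a hypothetical helper used only for illustration:

    #define NR_PAGE_ORDERS (MAX_ORDER + 1)

    /* arrays indexed by page order are sized with the same symbol ... */
    struct free_area free_area[NR_PAGE_ORDERS];

    /* ... and loops become exclusive instead of "order <= MAX_ORDER" */
    for (order = 0; order < NR_PAGE_ORDERS; order++)
        nr_free += count_free(&free_area[order]);   /* hypothetical helper */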
[kirill.shutemov@linux.intel.com: fixup for kerneldoc warning]
Link: https://lkml.kernel.org/r/20240101111512.7empzyifq7kxtzk3@box
Link: https://lkml.kernel.org/r/20231228144704.14033-1-kirill.shutemov@linux.intel...
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reviewed-by: Zi Yan <ziy@nvidia.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Stable-dep-of: b6976f323a86 ("drm/ttm: stop pooling cached NUMA pages v2")
Signed-off-by: Sasha Levin <sashal@kernel.org>
Conflicts:
    mm/compaction.c
    kernel/crash_core.c
    mm/internal.h
[ Context conflict with commit c9b39e3ffa86 in kernel/crash_core.c.
  Context conflict with commit af879363ce88 in mm/compaction.c.
  Remove previous define of NR_PAGE_ORDERS in mm/internal.h. ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 .../admin-guide/kdump/vmcoreinfo.rst        |  6 +++---
 arch/arm64/kvm/hyp/include/nvhe/gfp.h       |  2 +-
 arch/sparc/kernel/traps_64.c                |  2 +-
 drivers/gpu/drm/ttm/tests/ttm_device_test.c |  2 +-
 drivers/gpu/drm/ttm/ttm_pool.c              | 20 +++++++++----------
 include/drm/ttm/ttm_pool.h                  |  2 +-
 include/linux/mmzone.h                      |  6 ++++--
 kernel/crash_core.c                         |  2 +-
 lib/test_meminit.c                          |  2 +-
 mm/internal.h                               |  1 -
 mm/kmsan/init.c                             |  2 +-
 mm/page_alloc.c                             | 13 ++++++------
 mm/page_reporting.c                         |  2 +-
 mm/show_mem.c                               |  8 ++++----
 mm/vmstat.c                                 | 12 +++++------
 15 files changed, 41 insertions(+), 41 deletions(-)
diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index c11bd4b1ceb1..dd4dcc61f8e3 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -172,7 +172,7 @@ variables. Offset of the free_list's member. This value is used to compute the number of free pages.
-Each zone has a free_area structure array called free_area[MAX_ORDER + 1]. +Each zone has a free_area structure array called free_area[NR_PAGE_ORDERS]. The free_list represents a linked list of free page blocks.
(list_head, next|prev) @@ -189,8 +189,8 @@ Offsets of the vmap_area's members. They carry vmalloc-specific information. Makedumpfile gets the start address of the vmalloc region from this.
-(zone.free_area, MAX_ORDER + 1) -------------------------------- +(zone.free_area, NR_PAGE_ORDERS) +--------------------------------
Free areas descriptor. User-space tools use this value to iterate the free_area ranges. MAX_ORDER is used by the zone buddy allocator. diff --git a/arch/arm64/kvm/hyp/include/nvhe/gfp.h b/arch/arm64/kvm/hyp/include/nvhe/gfp.h index fe5472a184a3..97c527ef53c2 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/gfp.h +++ b/arch/arm64/kvm/hyp/include/nvhe/gfp.h @@ -16,7 +16,7 @@ struct hyp_pool { * API at EL2. */ hyp_spinlock_t lock; - struct list_head free_area[MAX_ORDER + 1]; + struct list_head free_area[NR_PAGE_ORDERS]; phys_addr_t range_start; phys_addr_t range_end; unsigned short max_order; diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 08ffd17d5ec3..523a6e5ee925 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -897,7 +897,7 @@ void __init cheetah_ecache_flush_init(void)
/* Now allocate error trap reporting scoreboard. */ sz = NR_CPUS * (2 * sizeof(struct cheetah_err_info)); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { if ((PAGE_SIZE << order) >= sz) break; } diff --git a/drivers/gpu/drm/ttm/tests/ttm_device_test.c b/drivers/gpu/drm/ttm/tests/ttm_device_test.c index b1b423b68cdf..19eaff22e6ae 100644 --- a/drivers/gpu/drm/ttm/tests/ttm_device_test.c +++ b/drivers/gpu/drm/ttm/tests/ttm_device_test.c @@ -175,7 +175,7 @@ static void ttm_device_init_pools(struct kunit *test)
if (params->pools_init_expected) { for (int i = 0; i < TTM_NUM_CACHING_TYPES; ++i) { - for (int j = 0; j <= MAX_ORDER; ++j) { + for (int j = 0; j < NR_PAGE_ORDERS; ++j) { pt = pool->caching[i].orders[j]; KUNIT_EXPECT_PTR_EQ(test, pt.pool, pool); KUNIT_EXPECT_EQ(test, pt.caching, i); diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 9b60222511d6..c8ec6a2cac5d 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -65,11 +65,11 @@ module_param(page_pool_size, ulong, 0644);
static atomic_long_t allocated_pages;
-static struct ttm_pool_type global_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_uncached[NR_PAGE_ORDERS];
-static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER + 1]; -static struct ttm_pool_type global_dma32_uncached[MAX_ORDER + 1]; +static struct ttm_pool_type global_dma32_write_combined[NR_PAGE_ORDERS]; +static struct ttm_pool_type global_dma32_uncached[NR_PAGE_ORDERS];
static spinlock_t shrinker_lock; static struct list_head shrinker_list; @@ -565,7 +565,7 @@ void ttm_pool_init(struct ttm_pool *pool, struct device *dev,
if (use_dma_alloc || nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_init(&pool->caching[i].orders[j], pool, i, j); } @@ -586,7 +586,7 @@ void ttm_pool_fini(struct ttm_pool *pool)
if (pool->use_dma_alloc || pool->nid != NUMA_NO_NODE) { for (i = 0; i < TTM_NUM_CACHING_TYPES; ++i) - for (j = 0; j <= MAX_ORDER; ++j) + for (j = 0; j < NR_PAGE_ORDERS; ++j) ttm_pool_type_fini(&pool->caching[i].orders[j]); }
@@ -641,7 +641,7 @@ static void ttm_pool_debugfs_header(struct seq_file *m) unsigned int i;
seq_puts(m, "\t "); - for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " ---%2u---", i); seq_puts(m, "\n"); } @@ -652,7 +652,7 @@ static void ttm_pool_debugfs_orders(struct ttm_pool_type *pt, { unsigned int i;
- for (i = 0; i <= MAX_ORDER; ++i) + for (i = 0; i < NR_PAGE_ORDERS; ++i) seq_printf(m, " %8u", ttm_pool_type_count(&pt[i])); seq_puts(m, "\n"); } @@ -761,7 +761,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) spin_lock_init(&shrinker_lock); INIT_LIST_HEAD(&shrinker_list);
- for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_init(&global_write_combined[i], NULL, ttm_write_combined, i); ttm_pool_type_init(&global_uncached[i], NULL, ttm_uncached, i); @@ -794,7 +794,7 @@ void ttm_pool_mgr_fini(void) { unsigned int i;
- for (i = 0; i <= MAX_ORDER; ++i) { + for (i = 0; i < NR_PAGE_ORDERS; ++i) { ttm_pool_type_fini(&global_write_combined[i]); ttm_pool_type_fini(&global_uncached[i]);
diff --git a/include/drm/ttm/ttm_pool.h b/include/drm/ttm/ttm_pool.h index 30a347e5aa11..4490d43c63e3 100644 --- a/include/drm/ttm/ttm_pool.h +++ b/include/drm/ttm/ttm_pool.h @@ -74,7 +74,7 @@ struct ttm_pool { bool use_dma32;
struct { - struct ttm_pool_type orders[MAX_ORDER + 1]; + struct ttm_pool_type orders[NR_PAGE_ORDERS]; } caching[TTM_NUM_CACHING_TYPES]; };
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8683ae8f3f25..a394cbe31c59 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -34,6 +34,8 @@
#define IS_MAX_ORDER_ALIGNED(pfn) IS_ALIGNED(pfn, MAX_ORDER_NR_PAGES)
+#define NR_PAGE_ORDERS (MAX_ORDER + 1) + /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should @@ -95,7 +97,7 @@ static inline bool migratetype_is_mergeable(int mt) }
#define for_each_migratetype_order(order, type) \ - for (order = 0; order <= MAX_ORDER; order++) \ + for (order = 0; order < NR_PAGE_ORDERS; order++) \ for (type = 0; type < MIGRATE_TYPES; type++)
extern int page_group_by_mobility_disabled; @@ -965,7 +967,7 @@ struct zone { CACHELINE_PADDING(_pad1_);
/* free areas of different sizes */ - struct free_area free_area[MAX_ORDER + 1]; + struct free_area free_area[NR_PAGE_ORDERS];
#ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_ORDER */ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 05d1d7168c5b..1800784b2cec 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -658,7 +658,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER + 1); + VMCOREINFO_LENGTH(zone.free_area, NR_PAGE_ORDERS); log_buf_vmcoreinfo_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); VMCOREINFO_NUMBER(NR_FREE_PAGES); diff --git a/lib/test_meminit.c b/lib/test_meminit.c index 0ae35223d773..0dc173849a54 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -93,7 +93,7 @@ static int __init test_pages(int *total_failures) int failures = 0, num_tests = 0; int i;
- for (i = 0; i <= MAX_ORDER; i++) + for (i = 0; i < NR_PAGE_ORDERS; i++) num_tests += do_alloc_pages_order(i, &failures);
REPORT_FAILURES_IN_FN(); diff --git a/mm/internal.h b/mm/internal.h index b9d842322a40..fad7d8bf2e32 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -656,7 +656,6 @@ int split_free_page(struct page *free_page,
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-#define NR_PAGE_ORDERS (MAX_ORDER + 1) #define MAX_PAGE_ORDER MAX_ORDER
/* diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c index ffedf4dbc49d..103e2e88ea03 100644 --- a/mm/kmsan/init.c +++ b/mm/kmsan/init.c @@ -96,7 +96,7 @@ void __init kmsan_init_shadow(void) struct metadata_page_pair { struct page *shadow, *origin; }; -static struct metadata_page_pair held_back[MAX_ORDER + 1] __initdata; +static struct metadata_page_pair held_back[NR_PAGE_ORDERS] __initdata;
/* * Eager metadata allocation. When the memblock allocator is freeing pages to diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 668b29d09be7..afb95af61298 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1578,7 +1578,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, struct page *page;
/* Find a page of the appropriate size in the preferred list */ - for (current_order = order; current_order <= MAX_ORDER; ++current_order) { + for (current_order = order; current_order < NR_PAGE_ORDERS; ++current_order) { area = &(zone->free_area[current_order]); page = get_page_from_free_area(area, migratetype); if (!page) @@ -1952,7 +1952,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, continue;
spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &(zone->free_area[order]);
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); @@ -2062,8 +2062,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, return false;
find_smallest: - for (current_order = order; current_order <= MAX_ORDER; - current_order++) { + for (current_order = order; current_order < NR_PAGE_ORDERS; current_order++) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, start_migratetype, false, &can_steal); @@ -3038,7 +3037,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true;
/* For a high-order request, check at least one suitable page is free */ - for (o = order; o <= MAX_ORDER; o++) { + for (o = order; o < NR_PAGE_ORDERS; o++) { struct free_area *area = &z->free_area[o]; int mt;
@@ -6872,7 +6871,7 @@ bool is_free_buddy_page(struct page *page) unsigned long pfn = page_to_pfn(page); unsigned int order;
- for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1));
if (PageBuddy(page_head) && @@ -6931,7 +6930,7 @@ bool take_page_off_buddy(struct page *page) bool ret = false;
spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct page *page_head = page - (pfn & ((1 << order) - 1)); int page_order = buddy_order(page_head);
diff --git a/mm/page_reporting.c b/mm/page_reporting.c index b021f482a4cb..66369cc5279b 100644 --- a/mm/page_reporting.c +++ b/mm/page_reporting.c @@ -276,7 +276,7 @@ page_reporting_process_zone(struct page_reporting_dev_info *prdev, return err;
/* Process each free list starting from lowest order/mt */ - for (order = page_reporting_order; order <= MAX_ORDER; order++) { + for (order = page_reporting_order; order < NR_PAGE_ORDERS; order++) { for (mt = 0; mt < MIGRATE_TYPES; mt++) { /* We do not pull pages from the isolate free list */ if (is_migrate_isolate(mt)) diff --git a/mm/show_mem.c b/mm/show_mem.c index 1aa5cf26221f..a5608c697d15 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -356,8 +356,8 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z
for_each_populated_zone(zone) { unsigned int order; - unsigned long nr[MAX_ORDER + 1], flags, total = 0; - unsigned char types[MAX_ORDER + 1]; + unsigned long nr[NR_PAGE_ORDERS], flags, total = 0; + unsigned char types[NR_PAGE_ORDERS];
if (zone_idx(zone) > max_zone_idx) continue; @@ -367,7 +367,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z printk(KERN_CONT "%s: ", zone->name);
spin_lock_irqsave(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { struct free_area *area = &zone->free_area[order]; int type;
@@ -381,7 +381,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { printk(KERN_CONT "%lu*%lukB ", nr[order], K(1UL) << order); if (nr[order]) diff --git a/mm/vmstat.c b/mm/vmstat.c index 4c2f3b824d60..9a0c40d804e6 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1057,7 +1057,7 @@ static void fill_contig_page_info(struct zone *zone, info->free_blocks_total = 0; info->free_blocks_suitable = 0;
- for (order = 0; order <= MAX_ORDER; order++) { + for (order = 0; order < NR_PAGE_ORDERS; order++) { unsigned long blocks;
/* @@ -1473,7 +1473,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, int order;
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) /* * Access to nr_free is lockless as nr_free is used only for * printing purposes. Use data_race to avoid KCSAN warning. @@ -1502,7 +1502,7 @@ static void pagetypeinfo_showfree_print(struct seq_file *m, pgdat->node_id, zone->name, migratetype_names[mtype]); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { unsigned long freecount = 0; struct free_area *area; struct list_head *curr; @@ -1542,7 +1542,7 @@ static void pagetypeinfo_showfree(struct seq_file *m, void *arg)
/* Print header */ seq_printf(m, "%-43s ", "Free pages count per migrate type at order"); - for (order = 0; order <= MAX_ORDER; ++order) + for (order = 0; order < NR_PAGE_ORDERS; ++order) seq_printf(m, "%6d ", order); seq_putc(m, '\n');
@@ -2178,7 +2178,7 @@ static void unusable_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = unusable_free_index(order, &info); seq_printf(m, "%d.%03d ", index / 1000, index % 1000); @@ -2230,7 +2230,7 @@ static void extfrag_show_print(struct seq_file *m, seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order <= MAX_ORDER; ++order) { + for (order = 0; order < NR_PAGE_ORDERS; ++order) { fill_contig_page_info(zone, order, &info); index = __fragmentation_index(order, &info); seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
From: Miaohe Lin <linmiaohe@huawei.com>
mainline inclusion
from mainline-v6.9-rc5
commit 35e351780fa9d8240dd6f7e4f245f9ea37e96c19
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9NYY7
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Thorvald reported a WARNING [1]. The root cause is the race below:
CPU 1                                    CPU 2
fork                                     hugetlbfs_fallocate
 dup_mmap                                 hugetlbfs_punch_hole
  i_mmap_lock_write(mapping);
  vma_interval_tree_insert_after -- Child vma is visible through i_mmap tree.
  i_mmap_unlock_write(mapping);
  hugetlb_dup_vma_private -- Clear vma_lock outside i_mmap_rwsem!
                                          i_mmap_lock_write(mapping);
                                          hugetlb_vmdelete_list
                                           vma_interval_tree_foreach
                                            hugetlb_vma_trylock_write -- Vma_lock is cleared.
  tmp->vm_ops->open -- Alloc new vma_lock outside i_mmap_rwsem!
                                            hugetlb_vma_unlock_write -- Vma_lock is assigned!!!
                                          i_mmap_unlock_write(mapping);
hugetlb_dup_vma_private() and hugetlb_vm_op_open() are called outside the i_mmap_rwsem lock while the vma lock can be used at the same time. Fix this by deferring the linking of the file vma until the vma is fully initialized. Those vmas should be initialized first before they can be used.
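A condensed sketch of the resulting order of operations in dup_mmap(), simplified from the diff below (error handling and the remaining i_mmap bookkeeping are omitted):

    /* fully initialize the child vma first ... */
    if (is_vm_hugetlb_page(tmp))
        hugetlb_dup_vma_private(tmp);           /* set up the hugetlb vma_lock */

    vma_iter_bulk_store(&vmi, tmp);             /* link the vma into the maple tree */
    mm->map_count++;

    if (tmp->vm_ops && tmp->vm_ops->open)
        tmp->vm_ops->open(tmp);                 /* may allocate a fresh vma_lock */

    /* ... and only then make it visible in the file's i_mmap tree */
    file = tmp->vm_file;
    if (file) {
        i_mmap_lock_write(file->f_mapping);
        vma_interval_tree_insert_after(tmp, mpnt,
                                       &file->f_mapping->i_mmap);
        i_mmap_unlock_write(file->f_mapping);
    }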
Link: https://lkml.kernel.org/r/20240410091441.3539905-1-linmiaohe@huawei.com
Fixes: 8d9bfb260814 ("hugetlb: add vma based lock for pmd sharing")
Signed-off-by: Miaohe Lin <linmiaohe@huawei.com>
Reported-by: Thorvald Natvig <thorvald@google.com>
Closes: https://lore.kernel.org/linux-mm/20240129161735.6gmjsswx62o4pbja@revolver/T/ [1]
Reviewed-by: Jane Chu <jane.chu@oracle.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Mateusz Guzik <mjguzik@gmail.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peng Zhang <zhangpeng.00@bytedance.com>
Cc: Tycho Andersen <tandersen@netflix.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
[ The stable commit cec11fa2eb51 conflicts due to commit d37e561485e5, so backport the mainline version instead. ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 kernel/fork.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)
diff --git a/kernel/fork.c b/kernel/fork.c index d75c18892eb6..e033388b11bd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -745,6 +745,23 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); + /* + * Copy/update hugetlb private vma information. + */ + if (is_vm_hugetlb_page(tmp)) + hugetlb_dup_vma_private(tmp); + + /* + * Link the vma into the MT. After using __mt_dup(), memory + * allocation is not necessary here, so it cannot fail. + */ + vma_iter_bulk_store(&vmi, tmp); + + mm->map_count++; + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; @@ -761,25 +778,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, i_mmap_unlock_write(mapping); }
- /* - * Copy/update hugetlb private vma information. - */ - if (is_vm_hugetlb_page(tmp)) - hugetlb_dup_vma_private(tmp); - - /* - * Link the vma into the MT. After using __mt_dup(), memory - * allocation is not necessary here, so it cannot fail. - */ - vma_iter_bulk_store(&vmi, tmp); - - mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt);
- if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - if (retval) { mpnt = vma_next(&vmi); goto loop_out;
From: Peter Xu <peterx@redhat.com>
stable inclusion
from stable-v6.6.30
commit f6c5d21db16a0910152ec8aa9d5a7aed72694505
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9NYY7
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit b76b46902c2d0395488c8412e1116c2486cdfcb2 upstream.
There is a recent report on UFFDIO_COPY over hugetlb:
https://lore.kernel.org/all/000000000000ee06de0616177560@google.com/
350: lockdep_assert_held(&hugetlb_lock);
This should be an issue in hugetlb, but it was triggered in a userfault context, where it goes into the unlikely path in which two threads modify the resv map together. Mike has a fix in that path for the resv uncharge, but it looks like the locking criteria were overlooked: hugetlb_cgroup_uncharge_folio_rsvd() updates the cgroup pointer, so it must be called with the lock held.
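Condensed from the diff below, the fix is simply to take hugetlb_lock around the deferred-reserve uncharge in the error path:

    if (deferred_reserve) {
        /* hugetlb_cgroup_uncharge_folio_rsvd() updates the folio's
         * resv cgroup pointer, which requires hugetlb_lock */
        spin_lock_irq(&hugetlb_lock);
        hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h),
                                           pages_per_huge_page(h), folio);
        spin_unlock_irq(&hugetlb_lock);
    }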
Link: https://lkml.kernel.org/r/20240417211836.2742593-3-peterx@redhat.com
Fixes: 79aa925bf239 ("hugetlb_cgroup: fix reservation accounting")
Signed-off-by: Peter Xu <peterx@redhat.com>
Reported-by: syzbot+4b8077a5fccc61c385a1@syzkaller.appspotmail.com
Reviewed-by: Mina Almasry <almasrymina@google.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
    mm/hugetlb.c
[ Context conflicts with commit d5ea6f5f46c3 because it adds a new param in
  hugepage_subpool_put_pages() and hugetlb_acct_memory(). ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 mm/hugetlb.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 625e01f57c39..164626357140 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3255,9 +3255,12 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); hugetlb_acct_memory(h, -rsv_adjust, info); - if (deferred_reserve) + if (deferred_reserve) { + spin_lock_irq(&hugetlb_lock); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + spin_unlock_irq(&hugetlb_lock); + } }
if (!memcg_charge_ret)
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
stable inclusion
from stable-v6.6.30
commit 2431b5f2650dfc47ce782d1ca7b02d6b3916976f
category: cleanup
bugzilla: https://gitee.com/openeuler/kernel/issues/I9NYY7
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit d99e3140a4d33e26066183ff727d8f02f56bec64 upstream.
The current folio_test_hugetlb() can be fooled by a concurrent folio split into returning true for a folio which has never belonged to hugetlbfs. This can't happen if the caller holds a refcount on it, but we have a few places (memory-failure, compaction, procfs) which do not and should not take a speculative reference.
Since hugetlb pages do not use individual page mapcounts (they are always fully mapped and use the entire_mapcount field to record the number of mappings), the PageType field is available now that page_mapcount() ignores the value in this field.
In compaction and with CONFIG_DEBUG_VM enabled, the current implementation can result in an oops, as reported by Luis. This happens since 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR") effectively added some VM_BUG_ON() checks in the PageHuge() testing path.
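For reference, a condensed sketch of the page_type mechanism that PG_hugetlb moves into (simplified from the existing definitions in include/linux/page-flags.h in this tree):

    /* page->page_type starts at -1 (all bits set); "setting" a type
     * clears its bit, so the whole word is compared, not a single bit */
    #define PAGE_TYPE_BASE  0xf0000000
    #define PG_hugetlb      0x00000800

    #define PageType(page, flag) \
        ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE)

With FOLIO_TYPE_OPS(hugetlb, hugetlb), callers keep using folio_test_hugetlb() together with the __folio_set_hugetlb()/__folio_clear_hugetlb() helpers, as the mm/hugetlb.c hunks below show.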
[willy@infradead.org: update vmcoreinfo]
Link: https://lkml.kernel.org/r/ZgGZUvsdhaT1Va-T@casper.infradead.org
Link: https://lkml.kernel.org/r/20240321142448.1645400-6-willy@infradead.org
Fixes: 9c5ccf2db04b ("mm: remove HUGETLB_PAGE_DTOR")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Reported-by: Luis Chamberlain <mcgrof@kernel.org>
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218227
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Conflicts:
    include/linux/page-flags.h
[ Conflict with commit aeb5fd3e5a79 since it adds a page_type PG_dpool with
  value 0x800, which is the same as PG_hugetlb. Change PG_dpool to 0x10000. ]
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 include/linux/page-flags.h     | 72 ++++++++++++++++------------
 include/trace/events/mmflags.h |  1 +
 kernel/crash_core.c            |  5 +--
 mm/hugetlb.c                   | 22 ++---
 4 files changed, 40 insertions(+), 60 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 71c810ba7e08..7a67d997eece 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -193,7 +193,6 @@ enum pageflags {
/* At least one page in this folio has the hwpoison flag set */ PG_has_hwpoisoned = PG_error, - PG_hugetlb = PG_active, PG_large_rmappable = PG_workingset, /* anon or file-backed */ };
@@ -848,29 +847,6 @@ TESTPAGEFLAG_FALSE(LargeRmappable, large_rmappable)
#define PG_head_mask ((1UL << PG_head))
-#ifdef CONFIG_HUGETLB_PAGE -int PageHuge(struct page *page); -SETPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) -CLEARPAGEFLAG(HugeTLB, hugetlb, PF_SECOND) - -/** - * folio_test_hugetlb - Determine if the folio belongs to hugetlbfs - * @folio: The folio to test. - * - * Context: Any context. Caller should have a reference on the folio to - * prevent it from being turned into a tail page. - * Return: True for hugetlbfs folios, false for anon folios or folios - * belonging to other filesystems. - */ -static inline bool folio_test_hugetlb(struct folio *folio) -{ - return folio_test_large(folio) && - test_bit(PG_hugetlb, folio_flags(folio, 1)); -} -#else -TESTPAGEFLAG_FALSE(Huge, hugetlb) -#endif - #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for @@ -926,18 +902,6 @@ PAGEFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) TESTSCFLAG_FALSE(HasHWPoisoned, has_hwpoisoned) #endif
-/* - * Check if a page is currently marked HWPoisoned. Note that this check is - * best effort only and inherently racy: there is no way to synchronize with - * failing hardware. - */ -static inline bool is_page_hwpoison(struct page *page) -{ - if (PageHWPoison(page)) - return true; - return PageHuge(page) && PageHWPoison(compound_head(page)); -} - /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the @@ -954,8 +918,9 @@ static inline bool is_page_hwpoison(struct page *page) #define PG_offline 0x00000100 #define PG_table 0x00000200 #define PG_guard 0x00000400 +#define PG_hugetlb 0x00000800 #ifdef CONFIG_DYNAMIC_POOL -#define PG_dpool 0x00000800 +#define PG_dpool 0x00010000 #endif
#define PageType(page, flag) \ @@ -1058,6 +1023,37 @@ PAGE_TYPE_OPS(Guard, guard, guard) PAGE_TYPE_OPS(Dpool, dpool, dpool) #endif
+#ifdef CONFIG_HUGETLB_PAGE +FOLIO_TYPE_OPS(hugetlb, hugetlb) +#else +FOLIO_TEST_FLAG_FALSE(hugetlb) +#endif + +/** + * PageHuge - Determine if the page belongs to hugetlbfs + * @page: The page to test. + * + * Context: Any context. + * Return: True for hugetlbfs pages, false for anon pages or pages + * belonging to other filesystems. + */ +static inline bool PageHuge(const struct page *page) +{ + return folio_test_hugetlb(page_folio(page)); +} + +/* + * Check if a page is currently marked HWPoisoned. Note that this check is + * best effort only and inherently racy: there is no way to synchronize with + * failing hardware. + */ +static inline bool is_page_hwpoison(struct page *page) +{ + if (PageHWPoison(page)) + return true; + return PageHuge(page) && PageHWPoison(compound_head(page)); +} + extern bool is_free_buddy_page(struct page *page);
PAGEFLAG(Isolated, isolated, PF_ANY); @@ -1124,7 +1120,7 @@ static __always_inline void __ClearPageAnonExclusive(struct page *page) */ #define PAGE_FLAGS_SECOND \ (0xffUL /* order */ | 1UL << PG_has_hwpoisoned | \ - 1UL << PG_hugetlb | 1UL << PG_large_rmappable) + 1UL << PG_large_rmappable)
#define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 998a5c590325..6104fa2b6e47 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -143,6 +143,7 @@ IF_HAVE_PG_ARCH_X(arch_3) #define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) }
#define __def_pagetype_names \ + DEF_PAGETYPE_NAME(hugetlb), \ DEF_PAGETYPE_NAME(offline), \ DEF_PAGETYPE_NAME(guard), \ DEF_PAGETYPE_NAME(table), \ diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 1800784b2cec..748adee5b06b 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -673,11 +673,10 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_head_mask); #define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(PG_hugetlb); +#define PAGE_HUGETLB_MAPCOUNT_VALUE (~PG_hugetlb) + VMCOREINFO_NUMBER(PAGE_HUGETLB_MAPCOUNT_VALUE); #define PAGE_OFFLINE_MAPCOUNT_VALUE (~PG_offline) VMCOREINFO_NUMBER(PAGE_OFFLINE_MAPCOUNT_VALUE); -#endif
#ifdef CONFIG_KALLSYMS VMCOREINFO_SYMBOL(kallsyms_names); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 164626357140..6f90d0845c43 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1632,7 +1632,7 @@ static inline void __clear_hugetlb_destructor(struct hstate *h, { lockdep_assert_held(&hugetlb_lock);
- folio_clear_hugetlb(folio); + __folio_clear_hugetlb(folio); }
/* @@ -1719,7 +1719,7 @@ static void add_hugetlb_folio(struct hstate *h, struct folio *folio, h->surplus_huge_pages_node[nid]++; }
- folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); folio_change_private(folio, NULL); /* * We have to set hugetlb_vmemmap_optimized again as above @@ -1981,7 +1981,7 @@ void __prep_new_hugetlb_folio(struct hstate *h, struct folio *folio) { hugetlb_vmemmap_optimize(h, &folio->page); INIT_LIST_HEAD(&folio->lru); - folio_set_hugetlb(folio); + __folio_set_hugetlb(folio); hugetlb_set_folio_subpool(folio, NULL); set_hugetlb_cgroup(folio, NULL); set_hugetlb_cgroup_rsvd(folio, NULL); @@ -2084,22 +2084,6 @@ bool prep_compound_gigantic_folio_for_demote(struct folio *folio, return __prep_compound_gigantic_folio(folio, order, true); }
-/* - * PageHuge() only returns true for hugetlbfs pages, but not for normal or - * transparent huge pages. See the PageTransHuge() documentation for more - * details. - */ -int PageHuge(struct page *page) -{ - struct folio *folio; - - if (!PageCompound(page)) - return 0; - folio = page_folio(page); - return folio_test_hugetlb(folio); -} -EXPORT_SYMBOL_GPL(PageHuge); - /* * Find and lock address space (mapping) in write mode. *
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/7078 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/M...