From: Mel Gorman mgorman@techsingularity.net
mainline inclusion from mainline-v5.14-rc1 commit 44042b4498728f4376e84bae1ac8016d146d850b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I7EQDW CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------
The per-cpu page allocator (PCP) only stores order-0 pages. This means that all THP and "cheap" high-order allocations including SLUB contends on the zone->lock. This patch extends the PCP allocator to store THP and "cheap" high-order pages. Note that struct per_cpu_pages increases in size to 256 bytes (4 cache lines) on x86-64.
Note that this is not necessarily a universal performance win because of how it is implemented. High-order pages can cause pcp->high to be exceeded prematurely for lower-orders so for example, a large number of THP pages being freed could release order-0 pages from the PCP lists. Hence, much depends on the allocation/free pattern as observed by a single CPU to determine if caching helps or hurts a particular workload.
That said, basic performance testing passed. The following is a netperf UDP_STREAM test which hits the relevant patches as some of the network allocations are high-order.
netperf-udp 5.13.0-rc2 5.13.0-rc2 mm-pcpburst-v3r4 mm-pcphighorder-v1r7 Hmean send-64 261.46 ( 0.00%) 266.30 * 1.85%* Hmean send-128 516.35 ( 0.00%) 536.78 * 3.96%* Hmean send-256 1014.13 ( 0.00%) 1034.63 * 2.02%* Hmean send-1024 3907.65 ( 0.00%) 4046.11 * 3.54%* Hmean send-2048 7492.93 ( 0.00%) 7754.85 * 3.50%* Hmean send-3312 11410.04 ( 0.00%) 11772.32 * 3.18%* Hmean send-4096 13521.95 ( 0.00%) 13912.34 * 2.89%* Hmean send-8192 21660.50 ( 0.00%) 22730.72 * 4.94%* Hmean send-16384 31902.32 ( 0.00%) 32637.50 * 2.30%*
Functionally, a patch like this is necessary to make bulk allocation of high-order pages work with similar performance to order-0 bulk allocations. The bulk allocator is not updated in this series as it would have to be determined by bulk allocation users how they want to track the order of pages allocated with the bulk allocator.
Link: https://lkml.kernel.org/r/20210611135753.GC30378@techsingularity.net Signed-off-by: Mel Gorman mgorman@techsingularity.net Acked-by: Vlastimil Babka vbabka@suse.cz Cc: Zi Yan ziy@nvidia.com Cc: Dave Hansen dave.hansen@linux.intel.com Cc: Michal Hocko mhocko@kernel.org Cc: Jesper Dangaard Brouer brouer@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflicts: include/linux/mmzone.h mm/internal.h mm/page_alloc.c
Signed-off-by: Tu Jinjiang tujinjiang@huawei.com --- include/linux/gfp.h | 2 +- include/linux/mmzone.h | 20 ++++- mm/page_alloc.c | 170 ++++++++++++++++++++++++++++++----------- mm/swap.c | 2 +- 4 files changed, 145 insertions(+), 49 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5c3df92a4745..e81a001861dd 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -600,7 +600,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask);
extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_unref_page(struct page *page); +extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list);
struct page_frag_cache; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0a70b4bdd236..85b91e4f6ebd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -318,6 +318,24 @@ enum zone_watermarks { NR_WMARK };
+/* + * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional + * for pageblock size for THP if configured. + */ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define NR_PCP_THP 1 +#else +#define NR_PCP_THP 0 +#endif +#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP)) + +/* + * Shift to encode migratetype and order in the same integer, with order + * in the least significant bits. + */ +#define NR_PCP_ORDER_WIDTH 8 +#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1) + #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) @@ -329,7 +347,7 @@ struct per_cpu_pages { int batch; /* chunk size for buddy add/remove */
/* Lists of pages, one per migrate type stored on the pcp-lists */ - struct list_head lists[MIGRATE_PCPTYPES]; + struct list_head lists[NR_PCP_LISTS]; };
struct per_cpu_pageset { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12da70f39e0c..e3dbafd45f0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -222,6 +222,49 @@ static int __init early_init_on_free(char *buf) } early_param("init_on_free", early_init_on_free);
+static inline unsigned int order_to_pindex(int migratetype, int order) +{ + int base = order; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + VM_BUG_ON(order != pageblock_order); + base = PAGE_ALLOC_COSTLY_ORDER + 1; + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return (MIGRATE_PCPTYPES * base) + migratetype; +} + +static inline int pindex_to_order(unsigned int pindex) +{ + int order = pindex / MIGRATE_PCPTYPES; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order > PAGE_ALLOC_COSTLY_ORDER) { + order = pageblock_order; + VM_BUG_ON(order != pageblock_order); + } +#else + VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); +#endif + + return order; +} + +static inline bool pcp_allowed_order(unsigned int order) +{ + if (order <= PAGE_ALLOC_COSTLY_ORDER) + return true; +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + if (order == pageblock_order) + return true; +#endif + return false; +} + /* * A cached value of the page's pageblock's migratetype, used when the page is * put on a pcplist. Used to avoid the pageblock migratetype lookup when @@ -693,11 +736,11 @@ static void bad_page(struct page *page, const char *reason) * The first tail page's ->compound_order holds the order of allocation. * This usage means that zero-order pages may not be compound. */ - +static void free_the_page(struct page *page, unsigned int order); void free_compound_page(struct page *page) { mem_cgroup_uncharge(page); - __free_pages_ok(page, compound_order(page), FPI_NONE); + free_the_page(page, compound_order(page)); }
void prep_compound_page(struct page *page, unsigned int order) @@ -1294,9 +1337,9 @@ __always_inline bool free_pages_prepare(struct page *page, * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when * moved from pcp lists to free lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, order, true); }
static bool bulkfree_pcp_prepare(struct page *page) @@ -1313,12 +1356,12 @@ static bool bulkfree_pcp_prepare(struct page *page) * debug_pagealloc enabled, they are checked also immediately when being freed * to the pcp lists. */ -static bool free_pcp_prepare(struct page *page) +static bool free_pcp_prepare(struct page *page, unsigned int order) { if (debug_pagealloc_enabled_static()) - return free_pages_prepare(page, 0, true); + return free_pages_prepare(page, order, true); else - return free_pages_prepare(page, 0, false); + return free_pages_prepare(page, order, false); }
static bool bulkfree_pcp_prepare(struct page *page) @@ -1350,8 +1393,10 @@ static inline void prefetch_buddy(struct page *page) static void free_pcppages_bulk(struct zone *zone, int count, struct per_cpu_pages *pcp) { - int migratetype = 0; + int pindex = 0; int batch_free = 0; + int nr_freed = 0; + unsigned int order; int prefetch_nr = READ_ONCE(pcp->batch); bool isolated_pageblocks; struct page *page, *tmp; @@ -1362,7 +1407,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, * below while (list_empty(list)) loop. */ count = min(pcp->count, count); - while (count) { + while (count > 0) { struct list_head *list;
/* @@ -1374,24 +1419,31 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ do { batch_free++; - if (++migratetype == MIGRATE_PCPTYPES) - migratetype = 0; - list = &pcp->lists[migratetype]; + if (++pindex == NR_PCP_LISTS) + pindex = 0; + list = &pcp->lists[pindex]; } while (list_empty(list));
/* This is the only non-empty list. Free them all. */ - if (batch_free == MIGRATE_PCPTYPES) + if (batch_free == NR_PCP_LISTS) batch_free = count;
+ order = pindex_to_order(pindex); + BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH)); do { page = list_last_entry(list, struct page, lru); /* must delete to avoid corrupting pcp list */ list_del(&page->lru); - pcp->count--; + nr_freed += 1 << order; + count -= 1 << order;
if (bulkfree_pcp_prepare(page)) continue;
+ /* Encode order with the migratetype */ + page->index <<= NR_PCP_ORDER_WIDTH; + page->index |= order; + list_add_tail(&page->lru, &head);
/* @@ -1407,8 +1459,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, prefetch_buddy(page); prefetch_nr--; } - } while (--count && --batch_free && !list_empty(list)); + } while (count > 0 && --batch_free && !list_empty(list)); } + pcp->count -= nr_freed;
spin_lock(&zone->lock); isolated_pageblocks = has_isolate_pageblock(zone); @@ -1419,14 +1472,19 @@ static void free_pcppages_bulk(struct zone *zone, int count, */ list_for_each_entry_safe(page, tmp, &head, lru) { int mt = get_pcppage_migratetype(page); + + /* mt has been encoded with the order (see above) */ + order = mt & NR_PCP_ORDER_MASK; + mt >>= NR_PCP_ORDER_WIDTH; + /* MIGRATE_ISOLATE page should not go to pcplists */ VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); /* Pageblock could have been isolated meanwhile */ if (unlikely(isolated_pageblocks)) mt = get_pageblock_migratetype(page);
- __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, order, mt); } spin_unlock(&zone->lock); } @@ -3182,11 +3240,12 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn, + unsigned int order) { int migratetype;
- if (!free_pcp_prepare(page)) + if (!free_pcp_prepare(page, order)) return false;
migratetype = get_pfnblock_migratetype(page, pfn); @@ -3194,11 +3253,13 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; }
-static void free_unref_page_commit(struct page *page, unsigned long pfn) +static void free_unref_page_commit(struct page *page, unsigned long pfn, + unsigned int order) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; int migratetype; + int pindex;
migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); @@ -3220,16 +3281,17 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) }
pcp = &this_cpu_ptr(zone->pageset)->pcp; - list_add(&page->lru, &pcp->lists[migratetype]); - pcp->count++; + pindex = order_to_pindex(migratetype, order); + list_add(&page->lru, &pcp->lists[pindex]); + pcp->count += 1 << order; if (pcp->count >= READ_ONCE(pcp->high)) free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); }
/* - * Free a 0-order page + * Free a pcp page */ -void free_unref_page(struct page *page) +void free_unref_page(struct page *page, unsigned int order) { unsigned long flags; unsigned long pfn = page_to_pfn(page); @@ -3238,11 +3300,11 @@ void free_unref_page(struct page *page) if (free_page_to_dhugetlb_pool(page)) return;
- if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, order)) return;
local_irq_save(flags); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, order); local_irq_restore(flags); }
@@ -3261,7 +3323,7 @@ void free_unref_page_list(struct list_head *list) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn, 0)) list_del(&page->lru); set_page_private(page, pfn); } @@ -3272,7 +3334,7 @@ void free_unref_page_list(struct list_head *list)
set_page_private(page, 0); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + free_unref_page_commit(page, pfn, 0);
/* * Guard against excessive IRQ disabled times when we get @@ -3407,7 +3469,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */ static inline -struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, +struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order, + int migratetype, unsigned int alloc_flags, struct per_cpu_pages *pcp, struct list_head *list) @@ -3416,16 +3479,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
do { if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - READ_ONCE(pcp->batch), list, + int batch = READ_ONCE(pcp->batch); + int alloced; + + /* + * Scale batch relative to order if batch implies + * free pages can be stored on the PCP. Batch can + * be 1 for small zones or for boot pagesets which + * should never store free pages as the pages may + * belong to arbitrary zones. + */ + if (batch > 1) + batch = max(batch >> order, 2); + alloced = rmqueue_bulk(zone, order, + batch, list, migratetype, alloc_flags); + + pcp->count += alloced << order; if (unlikely(list_empty(list))) return NULL; }
page = list_first_entry(list, struct page, lru); list_del(&page->lru); - pcp->count--; + pcp->count -= 1 << order; } while (check_new_pcp(page));
return page; @@ -3433,8 +3510,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, - struct zone *zone, gfp_t gfp_flags, - int migratetype, unsigned int alloc_flags) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct per_cpu_pages *pcp; struct list_head *list; @@ -3443,8 +3521,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); + list = &pcp->lists[order_to_pindex(migratetype, order)]; + page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); @@ -3465,15 +3543,15 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page;
- if (likely(order == 0)) { + if (likely(pcp_allowed_order(order))) { /* * MIGRATE_MOVABLE pcplist could have the pages on CMA area and * we need to skip it when CMA area isn't allowed. */ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || migratetype != MIGRATE_MOVABLE) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, - migratetype, alloc_flags); + page = rmqueue_pcplist(preferred_zone, zone, order, + gfp_flags, migratetype, alloc_flags); goto out; } } @@ -5185,7 +5263,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, /* Attempt the batch allocation */ local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; - pcp_list = &pcp->lists[ac.migratetype]; + pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
while (nr_populated < nr_pages) {
@@ -5195,7 +5273,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, continue; }
- page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags, + page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags, pcp, pcp_list); if (unlikely(!page)) { /* Try and get at least one page */ @@ -5454,8 +5532,8 @@ EXPORT_SYMBOL(get_zeroed_page);
static inline void free_the_page(struct page *page, unsigned int order) { - if (order == 0) /* Via pcp? */ - free_unref_page(page); + if (pcp_allowed_order(order)) /* Via pcp? */ + free_unref_page(page, order); else __free_pages_ok(page, order, FPI_NONE); } @@ -6873,13 +6951,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; - int migratetype; + int pindex;
memset(p, 0, sizeof(*p));
pcp = &p->pcp; - for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) - INIT_LIST_HEAD(&pcp->lists[migratetype]); + for (pindex = 0; pindex < NR_PCP_LISTS; pindex++) + INIT_LIST_HEAD(&pcp->lists[pindex]);
/* * Set batch and high values safe for a boot pageset. A true percpu diff --git a/mm/swap.c b/mm/swap.c index c37fac5a73e8..d33ec3bb0564 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -95,7 +95,7 @@ static void __put_single_page(struct page *page) { __page_cache_release(page); mem_cgroup_uncharge(page); - free_unref_page(page); + free_unref_page(page, 0); }
static void __put_compound_page(struct page *page)