From: Huang Ying <ying.huang@intel.com>
mainline inclusion
from mainline-v6.7-rc1
commit c0a242394cb980bd00e1f61dc8aacb453d2bbe6a
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I8JXIR
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When a task is allocating a large number of order-0 pages, it may acquire the zone->lock multiple times, allocating pages in batches. This may unnecessarily contend on the zone lock when allocating a very large number of pages. This patch adapts the size of the batch to the recent allocation pattern, scaling up the batch size for subsequent allocations.
On a 2-socket Intel server with 224 logical CPUs, we run 8 kbuild instances in parallel (each with `make -j 28`) in 8 cgroups. This simulates the kbuild server used by the 0-Day kbuild service. With the patch, the cycles% of spinlock contention (mostly on the zone lock) decreases from 12.6% to 11.0% (with PCP size == 367).
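To illustrate the resulting dynamic, the scaling can be modeled in user space as below. This is a minimal sketch, not kernel code: pcp_model, model_nr_alloc() and the chosen constants (high == 1024, batch == 63, PCP_BATCH_SCALE_MAX == 5) are illustrative stand-ins for the pcp fields used by nr_pcp_alloc() in the patch.

	/* User-space model of the batch scaling; illustrative only. */
	#include <stdio.h>

	#define PCP_BATCH_SCALE_MAX	5

	struct pcp_model {
		int high;			/* high watermark */
		int batch;			/* base batch size */
		int count;			/* pages on the PCP list */
		unsigned int alloc_factor;	/* batch scaling factor */
	};

	static int max_int(int a, int b) { return a > b ? a : b; }
	static int min_int(int a, int b) { return a < b ? a : b; }

	/* Mirrors the order-0 branch of nr_pcp_alloc() in the patch. */
	static int model_nr_alloc(struct pcp_model *pcp)
	{
		int max_nr_alloc = max_int(pcp->high - pcp->count - pcp->batch,
					   pcp->batch);
		int batch = pcp->batch << pcp->alloc_factor;

		if (batch <= max_nr_alloc &&
		    pcp->alloc_factor < PCP_BATCH_SCALE_MAX)
			pcp->alloc_factor++;
		return min_int(batch, max_nr_alloc);
	}

	int main(void)
	{
		/* count held at 0: each refill happens on an empty list */
		struct pcp_model pcp = { .high = 1024, .batch = 63 };
		int i;

		/* consecutive order-0 refills, no intervening free */
		for (i = 0; i < 6; i++)
			printf("refill %d: %d pages\n", i,
			       model_nr_alloc(&pcp));

		/* a free halves alloc_factor, shrinking the next refill */
		pcp.alloc_factor >>= 1;
		printf("after a free: %d pages\n", model_nr_alloc(&pcp));
		return 0;
	}

With these numbers, consecutive order-0 refills request 63, 126, 252, 504 and then 961 pages (clamped by max_nr_alloc); a single free halves alloc_factor, so the next refill drops back to 252.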
Link: https://lkml.kernel.org/r/20231016053002.756205-6-ying.huang@intel.com
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Suggested-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: David Hildenbrand <david@redhat.com>
Cc: Johannes Weiner <jweiner@redhat.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Pavel Tatashin <pasha.tatashin@soleen.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Christoph Lameter <cl@linux.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ze Zuo <zuoze1@huawei.com>
---
 include/linux/mmzone.h |  3 ++-
 mm/page_alloc.c        | 53 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 45 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cdff247e8c6f..ba548ae20686 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -697,9 +697,10 @@ struct per_cpu_pages {
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
 	u8 flags;		/* protected by pcp->lock */
+	u8 alloc_factor;	/* batch scaling factor during allocate */
 	u8 free_factor;		/* batch scaling factor during free */
 #ifdef CONFIG_NUMA
-	short expire;		/* When 0, remote pagesets are drained */
+	u8 expire;		/* When 0, remote pagesets are drained */
 #endif
 
 	/* Lists of pages, one per migrate type stored on the pcp-lists */

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 49486ed22cf8..769874713837 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2373,6 +2373,12 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	int pindex;
 	bool free_high = false;
 
+	/*
+	 * On freeing, reduce the number of pages that are batch allocated.
+	 * See nr_pcp_alloc() where alloc_factor is increased for subsequent
+	 * allocations.
+	 */
+	pcp->alloc_factor >>= 1;
 	__count_vm_events(PGFREE, 1 << order);
 	pindex = order_to_pindex(migratetype, order);
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
@@ -2679,6 +2685,42 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
 	return page;
 }
 
+static int nr_pcp_alloc(struct per_cpu_pages *pcp, int order)
+{
+	int high, batch, max_nr_alloc;
+
+	high = READ_ONCE(pcp->high);
+	batch = READ_ONCE(pcp->batch);
+
+	/* Check for PCP disabled or boot pageset */
+	if (unlikely(high < batch))
+		return 1;
+
+	/*
+	 * Double the number of pages allocated each time there is subsequent
+	 * allocation of order-0 pages without any freeing.
+	 */
+	if (!order) {
+		max_nr_alloc = max(high - pcp->count - batch, batch);
+		batch <<= pcp->alloc_factor;
+		if (batch <= max_nr_alloc &&
+		    pcp->alloc_factor < CONFIG_PCP_BATCH_SCALE_MAX)
+			pcp->alloc_factor++;
+		batch = min(batch, max_nr_alloc);
+	}
+
+	/*
+	 * Scale batch relative to order if batch implies free pages
+	 * can be stored on the PCP. Batch can be 1 for small zones or
+	 * for boot pagesets which should never store free pages as
+	 * the pages may belong to arbitrary zones.
+	 */
+	if (batch > 1)
+		batch = max(batch >> order, 2);
+
+	return batch;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -2691,18 +2733,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 	do {
 		if (list_empty(list)) {
-			int batch = READ_ONCE(pcp->batch);
+			int batch = nr_pcp_alloc(pcp, order);
 			int alloced;
 
-			/*
-			 * Scale batch relative to order if batch implies
-			 * free pages can be stored on the PCP. Batch can
-			 * be 1 for small zones or for boot pagesets which
-			 * should never store free pages as the pages may
-			 * belong to arbitrary zones.
-			 */
-			if (batch > 1)
-				batch = max(batch >> order, 2);
 			alloced = rmqueue_bulk(zone, order, batch,
 					       list, migratetype, alloc_flags);
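
As a worked example of the final clamp (unchanged logic, only moved from __rmqueue_pcplist() into nr_pcp_alloc()): with batch == 63, an order-3 refill requests max(63 >> 3, 2) == 7 compound pages, while a boot pageset with batch == 1 takes the early "high < batch" exit and keeps allocating one page at a time, so no free pages are stored on the PCP.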