The first patch is a bugfix picked from the next branch. The last two patches are a feature that allows more high-order pages to be stored on PCP lists.
Baolin Wang (1):
  mm: page_alloc: use the correct THP order for THP PCP

Kefeng Wang (2):
  mm: prepare more high-order pages to be stored on the per-cpu lists
  mm: add control to allow specified high-order pages stored on PCP list
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 include/linux/mmzone.h                     |  4 +-
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 26 +++++++++---
 6 files changed, 84 insertions(+), 6 deletions(-)
From: Baolin Wang <baolin.wang@linux.alibaba.com>
next inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") extends the PCP allocator to store THP pages, and it determines whether to cache THP pages in PCP by comparing with pageblock_order. But the pageblock_order is not always equal to THP order. It might also be MAX_PAGE_ORDER, which could prevent PCP from caching THP pages.
Therefore, using HPAGE_PMD_ORDER instead to determine the need for caching THP for PCP will fix this issue
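For context, a minimal sketch (not the verbatim upstream definitions; the exact guards vary by kernel version and config) of why the two orders can differ, paraphrasing include/linux/pageblock-flags.h:

	/* Paraphrased sketch of the pageblock_order definition. */
	#ifdef CONFIG_HUGETLB_PAGE
	#define pageblock_order	HUGETLB_PAGE_ORDER	/* typically HPAGE_PMD_ORDER */
	#else
	#define pageblock_order	MAX_PAGE_ORDER		/* e.g. 10 */
	#endif

	/*
	 * With 4K pages, HPAGE_PMD_ORDER = PMD_SHIFT - PAGE_SHIFT = 21 - 12 = 9,
	 * so on a !CONFIG_HUGETLB_PAGE kernel the old check
	 * "order == pageblock_order" compares 9 == 10 and fails, keeping THP
	 * pages off the PCP lists even though caching them was the intent.
	 */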
Link: https://lkml.kernel.org/r/a25c9e14cd03907d5978b60546a69e6aa3fc2a7d.171215183...
Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists")
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Barry Song <baohua@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
---
 mm/page_alloc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 95bd8f6f7889..4652dc453964 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -529,7 +529,7 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
-		VM_BUG_ON(order != pageblock_order);
+		VM_BUG_ON(order != HPAGE_PMD_ORDER);
 		return NR_LOWORDER_PCP_LISTS;
 	}
 #else
@@ -545,7 +545,7 @@ static inline int pindex_to_order(unsigned int pindex)
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (pindex == NR_LOWORDER_PCP_LISTS)
-		order = pageblock_order;
+		order = HPAGE_PMD_ORDER;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
 #endif
@@ -558,7 +558,7 @@ static inline bool pcp_allowed_order(unsigned int order)
 	if (order <= PAGE_ALLOC_COSTLY_ORDER)
 		return true;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order == pageblock_order)
+	if (order == HPAGE_PMD_ORDER)
 		return true;
 #endif
 	return false;
From: Kefeng Wang <wangkefeng.wang@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
--------------------------------
Both file pages and anonymous pages support large folios, so high-order pages other than HPAGE_PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) will be allocated frequently, which increases zone lock contention; allowing high-order pages on the PCP lists can alleviate that contention. In order to allow the high orders between PAGE_ALLOC_COSTLY_ORDER and HPAGE_PMD_ORDER to be stored on the per-cpu lists, similar to PMD_ORDER pages, add more lists to struct per_cpu_pages (one list per high order), and add a new PCP_MAX_ORDER to mmzone.h to be used instead of HPAGE_PMD_ORDER.
But as commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") pointed out, this may not be a win in all scenarios, so this patch does not yet allow the extra high-order pages to be added to the PCP lists; the next patch adds a control to enable or disable it.
The size of struct per_cpu_pages increases from 256 bytes (4 cache lines) to 320 bytes (5 cache lines) on arm64 with defconfig.
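To make the new index arithmetic concrete, here is a small standalone sketch (assumed values for a typical 4K-page config: MIGRATE_PCPTYPES = 3, PAGE_ALLOC_COSTLY_ORDER = 3, PCP_MAX_ORDER = 9) that mirrors the order_to_pindex()/pindex_to_order() round trip from the patch:

	#include <assert.h>
	#include <stdio.h>

	/* Assumed values, not taken from a real build. */
	#define MIGRATE_PCPTYPES		3
	#define PAGE_ALLOC_COSTLY_ORDER		3
	#define PCP_MAX_ORDER			9	/* PMD_SHIFT - PAGE_SHIFT */

	#define NR_LOWORDER_PCP_LISTS	(MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
	#define HIGHORDER_PCP_LIST_INDEX \
		(NR_LOWORDER_PCP_LISTS - (PAGE_ALLOC_COSTLY_ORDER + 1))

	int main(void)
	{
		int order, pindex;

		/* Each high order gets one list, placed after the low-order lists. */
		for (order = PAGE_ALLOC_COSTLY_ORDER + 1; order <= PCP_MAX_ORDER; order++) {
			pindex = order + HIGHORDER_PCP_LIST_INDEX;	/* order_to_pindex() */
			assert(pindex >= NR_LOWORDER_PCP_LISTS);
			assert(pindex - HIGHORDER_PCP_LIST_INDEX == order); /* pindex_to_order() */
			printf("order %d -> pindex %d\n", order, pindex);
		}
		return 0;
	}

With these values, orders 4..9 map to pindexes 12..17, i.e. one extra list per high order after the 12 low-order lists, matching NR_PCP_LISTS = 18.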
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 include/linux/mmzone.h |  4 +++-
 mm/page_alloc.c        | 10 +++++-----
 2 files changed, 8 insertions(+), 6 deletions(-)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index a394cbe31c59..b7b596059009 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -683,11 +683,13 @@ enum zone_watermarks {
  * failures.
  */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define NR_PCP_THP 1
+#define PCP_MAX_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#define NR_PCP_THP (PCP_MAX_ORDER - PAGE_ALLOC_COSTLY_ORDER)
 #else
 #define NR_PCP_THP 0
 #endif
 #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1))
+#define HIGHORDER_PCP_LIST_INDEX (NR_LOWORDER_PCP_LISTS - (PAGE_ALLOC_COSTLY_ORDER + 1))
 #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
 
 #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4652dc453964..6432f1bb4fbb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -529,8 +529,8 @@ static inline unsigned int order_to_pindex(int migratetype, int order)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
-		VM_BUG_ON(order != HPAGE_PMD_ORDER);
-		return NR_LOWORDER_PCP_LISTS;
+		VM_BUG_ON(order > PCP_MAX_ORDER);
+		return order + HIGHORDER_PCP_LIST_INDEX;
 	}
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -544,8 +544,8 @@ static inline int pindex_to_order(unsigned int pindex)
 	int order = pindex / MIGRATE_PCPTYPES;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (pindex == NR_LOWORDER_PCP_LISTS)
-		order = HPAGE_PMD_ORDER;
+	if (pindex >= NR_LOWORDER_PCP_LISTS)
+		order = pindex - HIGHORDER_PCP_LIST_INDEX;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
 #endif
@@ -558,7 +558,7 @@ static inline bool pcp_allowed_order(unsigned int order)
 	if (order <= PAGE_ALLOC_COSTLY_ORDER)
 		return true;
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order == HPAGE_PMD_ORDER)
+	if (order == PCP_MAX_ORDER)
 		return true;
 #endif
 	return false;
From: Kefeng Wang <wangkefeng.wang@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
--------------------------------
High-order pages stored on the PCP lists may not always be a win, and may even hurt some workloads, so they are disabled by default for all high orders except PMD_ORDER. Since there are already per-supported-THP-size interfaces to configure mTHP behaviour, add a new pcp_enabled control under those interfaces to let the user enable or disable storing the specified high-order pages on the PCP lists. It cannot change the existing behaviour for order == PMD_ORDER or order <= PAGE_ALLOC_COSTLY_ORDER: those are always enabled and cannot be disabled. When any other high order is disabled via pcp_enabled, the pcplists are drained.
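As a usage illustration (hypothetical session: the available hugepages-<size>kB directories depend on the page size and the supported mTHP sizes; with 4K pages, order 4 is 64kB and the PMD order is 2048kB):

	# toggle PCP caching for a non-PMD high order
	echo 1 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled
	echo 0 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled  # drains pcplists

	# PMD order and orders <= PAGE_ALLOC_COSTLY_ORDER are fixed,
	# so this write fails with -EINVAL
	echo 0 > /sys/kernel/mm/transparent_hugepage/hugepages-2048kB/pcp_enabled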
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 936da10c5260..f39c9ef30aa6 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -189,6 +189,17 @@ madvise
 never
 	should be self-explanatory.
 
+
+There's also a sysfs knob to control whether hugepages are stored on PCP
+lists for high orders (greater than PAGE_ALLOC_COSTLY_ORDER), which could
+reduce the zone lock contention when allocating high-order pages frequently.
+Please note that the PCP behavior of low-order and PMD-order pages cannot
+be changed; it is possible to enable other high-order pages to be stored
+on PCP lists by writing 1 or disable it back by writing 0::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+	echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
 By default kernel tries to use huge, PMD-mappable zero page on read
 page fault to anonymous mapping. It's possible to disable huge zero page
 by writing 0 or enable it back by writing 1::
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b18b7e3758be..b2d4f45a866b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -335,6 +335,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index abf2340a2d18..f09d2c23826a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -101,6 +101,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c61e7c7c2c1..0acb3b330045 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -540,8 +540,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr =
 	__ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
 
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%d\n",
+			  !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	unsigned long value;
+	int ret;
+
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > 1)
+		return -EINVAL;
+
+	if (value) {
+		set_bit(order, &huge_pcp_allow_orders);
+	} else {
+		if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+			drain_all_zone_pages();
+	}
+
+	return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+	0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_pcp_enabled_attr.attr,
 	NULL,
 };
 
@@ -600,6 +641,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	huge_pcp_allow_orders = BIT(PMD_ORDER);
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -627,6 +670,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 			err = PTR_ERR(thpsize);
 			goto remove_all;
 		}
+
+		if (order <= PAGE_ALLOC_COSTLY_ORDER)
+			huge_pcp_allow_orders |= BIT(order);
+
 		list_add(&thpsize->node, &thpsize_list);
 		order = next_order(&orders, order);
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6432f1bb4fbb..af9ca16c344c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -560,6 +560,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == PCP_MAX_ORDER)
 		return true;
+	if (BIT(order) & huge_pcp_allow_orders)
+		return true;
 #endif
 	return false;
 }
@@ -6829,6 +6831,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+						  zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,
FeedBack: The patch(es) which you have sent to the kernel@openeuler.org mailing list have been converted to a pull request successfully!
Pull request link: https://gitee.com/openeuler/kernel/pulls/7197
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/I...