From: Kefeng Wang <wangkefeng.wang@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9OCYO
--------------------------------
Storing high-order pages on the PCP list is not always a win and may even
hurt some workloads, so it is disabled by default for high orders other
than PMD_ORDER. Since there are already per-supported-THP-size interfaces
to configure mTHP behaviours, add a new control, pcp_enabled, under those
interfaces to let the user enable or disable storing pages of the
specified high order on the PCP list. It cannot change the existing
behaviour for order == PMD_ORDER and order <= PAGE_ALLOC_COSTLY_ORDER;
those are always enabled and cannot be disabled. When any other high
order is disabled via pcp_enabled, the pcplists are drained.
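For example, assuming a 4K base page size, where hugepages-64kB is an
order-4 mTHP size (above PAGE_ALLOC_COSTLY_ORDER and below PMD_ORDER),
the knob can be used as follows (illustrative usage only):

	# 0 by default for non-PMD orders above PAGE_ALLOC_COSTLY_ORDER
	cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled
	# allow 64K pages to be cached on PCP lists
	echo 1 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled
	# disable again; the pcplists are drained
	echo 0 > /sys/kernel/mm/transparent_hugepage/hugepages-64kB/pcp_enabled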
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 Documentation/admin-guide/mm/transhuge.rst | 11 +++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 47 ++++++++++++++++++++++
 mm/page_alloc.c                            | 16 ++++++++
 5 files changed, 76 insertions(+)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index 936da10c5260..f39c9ef30aa6 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -189,6 +189,17 @@ madvise
 never
 	should be self-explanatory.
+
+There's also a sysfs knob to control whether hugepages of a given high order
+(greater than PAGE_ALLOC_COSTLY_ORDER) may be stored on PCP lists, which can
+reduce zone lock contention when high-order pages are allocated frequently.
+Please note that the PCP behavior of low-order and PMD-order pages cannot be
+changed; for the other high orders, storing pages on PCP lists can be enabled
+by writing 1 or disabled again by writing 0::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+	echo 1 >/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/pcp_enabled
+
 By default kernel tries to use huge, PMD-mappable zero page on read page fault
 to anonymous mapping. It's possible to disable huge zero page by writing 0
 or enable it back by writing 1::
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b18b7e3758be..b2d4f45a866b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -335,6 +335,7 @@ extern void page_frag_free(void *addr);
 
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index abf2340a2d18..f09d2c23826a 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -101,6 +101,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0c61e7c7c2c1..0acb3b330045 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -540,8 +540,49 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
 static struct kobj_attribute thpsize_enabled_attr = __ATTR(enabled, 0644,
 	thpsize_enabled_show, thpsize_enabled_store);
 
+unsigned long huge_pcp_allow_orders __read_mostly;
+static ssize_t thpsize_pcp_enabled_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *buf)
+{
+	int order = to_thpsize(kobj)->order;
+
+	return sysfs_emit(buf, "%d\n",
+			  !!test_bit(order, &huge_pcp_allow_orders));
+}
+
+static ssize_t thpsize_pcp_enabled_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	int order = to_thpsize(kobj)->order;
+	unsigned long value;
+	int ret;
+
+	if (order <= PAGE_ALLOC_COSTLY_ORDER || order == PMD_ORDER)
+		return -EINVAL;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+	if (value > 1)
+		return -EINVAL;
+
+	if (value) {
+		set_bit(order, &huge_pcp_allow_orders);
+	} else {
+		if (test_and_clear_bit(order, &huge_pcp_allow_orders))
+			drain_all_zone_pages();
+	}
+
+	return count;
+}
+
+static struct kobj_attribute thpsize_pcp_enabled_attr = __ATTR(pcp_enabled,
+	0644, thpsize_pcp_enabled_show, thpsize_pcp_enabled_store);
+
 static struct attribute *thpsize_attrs[] = {
 	&thpsize_enabled_attr.attr,
+	&thpsize_pcp_enabled_attr.attr,
 	NULL,
 };
 
@@ -600,6 +641,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 	 */
 	huge_anon_orders_inherit = BIT(PMD_ORDER);
 
+	huge_pcp_allow_orders = BIT(PMD_ORDER);
+
 	*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
 	if (unlikely(!*hugepage_kobj)) {
 		pr_err("failed to create transparent hugepage kobject\n");
@@ -627,6 +670,10 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
 			err = PTR_ERR(thpsize);
 			goto remove_all;
 		}
+
+		if (order <= PAGE_ALLOC_COSTLY_ORDER)
+			huge_pcp_allow_orders |= BIT(order);
+
 		list_add(&thpsize->node, &thpsize_list);
 		order = next_order(&orders, order);
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6432f1bb4fbb..af9ca16c344c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -560,6 +560,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == PCP_MAX_ORDER)
 		return true;
+	if (BIT(order) & huge_pcp_allow_orders)
+		return true;
 #endif
 	return false;
 }
@@ -6829,6 +6831,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+				zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,