hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9Q9DF
CVE: NA
-------------------------------------------------
Storing high-order pages on the PCP list is not always a win, so it is disabled by default for high orders, except for PMD_ORDER.
Add a new control, pcp_allow_high_order, which lets the user enable or disable storing pages of the specified high order (only order 4 for now) on the PCP list. Note that all pages on the pcplists are drained when it is disabled.
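For reference, the allocator-side check after this change is expected to reduce to roughly the sketch below; the lines outside the diff hunks are paraphrased from the existing pcp_allowed_order() and may differ slightly in this tree:

  static inline bool pcp_allowed_order(unsigned int order)
  {
          /* low orders are always cached on the per-cpu lists */
          if (order <= PAGE_ALLOC_COSTLY_ORDER)
                  return true;
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
          /* PMD-sized pages were already allowed */
          if (order == HPAGE_PMD_ORDER)
                  return true;
          /* new: the user-selected high order (only order 4 for now) */
          if (order == READ_ONCE(huge_pcp_allow_orders))
                  return true;
  #endif
          return false;
  }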
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
---
 Documentation/admin-guide/mm/transhuge.rst |  9 +++++++
 include/linux/gfp.h                        |  1 +
 include/linux/huge_mm.h                    |  1 +
 mm/huge_memory.c                           | 31 ++++++++++++++++++++++
 mm/page_alloc.c                            | 18 ++++++++++++-
 5 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
index e52cd57bb512..0dc0b4dab621 100644
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -220,6 +220,15 @@ writing the corresponding bit to 1::
 	echo 0x2 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
 	echo 0x3 >/sys/kernel/mm/transparent_hugepage/thp_mapping_align
 
+The kernel can allow high-order pages (greater than PAGE_ALLOC_COSTLY_ORDER
+and other than PMD order, which is always allowed; only order 4 is supported
+for now) to be stored on PCP lists, which can reduce zone lock contention
+when high-order pages are allocated frequently. Enable storing order-4 pages
+on PCP lists by writing 4, or disable it again by writing 0::
+
+	echo 0 >/sys/kernel/mm/transparent_hugepage/pcp_allow_high_order
+	echo 4 >/sys/kernel/mm/transparent_hugepage/pcp_allow_high_order
+
 khugepaged will be automatically started when one or more hugepage sizes are
 enabled (either by directly setting "always" or "madvise", or by setting
 "inherit" while the top-level enabled is set to "always"
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b18b7e3758be..b2d4f45a866b 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -335,6 +335,7 @@ extern void page_frag_free(void *addr);
 void page_alloc_init_cpuhp(void);
 int decay_pcp_high(struct zone *zone, struct per_cpu_pages *pcp);
+void drain_all_zone_pages(void);
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp);
 void drain_all_pages(struct zone *zone);
 void drain_local_pages(struct zone *zone);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 8fdf17e80359..056f6918eeed 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -104,6 +104,7 @@ extern unsigned long transparent_hugepage_flags;
 extern unsigned long huge_anon_orders_always;
 extern unsigned long huge_anon_orders_madvise;
 extern unsigned long huge_anon_orders_inherit;
+extern unsigned long huge_pcp_allow_orders;
 
 static inline bool hugepage_global_enabled(void)
 {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9f1100dfee66..508155fe9830 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -74,6 +74,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
 unsigned long huge_anon_orders_madvise __read_mostly;
 unsigned long huge_anon_orders_inherit __read_mostly;
+unsigned long huge_pcp_allow_orders __read_mostly;
 
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
					 unsigned long vm_flags, bool smaps,
@@ -417,6 +418,35 @@ static ssize_t use_zero_page_store(struct kobject *kobj,
 }
 static struct kobj_attribute use_zero_page_attr = __ATTR_RW(use_zero_page);
 
+static ssize_t pcp_allow_high_order_show(struct kobject *kobj,
+					 struct kobj_attribute *attr, char *buf)
+{
+	return sysfs_emit(buf, "%lu\n", READ_ONCE(huge_pcp_allow_orders));
+}
+
+static ssize_t pcp_allow_high_order_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long value;
+	int ret;
+
+	ret = kstrtoul(buf, 10, &value);
+	if (ret < 0)
+		return ret;
+
+	/* Only enable order 4 now, 0 is to disable it */
+	if (value != 0 && value != (PAGE_ALLOC_COSTLY_ORDER + 1))
+		return -EINVAL;
+
+	if (value == 0)
+		drain_all_zone_pages();
+
+	WRITE_ONCE(huge_pcp_allow_orders, value);
+
+	return count;
+}
+static struct kobj_attribute pcp_allow_high_order_attr =
+	__ATTR_RW(pcp_allow_high_order);
+
 static ssize_t hpage_pmd_size_show(struct kobject *kobj,
				    struct kobj_attribute *attr, char *buf)
 {
@@ -531,6 +561,7 @@ static struct attribute *hugepage_attr[] = {
 	&enabled_attr.attr,
 	&defrag_attr.attr,
 	&use_zero_page_attr.attr,
+	&pcp_allow_high_order_attr.attr,
 	&hpage_pmd_size_attr.attr,
 #ifdef CONFIG_SHMEM
 	&shmem_enabled_attr.attr,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4652dc453964..f225f412e71d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -528,7 +528,7 @@ static void bad_page(struct page *page, const char *reason)
 static inline unsigned int order_to_pindex(int migratetype, int order)
 {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order > PAGE_ALLOC_COSTLY_ORDER) {
+	if (order > PAGE_ALLOC_COSTLY_ORDER + 1) {
 		VM_BUG_ON(order != HPAGE_PMD_ORDER);
 		return NR_LOWORDER_PCP_LISTS;
 	}
@@ -560,6 +560,8 @@ static inline bool pcp_allowed_order(unsigned int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order == HPAGE_PMD_ORDER)
 		return true;
+	if (order == READ_ONCE(huge_pcp_allow_orders))
+		return true;
 #endif
 	return false;
 }
@@ -6829,6 +6831,20 @@ void zone_pcp_reset(struct zone *zone)
 	}
 }
 
+void drain_all_zone_pages(void)
+{
+	struct zone *zone;
+
+	mutex_lock(&pcp_batch_high_lock);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, 0, 0, 1);
+	__drain_all_pages(NULL, true);
+	for_each_populated_zone(zone)
+		__zone_set_pageset_high_and_batch(zone, zone->pageset_high_min,
+				zone->pageset_high_max, zone->pageset_batch);
+	mutex_unlock(&pcp_batch_high_lock);
+}
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be in a single zone, must not contain holes,
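Not part of the patch itself: a minimal userspace sketch of how the new knob could be exercised from a test program, assuming only the pcp_allow_high_order sysfs path documented above (error handling kept to the essentials):

  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  /* Write "4" to enable order-4 PCP caching, "0" to disable it (pcplists drained). */
  static int set_pcp_allow_high_order(const char *val)
  {
          const char *path = "/sys/kernel/mm/transparent_hugepage/pcp_allow_high_order";
          int fd = open(path, O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return -1;
          }
          if (write(fd, val, strlen(val)) < 0) {
                  perror("write");
                  close(fd);
                  return -1;
          }
          return close(fd);
  }

  int main(void)
  {
          if (set_pcp_allow_high_order("4"))      /* enable order 4 */
                  return 1;
          /* ... run the high-order allocation workload of interest ... */
          return set_pcp_allow_high_order("0") ? 1 : 0;   /* disable and drain */
  }

The same effect can be had from a shell with the echo commands shown in the documentation hunk.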