From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
There are several functions that will be used in subsequent patches for the dynamic hugetlb feature. Declare them.
No functional changes.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/hugetlb.h | 3 +++ include/linux/memcontrol.h | 16 ++++++++++++++++ include/linux/memory_hotplug.h | 6 ++++++ mm/hugetlb.c | 2 +- mm/internal.h | 3 +++ mm/memcontrol.c | 16 ---------------- mm/memory_hotplug.c | 3 +-- mm/page_alloc.c | 6 +++--- 8 files changed, 33 insertions(+), 22 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 61c38e6c6c43..a1135c43719e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -471,6 +471,9 @@ static inline struct hstate *hstate_inode(struct inode *i) { return HUGETLBFS_SB(i->i_sb)->hstate; } + +bool prep_compound_gigantic_page(struct page *page, unsigned int order); + #else /* !CONFIG_HUGETLBFS */
#define is_file_hugepages(file) false diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e5826f1ff337..2e0a480a8665 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1239,6 +1239,22 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned);
+/* + * Test whether @memcg has children, dead or alive. Note that this + * function doesn't care whether @memcg has use_hierarchy enabled and + * returns %true if there are child csses according to the cgroup + * hierarchy. Testing use_hierarchy is the caller's responsibility. + */ +static inline bool memcg_has_children(struct mem_cgroup *memcg) +{ + bool ret; + + rcu_read_lock(); + ret = css_next_child(NULL, &memcg->css); + rcu_read_unlock(); + return ret; +} + #else /* CONFIG_MEMCG */
#define MEM_CGROUP_ID_SHIFT 0 diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index c60bda5cbb17..b9aeabcce49a 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -284,6 +284,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
+extern int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn); extern void try_offline_node(int nid); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern int remove_memory(int nid, u64 start, u64 size); @@ -291,6 +292,11 @@ extern void __remove_memory(int nid, u64 start, u64 size); extern int offline_and_remove_memory(int nid, u64 start, u64 size);
#else +static inline int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +{ + return -ENOSYS; +} + static inline void try_offline_node(int nid) {}
static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 219bf083dc8a..fa3cba3571cc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1633,7 +1633,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) spin_unlock_irq(&hugetlb_lock); }
-static bool prep_compound_gigantic_page(struct page *page, unsigned int order) +bool prep_compound_gigantic_page(struct page *page, unsigned int order) { int i, j; int nr_pages = 1 << order; diff --git a/mm/internal.h b/mm/internal.h index db9546707695..31517354f3c7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -197,6 +197,9 @@ extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); +extern void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags); +extern bool free_pages_prepare(struct page *page, unsigned int order, bool check_free); extern int user_min_free_kbytes;
extern void zone_pcp_update(struct zone *zone); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8acf7ff56294..011aff396af2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3402,22 +3402,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, return nr_reclaimed; }
-/* - * Test whether @memcg has children, dead or alive. Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. - */ -static inline bool memcg_has_children(struct mem_cgroup *memcg) -{ - bool ret; - - rcu_read_lock(); - ret = css_next_child(NULL, &memcg->css); - rcu_read_unlock(); - return ret; -} - /* * Reclaims as many pages from the given memcg as possible. * diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1549b19b36f6..73ea92dae74a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1165,8 +1165,7 @@ static int scan_movable_pages(unsigned long start, unsigned long end, return 0; }
-static int -do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) +int do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) { unsigned long pfn; struct page *page, *head; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 988051bf6795..0ff4f4e3a538 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1203,7 +1203,7 @@ static void kernel_init_free_pages(struct page *page, int numpages) kasan_enable_current(); }
-static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { int bad = 0; @@ -2283,8 +2283,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); }
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned int alloc_flags) +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { post_alloc_hook(page, order, gfp_flags);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
In subsequent patches, struct hugetlbfs_inode_info will be used to check whether a hugetlbfs file has memory in the hpool, so add a struct hugetlbfs_inode_info parameter to the related functions, including hugetlb_acct_memory/hugepage_subpool_get_pages/hugepage_subpool_put_pages.
No functional changes.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/hugetlb.c | 50 +++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 23 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fa3cba3571cc..1528a12ab3a9 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -86,7 +86,7 @@ int sysctl_hugetlb_mig_noalloc; int sysctl_hugetlb_pmem_allocall;
/* Forward declaration */ -static int hugetlb_acct_memory(struct hstate *h, long delta); +static int hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *info);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, unsigned long irq_flags) @@ -101,7 +101,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool, if (free) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, - -spool->min_hpages); + -spool->min_hpages, NULL); kfree(spool); } } @@ -121,7 +121,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, spool->hstate = h; spool->min_hpages = min_hpages;
- if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) { kfree(spool); return NULL; } @@ -149,7 +149,7 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct hugetlbfs_inode_info *info) { long ret = delta;
@@ -194,7 +194,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct hugetlbfs_inode_info *info) { long ret = delta; unsigned long flags; @@ -742,11 +742,11 @@ void hugetlb_fix_reserve_counts(struct inode *inode) long rsv_adjust; bool reserved = false;
- rsv_adjust = hugepage_subpool_get_pages(spool, 1); + rsv_adjust = hugepage_subpool_get_pages(spool, 1, HUGETLBFS_I(inode)); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode);
- if (!hugetlb_acct_memory(h, 1)) + if (!hugetlb_acct_memory(h, 1, HUGETLBFS_I(inode))) reserved = true; } else if (!rsv_adjust) { reserved = true; @@ -1589,7 +1589,7 @@ void free_huge_page(struct page *page) * after page is free. Therefore, force restore_reserve * operation. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) + if (hugepage_subpool_put_pages(spool, 1, NULL) == 0) restore_reserve = true; }
@@ -2465,6 +2465,7 @@ static void restore_reserve_on_error(struct hstate *h, struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hugepage_subpool *spool = subpool_vma(vma); struct hstate *h = hstate_vma(vma); struct page *page; @@ -2492,7 +2493,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * checked against any subpool limit. */ if (map_chg || avoid_reserve) { - gbl_chg = hugepage_subpool_get_pages(spool, 1); + gbl_chg = hugepage_subpool_get_pages(spool, 1, info); if (gbl_chg < 0) { vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); @@ -2570,8 +2571,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, info); + hugetlb_acct_memory(h, -rsv_adjust, info); if (deferred_reserve) hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), pages_per_huge_page(h), page); @@ -2586,7 +2587,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, h_cg); out_subpool_put: if (map_chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); + hugepage_subpool_put_pages(spool, 1, info); vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3891,7 +3892,7 @@ unsigned long hugetlb_total_pages(void) return nr_total_pages; }
-static int hugetlb_acct_memory(struct hstate *h, long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *info) { int ret = -ENOMEM;
@@ -3958,6 +3959,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
static void hugetlb_vm_op_close(struct vm_area_struct *vma) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(file_inode(vma->vm_file)); struct hstate *h = hstate_vma(vma); struct resv_map *resv = vma_resv_map(vma); struct hugepage_subpool *spool = subpool_vma(vma); @@ -3977,8 +3979,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ - gbl_reserve = hugepage_subpool_put_pages(spool, reserve); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, reserve, info); + hugetlb_acct_memory(h, -gbl_reserve, info); }
kref_put(&resv->refs, resv_map_release); @@ -5424,6 +5426,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct resv_map *resv_map; struct hugetlb_cgroup *h_cg = NULL; long gbl_reserve, regions_needed = 0; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
/* This should never happen */ if (from > to) { @@ -5492,7 +5495,7 @@ int hugetlb_reserve_pages(struct inode *inode, * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ - gbl_reserve = hugepage_subpool_get_pages(spool, chg); + gbl_reserve = hugepage_subpool_get_pages(spool, chg, info); if (gbl_reserve < 0) { ret = -ENOSPC; goto out_uncharge_cgroup; @@ -5502,7 +5505,7 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, gbl_reserve); + ret = hugetlb_acct_memory(h, gbl_reserve, info); if (ret < 0) { goto out_put_pages; } @@ -5522,7 +5525,7 @@ int hugetlb_reserve_pages(struct inode *inode, add = region_add(resv_map, from, to, regions_needed, h, h_cg);
if (unlikely(add < 0)) { - hugetlb_acct_memory(h, -gbl_reserve); + hugetlb_acct_memory(h, -gbl_reserve, info); ret = add; goto out_put_pages; } else if (unlikely(chg > add)) { @@ -5544,8 +5547,8 @@ int hugetlb_reserve_pages(struct inode *inode, (chg - add) * pages_per_huge_page(h), h_cg);
rsv_adjust = hugepage_subpool_put_pages(spool, - chg - add); - hugetlb_acct_memory(h, -rsv_adjust); + chg - add, info); + hugetlb_acct_memory(h, -rsv_adjust, info); } else if (h_cg) { /* * The file_regions will hold their own reference to @@ -5559,7 +5562,7 @@ int hugetlb_reserve_pages(struct inode *inode, return 0; out_put_pages: /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + (void)hugepage_subpool_put_pages(spool, chg, info); out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); @@ -5583,6 +5586,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
/* * Since this routine can be called in the evict inode path for all @@ -5607,8 +5611,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. */ - gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), info); + hugetlb_acct_memory(h, -gbl_reserve, info);
return 0; }
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Dynamic hugetlb is a self-developed feature built on top of hugetlb and memcontrol. It supports splitting huge pages dynamically within a memory cgroup. A new structure, dhugetlb_pool, is added to every mem_cgroup to manage the pages assigned to that mem_cgroup. For a mem_cgroup configured with a dhugetlb_pool, processes in the mem_cgroup will preferentially use the pages in the dhugetlb_pool.
Dynamic hugetlb supports three page sizes: 1G and 2M huge pages and 4K pages. For a mem_cgroup configured with a dhugetlb_pool, processes may allocate 1G/2M huge pages only from the dhugetlb_pool. There is no such constraint for 4K pages: if there are not enough 4K pages in the dhugetlb_pool, pages can also be allocated from the buddy system. So before using dynamic hugetlb, users must know how many huge pages they need.
Usage:
1. Add 'dynamic_hugetlb=on' to the kernel cmdline to enable the dynamic hugetlb feature.
2. Preallocate some 1G hugepages through hugetlb.
3. Create a mem_cgroup and configure a dhugetlb_pool for the mem_cgroup.
4. Configure the count of 1G/2M hugepages; the remaining pages in the dhugetlb_pool will be used as basic (4K) pages.
5. Bind a process to the mem_cgroup; its memory will then be allocated from the dhugetlb_pool.
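A minimal example of these steps (the paths and values are illustrative, assuming a cgroup v1 memory hierarchy mounted at /sys/fs/cgroup/memory and 1G pages taken from node 0; the 'dhugetlb.nr_pages' value format of "<nid> <nr of 1G pages>" follows the parser in write_hugepage_to_hpool):

    # 1. boot with dynamic_hugetlb=on on the kernel command line
    # 2. preallocate some 1G hugepages
    echo 10 > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
    # 3. create a mem_cgroup and move ten of those 1G pages from node 0 into its pool
    mkdir /sys/fs/cgroup/memory/dpool
    echo "0 10" > /sys/fs/cgroup/memory/dpool/dhugetlb.nr_pages
    # 4. configure the 1G/2M counts via the dhugetlb.*.reserved_pages interfaces
    #    added later in this series
    # 5. bind a task to the mem_cgroup; its pages now come from the pool
    echo $$ > /sys/fs/cgroup/memory/dpool/cgroup.procs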
This patch adds the corresponding structure dhugetlb_pool for the dynamic hugetlb feature, the mem_cgroup interface 'dhugetlb.nr_pages' to configure the dhugetlb_pool, and the cmdline option 'dynamic_hugetlb=on' to enable the feature.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/Kconfig | 10 + include/linux/dynamic_hugetlb.h | 106 +++++++++ include/linux/memcontrol.h | 5 +- kernel/cgroup/cgroup.c | 11 + mm/Makefile | 1 + mm/dynamic_hugetlb.c | 375 ++++++++++++++++++++++++++++++++ mm/hugetlb.c | 3 + mm/memcontrol.c | 10 + 8 files changed, 520 insertions(+), 1 deletion(-) create mode 100644 include/linux/dynamic_hugetlb.h create mode 100644 mm/dynamic_hugetlb.c
diff --git a/fs/Kconfig b/fs/Kconfig index 3cc647e00f3c..20bd86b65dcc 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -251,6 +251,16 @@ config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON to enable freeing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off.
+config DYNAMIC_HUGETLB + bool "Dynamic HugeTLB" + depends on X86_64 + depends on HUGETLBFS + depends on MEMCG && CGROUP_HUGETLB + help + Dynamic hugepages are used in a memcg and can be split into small + pages automatically. The tasks in the memcg prefer to allocate + dynamic hugepages. + config MEMFD_CREATE def_bool TMPFS || HUGETLBFS
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h new file mode 100644 index 000000000000..30ccbd9f1853 --- /dev/null +++ b/include/linux/dynamic_hugetlb.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef __LINUX_DYNAMIC_HUGETLB_H +#define __LINUX_DYNAMIC_HUGETLB_H + +#include <linux/hugetlb.h> +#include <linux/memcontrol.h> + +#ifdef CONFIG_DYNAMIC_HUGETLB + +extern struct static_key_false dhugetlb_enabled_key; +#define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key)) + +#define NR_PERCPU_POOL num_possible_cpus() +#define PERCPU_POOL_PAGE_MAX 1024 +#define PERCPU_POOL_PAGE_BATCH (PERCPU_POOL_PAGE_MAX >> 2) + +struct split_hugepage { + struct list_head head_pages; + unsigned long start_pfn; +}; + +struct percpu_pages_pool { + spinlock_t lock; + unsigned long free_pages; + long used_pages; + struct list_head head_page; +}; + +struct huge_pages_pool { + /* + * This four counts is used for huge page allocation. + */ + unsigned long nr_huge_pages; + unsigned long free_huge_pages; + unsigned long resv_huge_pages; + unsigned long used_huge_pages; + /* + * free_normal_pages means how many huge pages can be split to + * smaller pages or reserved for huge page allocation. + */ + unsigned long free_normal_pages; + /* + * split_normal_pages means how many huge pages have already been + * split. + */ + unsigned long split_normal_pages; + struct list_head hugepage_freelists; + /* Used to record which hugepages have been split */ + struct list_head hugepage_splitlists; +}; + +enum huge_pages_pool_type { + HUGE_PAGES_POOL_1G, + HUGE_PAGES_POOL_2M, + HUGE_PAGES_POOL_4K, + HUGE_PAGES_POOL_MAX, +}; +/* + * Dynamic hugetlb pool data structure. Each Dynamic hugetlb pool is + * associated with one memory cgroup and controls the allocation of memory + * resources for both processes and files which belongs to the memory cgroup. + */ +struct dhugetlb_pool { + int nid; + spinlock_t lock; + spinlock_t reserved_lock; + atomic_t refcnt; + unsigned long normal_pages_disabled; + + struct mem_cgroup *attach_memcg; + + unsigned long total_huge_pages; + struct huge_pages_pool hpages_pool[HUGE_PAGES_POOL_MAX]; + struct percpu_pages_pool percpu_pool[0]; +}; + +bool dhugetlb_hide_files(struct cftype *cft); +ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int hugetlb_pool_info_show(struct seq_file *m, void *v); +void hugetlb_pool_inherit(struct mem_cgroup *memcg, struct mem_cgroup *parent); +int hugetlb_pool_destroy(struct cgroup *cgrp); +void __init dynamic_hugetlb_init(void); + +#else + +#define dhugetlb_enabled 0 + +struct dhugetlb_pool {}; + +static inline bool dhugetlb_hide_files(struct cftype *cft) +{ + return false; +} +static inline void hugetlb_pool_inherit(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ +} +static inline int hugetlb_pool_destroy(struct cgroup *cgrp) +{ + return 0; +} +static inline void __init dynamic_hugetlb_init(void) +{ +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ +#endif /* __LINUX_DYNAMIC_HUGETLB_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2e0a480a8665..7cc7cfe55d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -22,6 +22,7 @@ #include <linux/writeback.h> #include <linux/page-flags.h> #include <linux/kabi.h> +#include <linux/dynamic_hugetlb.h>
struct mem_cgroup; struct obj_cgroup; @@ -370,6 +371,9 @@ struct mem_cgroup { struct deferred_split deferred_split_queue; #endif
+#ifdef CONFIG_DYNAMIC_HUGETLB + struct dhugetlb_pool *hpool; +#endif KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -1238,7 +1242,6 @@ void split_page_memcg(struct page *head, unsigned int nr); unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); - /* * Test whether @memcg has children, dead or alive. Note that this * function doesn't care whether @memcg has use_hierarchy enabled and diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 8ef51ddfb301..59cc82ef52a6 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -57,6 +57,7 @@ #include <linux/fs_parser.h> #include <linux/sched/cputime.h> #include <linux/psi.h> +#include <linux/dynamic_hugetlb.h> #include <net/sock.h>
#define CREATE_TRACE_POINTS @@ -4009,6 +4010,9 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, continue; if ((cft->flags & CFTYPE_DEBUG) && !cgroup_debug) continue; + /* if dynamic hugetlb is not enabled, hide the interfaces */ + if (dhugetlb_hide_files(cft)) + continue; if (is_add) { ret = cgroup_add_file(css, cgrp, cft); if (ret) { @@ -5609,6 +5613,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (css_has_online_children(&cgrp->self)) return -EBUSY;
+ /* + * If dynamic hugetlb is enabled, make sure dhugtlb_pool is free + * before removing the corresponding memory cgroup. + */ + if (hugetlb_pool_destroy(cgrp)) + return -EBUSY; + /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling diff --git a/mm/Makefile b/mm/Makefile index ec3d0ab14a6a..f3dce99ee62f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -72,6 +72,7 @@ obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP) += hugetlb_vmemmap.o +obj-$(CONFIG_DYNAMIC_HUGETLB) += dynamic_hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c new file mode 100644 index 000000000000..8881e9e1a032 --- /dev/null +++ b/mm/dynamic_hugetlb.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * dynamic hugetlb core file + */ + +#include <linux/dynamic_hugetlb.h> + +static bool enable_dhugetlb = false; +DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key); + +#define hugepage_index(pfn) ((pfn) >> (PUD_SHIFT - PAGE_SHIFT)) + +static bool get_hpool_unless_zero(struct dhugetlb_pool *hpool) +{ + if (!dhugetlb_enabled || !hpool) + return false; + return atomic_inc_not_zero(&hpool->refcnt); +} + +static void put_hpool(struct dhugetlb_pool *hpool) +{ + if (!dhugetlb_enabled || !hpool) + return; + if (atomic_dec_and_test(&hpool->refcnt)) { + css_put(&hpool->attach_memcg->css); + kfree(hpool); + } +} + +struct dhugetlb_pagelist { + unsigned long count; + struct dhugetlb_pool *hpool[0]; +}; + +static struct dhugetlb_pagelist *dhugetlb_pagelist_t; +static DEFINE_RWLOCK(dhugetlb_pagelist_rwlock); + +static int set_hpool_in_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool) +{ + /* + * There is not conflit when write to dhugetlb_pagelist_t->hpool, so just + * need read_lock here. + */ + read_lock(&dhugetlb_pagelist_rwlock); + + /* + * If page's pfn is greater than dhugetlb_pagelist_t->count (which may + * occurs due to memory hotplug) then dhugetlb_pagelist_t need to be + * reallocated, so need write_lock here. 
+ */ + if (idx >= dhugetlb_pagelist_t->count) { + unsigned long size; + struct dhugetlb_pagelist *tmp; + + read_unlock(&dhugetlb_pagelist_rwlock); + write_lock(&dhugetlb_pagelist_rwlock); + + size = sizeof(struct dhugetlb_pagelist) + + (idx + 1) * sizeof(struct dhugetlb_pool *); + tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC); + if (!tmp) { + write_unlock(&dhugetlb_pagelist_rwlock); + return -ENOMEM; + } + tmp->count = idx + 1; + dhugetlb_pagelist_t = tmp; + + write_unlock(&dhugetlb_pagelist_rwlock); + read_lock(&dhugetlb_pagelist_rwlock); + } + dhugetlb_pagelist_t->hpool[idx] = hpool; + read_unlock(&dhugetlb_pagelist_rwlock); + + return 0; +} + +static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, + unsigned long nid, unsigned long nr_pages) +{ + struct hstate *h = size_to_hstate(PUD_SIZE); + struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G]; + struct page *page, *next; + unsigned long count = 0, idx; + int ret = 0; + + if (!h) + return -ENOMEM; + + spin_lock(&hpool->lock); + spin_lock(&hugetlb_lock); + if (h->free_huge_pages_node[nid] - h->resv_huge_pages_node[nid] < nr_pages) { + ret = -ENOMEM; + goto out_unlock; + } + + list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) { + idx = hugepage_index(page_to_pfn(page)); + ret = set_hpool_in_dhugetlb_pagelist(idx, hpool); + if (ret) + continue; + + list_move_tail(&page->lru, &hpages_pool->hugepage_freelists); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + hpool->total_huge_pages++; + hpages_pool->free_normal_pages++; + + if (++count == nr_pages) + break; + } + +out_unlock: + spin_unlock(&hugetlb_lock); + spin_unlock(&hpool->lock); + return ret; +} + +static int free_hugepage_to_hugetlb(struct dhugetlb_pool *hpool) +{ + struct hstate *h = size_to_hstate(PUD_SIZE); + struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G]; + struct page *page, *next, *p; + unsigned long pfn, idx; + unsigned int nr_pages; + int nid, ret = 0; + + spin_lock(&hpool->lock); + spin_lock(&hugetlb_lock); + list_for_each_entry_safe(page, next, &hpages_pool->hugepage_freelists, lru) { + nr_pages = 1 << huge_page_order(h); + pfn = page_to_pfn(page); + for (; nr_pages--; pfn++) { + p = pfn_to_page(pfn); + p->mapping = NULL; + } + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + + nid = page_to_nid(page); + list_move(&page->lru, &h->hugepage_freelists[nid]); + hpool->total_huge_pages--; + hpages_pool->free_normal_pages--; + h->free_huge_pages++; + h->free_huge_pages_node[nid]++; + + idx = hugepage_index(page_to_pfn(page)); + ret = set_hpool_in_dhugetlb_pagelist(idx, NULL); + if (ret) + break; + } + spin_unlock(&hugetlb_lock); + spin_unlock(&hpool->lock); + return ret; +} + +void hugetlb_pool_inherit(struct mem_cgroup *memcg, struct mem_cgroup *parent) +{ + if (!dhugetlb_enabled || !memcg || !parent) + return; + memcg->hpool = parent->hpool; +} + +static int hugetlb_pool_create(struct mem_cgroup *memcg, unsigned long nid) +{ + struct dhugetlb_pool *hpool; + int i; + + if (memcg_has_children(memcg)) + return -EINVAL; + + hpool = kzalloc(sizeof(struct dhugetlb_pool) + + NR_PERCPU_POOL * sizeof(struct percpu_pages_pool), GFP_KERNEL); + if (!hpool) + return -ENOMEM; + + spin_lock_init(&hpool->lock); + spin_lock_init(&hpool->reserved_lock); + hpool->nid = nid; + atomic_set(&hpool->refcnt, 1); + + for (i = 0; i < HUGE_PAGES_POOL_MAX; i++) { + INIT_LIST_HEAD(&hpool->hpages_pool[i].hugepage_freelists); + INIT_LIST_HEAD(&hpool->hpages_pool[i].hugepage_splitlists); + } + for (i 
= 0; i < NR_PERCPU_POOL; i++) { + spin_lock_init(&hpool->percpu_pool[i].lock); + INIT_LIST_HEAD(&hpool->percpu_pool[i].head_page); + } + + hpool->attach_memcg = memcg; + css_get(&memcg->css); + memcg->hpool = hpool; + + return 0; +} + +int hugetlb_pool_destroy(struct cgroup *cgrp) +{ + struct cgroup_subsys_state *css = cgrp->subsys[memory_cgrp_id]; + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct dhugetlb_pool *hpool = memcg ? memcg->hpool : NULL; + int ret = 0; + + if (!dhugetlb_enabled) + return 0; + + if (!hpool || hpool->attach_memcg != memcg) + return 0; + + ret = free_hugepage_to_hugetlb(hpool); + memcg->hpool = NULL; + + put_hpool(hpool); + return ret; +} + +static int hugetlb_pool_update(struct mem_cgroup *memcg, + unsigned long nid, unsigned long size) +{ + struct dhugetlb_pool *hpool; + bool new_create = false; + int ret = -EINVAL; + +again: + hpool = memcg->hpool; + if (!hpool) { + ret = hugetlb_pool_create(memcg, nid); + if (ret) + return ret; + new_create = true; + goto again; + } + if (!get_hpool_unless_zero(hpool)) + return -EINVAL; + + if (hpool->attach_memcg != memcg || hpool->nid != nid) + goto out; + ret = alloc_hugepage_from_hugetlb(hpool, nid, size); + /* + * if create a new hpool here but alloc hugepages failed, + * destroy it directly here. + */ + if (ret && new_create) { + memcg->hpool = NULL; + put_hpool(hpool); + } +out: + put_hpool(hpool); + return ret; +} + +bool dhugetlb_hide_files(struct cftype *cft) +{ + if (!dhugetlb_enabled && strstr(cft->name, "dhugetlb")) + return true; + return false; +} + +ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long nid, size; + char *endp; + int ret; + + if (!dhugetlb_enabled || !memcg) + return -EINVAL; + + buf = strstrip(buf); + nid = memparse(buf, &endp); + if (*endp != ' ' || nid < 0 || nid >= MAX_NUMNODES) + return -EINVAL; + + buf = endp + 1; + size = memparse(buf, &endp); + if (*endp != '\0' || size == 0) + return -EINVAL; + + ret = hugetlb_pool_update(memcg, nid, size); + + return ret ? : nbytes; +} + +int hugetlb_pool_info_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct dhugetlb_pool *hpool = memcg ? 
memcg->hpool : NULL; + unsigned long free_pages; + long used_pages = 0; + int i; + + if (!dhugetlb_enabled) + return 0; + + if (!hpool) { + seq_printf(m, "Curent hierarchial have not memory pool.\n"); + return 0; + } + + if (!get_hpool_unless_zero(hpool)) + return 0; + + for (i = 0; i < NR_PERCPU_POOL; i++) + spin_lock(&hpool->percpu_pool[i].lock); + spin_lock(&hpool->lock); + + free_pages = hpool->hpages_pool[HUGE_PAGES_POOL_4K].free_normal_pages; + for (i = 0; i < NR_PERCPU_POOL; i++) { + free_pages += hpool->percpu_pool[i].free_pages; + used_pages += hpool->percpu_pool[i].used_pages; + } + + seq_printf(m, + "dhugetlb_total_pages %ld\n" + "1G_total_reserved_pages %ld\n" + "1G_free_reserved_pages %ld\n" + "1G_mmap_reserved_pages %ld\n" + "1G_used_pages %ld\n" + "2M_total_reserved_pages %ld\n" + "2M_free_reserved_pages %ld\n" + "2M_mmap_reserved_pages %ld\n" + "2M_used_pages %ld\n" + "1G_free_unreserved_pages %ld\n" + "2M_free_unreserved_pages %ld\n" + "4K_free_pages %ld\n" + "4K_used_pages %ld\n", + hpool->total_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_1G].nr_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_1G].free_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_1G].resv_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_1G].used_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_2M].nr_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_2M].free_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_2M].resv_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_2M].used_huge_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_1G].free_normal_pages, + hpool->hpages_pool[HUGE_PAGES_POOL_2M].free_normal_pages, + free_pages, + used_pages); + + spin_unlock(&hpool->lock); + for (i = NR_PERCPU_POOL - 1; i >= 0; i--) + spin_unlock(&hpool->percpu_pool[i].lock); + put_hpool(hpool); + return 0; +} + +#define DEFAULT_PAGELIST_COUNT 4096 +void __init dynamic_hugetlb_init(void) +{ + unsigned long count, size; + + if (!enable_dhugetlb) + return; + + count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); + size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); + dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); + if (!dhugetlb_pagelist_t) { + pr_info("Dynamic hugetlb init failed, need %lu memory\n", size); + return; + } + + dhugetlb_pagelist_t->count = count; + static_branch_enable(&dhugetlb_enabled_key); + pr_info("Dynamic hugetlb is enabled\n"); +} + +static int __init dynamic_hugetlb_setup(char *s) +{ + if (!strcmp(s, "on")) + enable_dhugetlb = true; + return 1; +} +__setup("dynamic_hugetlb=", dynamic_hugetlb_setup); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1528a12ab3a9..6049fd4a9050 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -43,6 +43,7 @@ #include <linux/node.h> #include <linux/userfaultfd_k.h> #include <linux/page_owner.h> +#include <linux/dynamic_hugetlb.h> #include "internal.h" #include "hugetlb_vmemmap.h"
@@ -3460,6 +3461,8 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init();
+ dynamic_hugetlb_init(); + #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 011aff396af2..1a292d54e7ad 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5195,6 +5195,14 @@ static struct cftype mem_cgroup_legacy_files[] = { .write_s64 = memcg_qos_write, }, #endif +#ifdef CONFIG_DYNAMIC_HUGETLB + { + .name = "dhugetlb.nr_pages", + .write = write_hugepage_to_hpool, + .seq_show = hugetlb_pool_info_show, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, +#endif #ifdef CONFIG_NUMA { .name = "numa_stat", @@ -5523,6 +5531,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; }
+ hugetlb_pool_inherit(memcg, parent); + error = memcg_online_kmem(memcg); if (error) goto fail;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
PG_pool is used to identify whether a page belongs to a dynamic hugetlb pool.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page-flags.h | 6 ++++++ include/trace/events/mmflags.h | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 18dbfa2a7c5f..b47a5514ebc8 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -139,6 +139,7 @@ enum pageflags { #ifdef CONFIG_64BIT PG_arch_2, #endif + PG_pool, /* Used to track page allocated from dynamic hugetlb pool */
/* Add reserved page flags for internal extension. For the new page * flags which backported from kernel upstream, please place them @@ -461,6 +462,11 @@ PAGEFLAG(Idle, idle, PF_ANY) */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND)
+/* + * PagePool() is used to track page allocated from hpool. + */ +PAGEFLAG(Pool, pool, PF_NO_TAIL) + /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 30700ccb1eea..1a2896fc039e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -108,7 +108,8 @@ {1UL << PG_mappedtodisk, "mappedtodisk" }, \ {1UL << PG_reclaim, "reclaim" }, \ {1UL << PG_swapbacked, "swapbacked" }, \ - {1UL << PG_unevictable, "unevictable" } \ + {1UL << PG_unevictable, "unevictable" }, \ + {1UL << PG_pool, "pool" } \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add two mem_cgroup interfaces to configure the counts of 1G and 2M hugepages in the dhugetlb_pool.
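For example, once a pool has been populated via 'dhugetlb.nr_pages', the split between page sizes might be tuned like this (the cgroup path and values are illustrative; each write sets the target number of reserved hugepages of that size, and whatever is left in the pool remains available as 4K pages):

    echo 2 > /sys/fs/cgroup/memory/dpool/dhugetlb.1G.reserved_pages
    echo 1024 > /sys/fs/cgroup/memory/dpool/dhugetlb.2M.reserved_pages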
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 4 +++ mm/dynamic_hugetlb.c | 56 +++++++++++++++++++++++++++++++++ mm/memcontrol.c | 10 ++++++ 3 files changed, 70 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 30ccbd9f1853..d0f6c1dd2361 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -75,6 +75,10 @@ struct dhugetlb_pool { };
bool dhugetlb_hide_files(struct cftype *cft); +ssize_t write_2M_reserved_pages(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +ssize_t write_1G_reserved_pages(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int hugetlb_pool_info_show(struct seq_file *m, void *v); diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 8881e9e1a032..d12e07ca90c2 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -255,6 +255,62 @@ bool dhugetlb_hide_files(struct cftype *cft) return false; }
+static ssize_t update_reserved_pages(struct mem_cgroup *memcg, char *buf, int hpages_pool_idx) +{ + struct dhugetlb_pool *hpool = memcg->hpool; + struct huge_pages_pool *hpages_pool; + unsigned long nr_pages; + unsigned long delta; + char *endp; + + if (!dhugetlb_enabled) + return -EINVAL; + + buf = strstrip(buf); + nr_pages = memparse(buf, &endp); + if (*endp != '\0') + return -EINVAL; + + if (!get_hpool_unless_zero(hpool)) + return -EINVAL; + + spin_lock(&hpool->reserved_lock); + spin_lock(&hpool->lock); + hpages_pool = &hpool->hpages_pool[hpages_pool_idx]; + if (nr_pages > hpages_pool->nr_huge_pages) { + delta = min(nr_pages - hpages_pool->nr_huge_pages, hpages_pool->free_normal_pages); + hpages_pool->nr_huge_pages += delta; + hpages_pool->free_huge_pages += delta; + hpages_pool->free_normal_pages -= delta; + } else { + delta = min(hpages_pool->nr_huge_pages - nr_pages, + hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages); + hpages_pool->nr_huge_pages -= delta; + hpages_pool->free_huge_pages -= delta; + hpages_pool->free_normal_pages += delta; + } + spin_unlock(&hpool->lock); + spin_unlock(&hpool->reserved_lock); + put_hpool(hpool); + return 0; +} + +ssize_t write_2M_reserved_pages(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return update_reserved_pages(memcg, buf, HUGE_PAGES_POOL_2M) ?: nbytes; +} + +ssize_t write_1G_reserved_pages(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + return update_reserved_pages(memcg, buf, HUGE_PAGES_POOL_1G) ?: nbytes; +} + ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1a292d54e7ad..6381de898f31 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5202,6 +5202,16 @@ static struct cftype mem_cgroup_legacy_files[] = { .seq_show = hugetlb_pool_info_show, .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, }, + { + .name = "dhugetlb.1G.reserved_pages", + .write = write_1G_reserved_pages, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.2M.reserved_pages", + .write = write_2M_reserved_pages, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, #endif #ifdef CONFIG_NUMA {
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Currently, dynamic hugetlb supports 1G/2M/4K pages. In the beginning, there are only 1G pages in the hpool. Add functions to split pages in the dhugetlb_pool: if 4K pages are insufficient, try to split 2M pages, and if 2M pages are insufficient, try to split 1G pages.
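The cascade can be pictured with the following self-contained user-space sketch (a model only: plain counters stand in for the real free lists, and the x86_64 geometry of 512 children per split is assumed from the PUD/PMD shifts):

    /* toy model of hpool_split_page(): if a level has nothing left to
     * split, refill it by splitting one page from the next larger level. */
    #include <stdio.h>

    enum { POOL_1G, POOL_2M, POOL_4K, POOL_MAX };

    static unsigned long free_normal[POOL_MAX] = { 4, 0, 0 }; /* four 1G pages */
    static const unsigned long children[] = { 512, 512 };     /* 1G->2M, 2M->4K */

    static int split_one(int idx)
    {
        if (idx < 0 || idx >= POOL_MAX - 1)
            return -1;
        /* nothing to split at this level: try the larger pool first */
        if (!free_normal[idx] && split_one(idx - 1))
            return -1;
        free_normal[idx]--;
        free_normal[idx + 1] += children[idx];
        return 0;
    }

    int main(void)
    {
        /* ask for 4K pages although only 1G pages exist: 1G -> 2M -> 4K */
        while (free_normal[POOL_4K] < 1000)
            if (split_one(POOL_2M))
                break;
        printf("1G:%lu 2M:%lu 4K:%lu\n", free_normal[POOL_1G],
               free_normal[POOL_2M], free_normal[POOL_4K]);
        return 0;
    }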
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 2 + mm/dynamic_hugetlb.c | 114 ++++++++++++++++++++++++++++++++ 2 files changed, 116 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index d0f6c1dd2361..2004f174f7d3 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -2,7 +2,9 @@ #ifndef __LINUX_DYNAMIC_HUGETLB_H #define __LINUX_DYNAMIC_HUGETLB_H
+#include <linux/page_counter.h> #include <linux/hugetlb.h> +#include <linux/hugetlb_cgroup.h> #include <linux/memcontrol.h>
#ifdef CONFIG_DYNAMIC_HUGETLB diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index d12e07ca90c2..2050f6093544 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -5,10 +5,119 @@
#include <linux/dynamic_hugetlb.h>
+#include "internal.h" + static bool enable_dhugetlb = false; DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);
#define hugepage_index(pfn) ((pfn) >> (PUD_SHIFT - PAGE_SHIFT)) +static void add_new_page_to_pool(struct dhugetlb_pool *hpool, struct page *page, int hpages_pool_idx) +{ + struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[hpages_pool_idx]; + + lockdep_assert_held(&hpool->lock); + VM_BUG_ON_PAGE(page_mapcount(page), page); + INIT_LIST_HEAD(&page->lru); + + switch (hpages_pool_idx) { + case HUGE_PAGES_POOL_1G: + prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_hugetlb_cgroup(page, NULL); + break; + case HUGE_PAGES_POOL_2M: + prep_compound_page(page, PMD_SHIFT - PAGE_SHIFT); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_hugetlb_cgroup(page, NULL); + break; + } + list_add_tail(&page->lru, &hpages_pool->hugepage_freelists); + hpages_pool->free_normal_pages++; +} + +static void __hpool_split_gigantic_page(struct dhugetlb_pool *hpool, struct page *page) +{ + int nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT); + int nr_blocks = 1 << (PMD_SHIFT - PAGE_SHIFT); + int i; + + lockdep_assert_held(&hpool->lock); + atomic_set(compound_mapcount_ptr(page), 0); + atomic_set(compound_pincount_ptr(page), 0); + + for (i = 1; i < nr_pages; i++) + clear_compound_head(&page[i]); + set_compound_order(page, 0); + page[1].compound_nr = 0; + __ClearPageHead(page); + + for (i = 0; i < nr_pages; i+= nr_blocks) + add_new_page_to_pool(hpool, &page[i], HUGE_PAGES_POOL_2M); +} + +static void __hpool_split_huge_page(struct dhugetlb_pool *hpool, struct page *page) +{ + int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + int i; + + lockdep_assert_held(&hpool->lock); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + set_compound_order(page, 0); + + __ClearPageHead(page); + for (i = 0; i < nr_pages; i++) { + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); + if (i != 0) { + page[i].mapping = NULL; + clear_compound_head(&page[i]); + } + add_new_page_to_pool(hpool, &page[i], HUGE_PAGES_POOL_4K); + } +} + +static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) +{ + struct huge_pages_pool *hpages_pool; + struct split_hugepage *split_page; + struct page *page; + + lockdep_assert_held(&hpool->lock); + + if (hpages_pool_idx < 0 || hpages_pool_idx >= HUGE_PAGES_POOL_MAX - 1) + return -EINVAL; + + hpages_pool = &hpool->hpages_pool[hpages_pool_idx]; + + /* If hpages_pool has no pages to split, try higher hpages_pool */ + if (!hpages_pool->free_normal_pages && + hpool_split_page(hpool, hpages_pool_idx - 1)) + return -ENOMEM; + + split_page = kzalloc(sizeof(struct split_hugepage), GFP_ATOMIC); + if (!split_page) + return -ENOMEM; + + page = list_entry(hpages_pool->hugepage_freelists.next, struct page, lru); + list_del(&page->lru); + hpages_pool->free_normal_pages--; + + split_page->start_pfn = page_to_pfn(page); + list_add(&split_page->head_pages, &hpages_pool->hugepage_splitlists); + hpages_pool->split_normal_pages++; + + switch (hpages_pool_idx) { + case HUGE_PAGES_POOL_1G: + __hpool_split_gigantic_page(hpool, page); + break; + case HUGE_PAGES_POOL_2M: + __hpool_split_huge_page(hpool, page); + break; + } + return 0; +}
static bool get_hpool_unless_zero(struct dhugetlb_pool *hpool) { @@ -278,6 +387,11 @@ static ssize_t update_reserved_pages(struct mem_cgroup *memcg, char *buf, int hp spin_lock(&hpool->lock); hpages_pool = &hpool->hpages_pool[hpages_pool_idx]; if (nr_pages > hpages_pool->nr_huge_pages) { + delta = nr_pages - hpages_pool->nr_huge_pages; + while (delta > hpages_pool->free_normal_pages) { + if (hpool_split_page(hpool, hpages_pool_idx - 1)) + break; + } delta = min(nr_pages - hpages_pool->nr_huge_pages, hpages_pool->free_normal_pages); hpages_pool->nr_huge_pages += delta; hpages_pool->free_huge_pages += delta;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
When destroying an hpool or allocating huge pages, pages that have been split may need to be merged back into huge pages. Add functions to merge pages in the dhugetlb_pool. The information about split huge pages is recorded in hugepage_splitlists, which can be traversed to merge huge pages.
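At its core, a split huge page can be merged back only when none of its constituent pages is still marked as in use (PagePool); a small user-space model of that check (free-list bookkeeping and per-cpu pool draining are not modeled):

    #include <stdbool.h>
    #include <stdio.h>

    #define SUBPAGES 512    /* 4K pages per 2M page on x86_64 */

    struct split_record {
        bool in_use[SUBPAGES];  /* stands in for PagePool() on each subpage */
    };

    /* return true if the split page can be merged back into one 2M page */
    static bool try_merge(const struct split_record *rec)
    {
        for (int i = 0; i < SUBPAGES; i++)
            if (rec->in_use[i])
                return false;   /* a subpage is still referenced */
        return true;            /* all free: remove from 4K pool, re-add as 2M */
    }

    int main(void)
    {
        struct split_record rec = { 0 };

        rec.in_use[7] = true;
        printf("busy subpage: merge %s\n", try_merge(&rec) ? "ok" : "deferred");
        rec.in_use[7] = false;
        printf("all free:     merge %s\n", try_merge(&rec) ? "ok" : "deferred");
        return 0;
    }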
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 127 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 2050f6093544..c03ea40ad7b7 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -119,6 +119,126 @@ static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) return 0; }
+static void reclaim_pages_from_percpu_pool(struct dhugetlb_pool *hpool, + struct percpu_pages_pool *percpu_pool, + unsigned long nr_pages) +{ + struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_4K]; + struct page *page, *next; + int i = 0; + + list_for_each_entry_safe(page, next, &percpu_pool->head_page, lru) { + list_del(&page->lru); + percpu_pool->free_pages--; + list_add(&page->lru, &hpages_pool->hugepage_freelists); + hpages_pool->free_normal_pages++; + if (++i == nr_pages) + break; + } +} + +static void clear_percpu_pools(struct dhugetlb_pool *hpool) +{ + struct percpu_pages_pool *percpu_pool; + int i; + + lockdep_assert_held(&hpool->lock); + + spin_unlock(&hpool->lock); + for (i = 0; i < NR_PERCPU_POOL; i++) + spin_lock(&hpool->percpu_pool[i].lock); + spin_lock(&hpool->lock); + for (i = 0; i < NR_PERCPU_POOL; i++) { + percpu_pool = &hpool->percpu_pool[i]; + reclaim_pages_from_percpu_pool(hpool, percpu_pool, percpu_pool->free_pages); + } + for (i = 0; i < NR_PERCPU_POOL; i++) + spin_unlock(&hpool->percpu_pool[i].lock); +} + +static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) +{ + struct huge_pages_pool *hpages_pool, *src_hpages_pool; + struct split_hugepage *split_page, *split_next; + unsigned long nr_pages, block_size; + struct page *page; + int i; + + lockdep_assert_held(&hpool->lock); + + if (hpages_pool_idx < 0 || hpages_pool_idx >= HUGE_PAGES_POOL_MAX - 1) + return -EINVAL; + + switch (hpages_pool_idx) { + case HUGE_PAGES_POOL_1G: + nr_pages = 1 << (PUD_SHIFT - PMD_SHIFT); + block_size = 1 << (PMD_SHIFT - PAGE_SHIFT); + break; + case HUGE_PAGES_POOL_2M: + nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + block_size = 1; + break; + } + + hpages_pool = &hpool->hpages_pool[hpages_pool_idx]; + src_hpages_pool = &hpool->hpages_pool[hpages_pool_idx + 1]; + if (!hpages_pool->split_normal_pages) + return -ENOMEM; + + list_for_each_entry_safe(split_page, split_next, &hpages_pool->hugepage_splitlists, head_pages) { + clear_percpu_pools(hpool); + page = pfn_to_page(split_page->start_pfn); + for (i = 0; i < nr_pages; i+= block_size) { + if (PagePool(&page[i])) + goto next; + } + list_del(&split_page->head_pages); + hpages_pool->split_normal_pages--; + kfree(split_page); + for (i = 0; i < nr_pages; i+= block_size) { + list_del(&page[i].lru); + src_hpages_pool->free_normal_pages--; + } + add_new_page_to_pool(hpool, page, hpages_pool_idx); + return 0; +next: + continue; + } + return -ENOMEM; +} + +static int hugetlb_pool_merge_all_pages(struct dhugetlb_pool *hpool) +{ + int ret = 0; + + spin_lock(&hpool->lock); + while (hpool->hpages_pool[HUGE_PAGES_POOL_2M].split_normal_pages) { + ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_2M); + if (ret) { + pr_err("dynamic_hugetlb: some 4K pages are still in use, delete memcg: %s failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + goto out; + } + } + while (hpool->hpages_pool[HUGE_PAGES_POOL_1G].split_normal_pages) { + ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_1G); + if (ret) { + pr_err("dynamic_hugetlb: some 2M pages are still in use, delete memcg: %s failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + goto out; + } + } + if (hpool->hpages_pool[HUGE_PAGES_POOL_1G].used_huge_pages) { + ret = -ENOMEM; + pr_err("dynamic_hugetlb: some 1G pages are still in use, delete memcg: %s failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + goto out; + } +out: + spin_unlock(&hpool->lock); + return ret; +} + static bool get_hpool_unless_zero(struct dhugetlb_pool *hpool) { if (!dhugetlb_enabled || 
!hpool) @@ -315,6 +435,9 @@ int hugetlb_pool_destroy(struct cgroup *cgrp) if (!hpool || hpool->attach_memcg != memcg) return 0;
+ ret = hugetlb_pool_merge_all_pages(hpool); + if (ret) + return -ENOMEM; ret = free_hugepage_to_hugetlb(hpool); memcg->hpool = NULL;
@@ -392,6 +515,10 @@ static ssize_t update_reserved_pages(struct mem_cgroup *memcg, char *buf, int hp if (hpool_split_page(hpool, hpages_pool_idx - 1)) break; } + while (delta > hpages_pool->free_normal_pages) { + if (hpool_merge_page(hpool, hpages_pool_idx)) + break; + } delta = min(nr_pages - hpages_pool->nr_huge_pages, hpages_pool->free_normal_pages); hpages_pool->nr_huge_pages += delta; hpages_pool->free_huge_pages += delta;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Sometimes a page merge may fail because some pages are still in use. Add a migration step to enhance the merge function. This relies on memory hot-remove code (do_migrate_range), so it only works when CONFIG_MEMORY_HOTREMOVE is selected.
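A sketch of the resulting retry loop (migration is modeled as always succeeding; in the real code do_migrate_range() can fail, which is why the loop is bounded by the same five retries used in the patch):

    #include <stdio.h>

    #define SUBPAGES 8              /* toy size instead of 512 */
    #define HPOOL_RECLAIM_RETRIES 5 /* same bound as in the patch */

    static int busy[SUBPAGES] = { 0, 1, 0, 1, 0, 0, 0, 0 };

    /* stand-in for do_migrate_range(): move the data away so the page frees up */
    static void migrate(int i) { busy[i] = 0; }

    static int merge_with_migration(void)
    {
        for (int attempt = 0; attempt < HPOOL_RECLAIM_RETRIES; attempt++) {
            int pending = 0;

            for (int i = 0; i < SUBPAGES; i++)
                if (busy[i]) {
                    migrate(i);
                    pending++;
                }
            if (!pending)
                return 0;   /* every subpage free: the merge can proceed */
        }
        return -1;          /* still busy after all retries: give up */
    }

    int main(void)
    {
        printf("merge %s\n", merge_with_migration() ? "failed" : "succeeded");
        return 0;
    }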
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 71 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 8 deletions(-)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index c03ea40ad7b7..2dd58cbee610 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -3,6 +3,9 @@ * dynamic hugetlb core file */
+#include <linux/rmap.h> +#include <linux/migrate.h> +#include <linux/memory_hotplug.h> #include <linux/dynamic_hugetlb.h>
#include "internal.h" @@ -156,13 +159,18 @@ static void clear_percpu_pools(struct dhugetlb_pool *hpool) spin_unlock(&hpool->percpu_pool[i].lock); }
-static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) +/* We only try 5 times to reclaim pages */ +#define HPOOL_RECLAIM_RETRIES 5 + +static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx, bool force_merge) { struct huge_pages_pool *hpages_pool, *src_hpages_pool; struct split_hugepage *split_page, *split_next; unsigned long nr_pages, block_size; - struct page *page; - int i; + struct page *page, *next; + bool need_migrate = false; + int i, try; + LIST_HEAD(wait_page_list);
lockdep_assert_held(&hpool->lock);
@@ -177,6 +185,7 @@ static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) case HUGE_PAGES_POOL_2M: nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); block_size = 1; + need_migrate |= force_merge; break; }
@@ -186,12 +195,20 @@ static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) return -ENOMEM;
list_for_each_entry_safe(split_page, split_next, &hpages_pool->hugepage_splitlists, head_pages) { + try = 0; + +merge: clear_percpu_pools(hpool); page = pfn_to_page(split_page->start_pfn); for (i = 0; i < nr_pages; i+= block_size) { - if (PagePool(&page[i])) - goto next; + if (PagePool(&page[i])) { + if (!need_migrate) + goto next; + else + goto migrate; + } } + list_del(&split_page->head_pages); hpages_pool->split_normal_pages--; kfree(split_page); @@ -203,6 +220,36 @@ static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) return 0; next: continue; +migrate: + if (try++ >= HPOOL_RECLAIM_RETRIES) + goto next; + + /* Isolate free page first. */ + INIT_LIST_HEAD(&wait_page_list); + for (i = 0; i < nr_pages; i+= block_size) { + if (!PagePool(&page[i])) { + list_move(&page[i].lru, &wait_page_list); + src_hpages_pool->free_normal_pages--; + } + } + + /* Unlock and try migration. */ + spin_unlock(&hpool->lock); + for (i = 0; i < nr_pages; i+= block_size) { + if (PagePool(&page[i])) + /* + * TODO: fatal migration failures should bail + * out + */ + do_migrate_range(page_to_pfn(&page[i]), page_to_pfn(&page[i]) + block_size); + } + spin_lock(&hpool->lock); + + list_for_each_entry_safe(page, next, &wait_page_list, lru) { + list_move_tail(&page->lru, &src_hpages_pool->hugepage_freelists); + src_hpages_pool->free_normal_pages++; + } + goto merge; } return -ENOMEM; } @@ -213,7 +260,7 @@ static int hugetlb_pool_merge_all_pages(struct dhugetlb_pool *hpool)
spin_lock(&hpool->lock); while (hpool->hpages_pool[HUGE_PAGES_POOL_2M].split_normal_pages) { - ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_2M); + ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_2M, true); if (ret) { pr_err("dynamic_hugetlb: some 4K pages are still in use, delete memcg: %s failed!\n", hpool->attach_memcg->css.cgroup->kn->name); @@ -221,7 +268,7 @@ static int hugetlb_pool_merge_all_pages(struct dhugetlb_pool *hpool) } } while (hpool->hpages_pool[HUGE_PAGES_POOL_1G].split_normal_pages) { - ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_1G); + ret = hpool_merge_page(hpool, HUGE_PAGES_POOL_1G, true); if (ret) { pr_err("dynamic_hugetlb: some 2M pages are still in use, delete memcg: %s failed!\n", hpool->attach_memcg->css.cgroup->kn->name); @@ -515,8 +562,16 @@ static ssize_t update_reserved_pages(struct mem_cgroup *memcg, char *buf, int hp if (hpool_split_page(hpool, hpages_pool_idx - 1)) break; } + /* + * First try to merge pages without migration, If this can not meet + * the requirements, then try to merge pages with migration. + */ + while (delta > hpages_pool->free_normal_pages) { + if (hpool_merge_page(hpool, hpages_pool_idx, false)) + break; + } while (delta > hpages_pool->free_normal_pages) { - if (hpool_merge_page(hpool, hpages_pool_idx)) + if (hpool_merge_page(hpool, hpages_pool_idx, true)) break; } delta = min(nr_pages - hpages_pool->nr_huge_pages, hpages_pool->free_normal_pages);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add functions to allocate pages from the dhugetlb_pool. When a process is bound to a mem_cgroup configured with a dhugetlb_pool, pages are allocated from the dhugetlb_pool first. If there is no page in the dhugetlb_pool, fall back to allocating pages from the buddy system.
Because the process allocates pages from the dhugetlb_pool of its mem_cgroup, the process is not allowed to migrate to another mem_cgroup.
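The policy can be summarized with this self-contained user-space sketch (simplified: the GFP_HIGHUSER_MOVABLE check is reduced to one boolean, and refilling the per-cpu pool by splitting 2M pages is not modeled):

    #include <stdbool.h>
    #include <stdio.h>

    struct pool { unsigned long free_4k; };    /* per-memcg dhugetlb pool */

    /* order-0 user-movable allocations come from the pool when one is
     * attached and has free pages; everything else goes to the buddy. */
    static const char *alloc_user_page(struct pool *hpool, unsigned int order,
                                       bool user_movable)
    {
        if (hpool && order == 0 && user_movable && hpool->free_4k) {
            hpool->free_4k--;
            return "dhugetlb pool";
        }
        return "buddy system";
    }

    int main(void)
    {
        struct pool hpool = { .free_4k = 1 };

        printf("%s\n", alloc_user_page(&hpool, 0, true)); /* from the pool    */
        printf("%s\n", alloc_user_page(&hpool, 0, true)); /* pool empty       */
        printf("%s\n", alloc_user_page(NULL,   0, true)); /* no pool attached */
        return 0;
    }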
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 15 ++++ mm/dynamic_hugetlb.c | 135 ++++++++++++++++++++++++++++++++ mm/memcontrol.c | 4 + mm/page_alloc.c | 6 ++ 4 files changed, 160 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 2004f174f7d3..9af8ed6ec96b 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -88,6 +88,10 @@ void hugetlb_pool_inherit(struct mem_cgroup *memcg, struct mem_cgroup *parent); int hugetlb_pool_destroy(struct cgroup *cgrp); void __init dynamic_hugetlb_init(void);
+struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order, + unsigned int flags); +int task_has_mem_in_hpool(struct task_struct *tsk); + #else
#define dhugetlb_enabled 0 @@ -108,5 +112,16 @@ static inline int hugetlb_pool_destroy(struct cgroup *cgrp) static inline void __init dynamic_hugetlb_init(void) { } + +static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order, + unsigned int flags) +{ + return NULL; +} +static inline int task_has_mem_in_hpool(struct task_struct *tsk) +{ + return 0; +} + #endif /* CONFIG_DYNAMIC_HUGETLB */ #endif /* __LINUX_DYNAMIC_HUGETLB_H */ diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 2dd58cbee610..c54c99627994 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -122,6 +122,34 @@ static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) return 0; }
+static int add_pages_to_percpu_pool(struct dhugetlb_pool *hpool, + struct percpu_pages_pool *percpu_pool, + unsigned long nr_pages) +{ + struct huge_pages_pool *hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_4K]; + struct page *page, *next; + int ret, i = 0; + + while (hpages_pool->free_normal_pages < nr_pages) { + ret = hpool_split_page(hpool, HUGE_PAGES_POOL_2M); + if (ret) + break; + } + + list_for_each_entry_safe(page, next, &hpages_pool->hugepage_freelists, lru) { + list_del(&page->lru); + hpages_pool->free_normal_pages--; + list_add_tail(&page->lru, &percpu_pool->head_page); + percpu_pool->free_pages++; + if (++i == nr_pages) + break; + } + + if (percpu_pool->free_pages == 0) + return -ENOMEM; + return 0; +} + static void reclaim_pages_from_percpu_pool(struct dhugetlb_pool *hpool, struct percpu_pages_pool *percpu_pool, unsigned long nr_pages) @@ -350,6 +378,113 @@ static int set_hpool_in_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_poo return 0; }
+static struct dhugetlb_pool *find_hpool_by_task(struct task_struct *tsk) +{ + struct mem_cgroup *memcg; + + if (!dhugetlb_enabled) + return NULL; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(tsk); + rcu_read_unlock(); + + if (!memcg) + return NULL; + + return memcg->hpool; +} + +int task_has_mem_in_hpool(struct task_struct *tsk) +{ + struct dhugetlb_pool *hpool; + + if (!dhugetlb_enabled) + return 0; + + hpool = find_hpool_by_task(tsk); + + return hpool ? -EPERM : 0; +} + +static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask) +{ + gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE; + + if (current->flags & PF_KTHREAD) + return false; + + /* + * The cgroup only charges anonymous and file pages from usespage. + * some filesystem maybe has masked out the __GFP_IO | __GFP_FS + * to avoid recursive memory request. eg: loop device, xfs. + */ + if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE) + return false; + + return true; +} + +static struct page *__alloc_page_from_dhugetlb_pool(void) +{ + struct percpu_pages_pool *percpu_pool; + struct dhugetlb_pool *hpool; + struct page *page = NULL; + unsigned long flags; + + hpool = find_hpool_by_task(current); + + if (!get_hpool_unless_zero(hpool)) + return NULL; + + percpu_pool = &hpool->percpu_pool[smp_processor_id()]; + /* + * Before we lock percpu_pool, must be sure hpool is unlocked. + */ + spin_lock_irqsave(&percpu_pool->lock, flags); + + if (percpu_pool->free_pages == 0) { + int ret; + + spin_lock(&hpool->lock); + ret = add_pages_to_percpu_pool(hpool, percpu_pool, + PERCPU_POOL_PAGE_BATCH); + spin_unlock(&hpool->lock); + if (ret) + goto unlock; + } + + page = list_entry(percpu_pool->head_page.next, struct page, lru); + list_del(&page->lru); + percpu_pool->free_pages--; + percpu_pool->used_pages++; + SetPagePool(page); + +unlock: + spin_unlock_irqrestore(&percpu_pool->lock, flags); + put_hpool(hpool); + return page; +} + +struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order, + unsigned int flags) +{ + struct page *page = NULL; + + if (!dhugetlb_enabled) + return NULL; + + if (order != 0) + return NULL; + + if (should_allocate_from_dhugetlb_pool(gfp)) + page = __alloc_page_from_dhugetlb_pool(); + + if (page) + prep_new_page(page, order, gfp, flags); + return page; +} + static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, unsigned long nid, unsigned long nr_pages) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6381de898f31..f06c7349d9a4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6254,6 +6254,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) if (!p) return 0;
+ ret = task_has_mem_in_hpool(p); + if (ret) + return ret; + /* * We are now commited to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0ff4f4e3a538..403898a3ab1e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include <linux/khugepaged.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> +#include <linux/dynamic_hugetlb.h>
#include <asm/sections.h> #include <asm/tlbflush.h> @@ -5160,6 +5161,11 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, */ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
+ /* Before alloc from buddy system, alloc from hpool firstly */ + page = alloc_page_from_dhugetlb_pool(alloc_gfp, order, alloc_flags); + if (page) + goto out; + /* First allocation attempt */ page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac); if (likely(page))
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add a function to free pages back to the dhugetlb_pool.
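A condensed sketch of the free path added here (names follow this patch series; locking and error handling are simplified): pages handed out from an hpool carry the PagePool flag, so the generic free path can divert them back to the per-CPU pool and return a batch to the hpool once the cache exceeds PERCPU_POOL_PAGE_MAX.

    /* Condensed sketch of __free_page_to_dhugetlb_pool() from the diff below. */
    static void sketch_free_to_percpu_pool(struct dhugetlb_pool *hpool, struct page *page)
    {
        struct percpu_pages_pool *percpu_pool;
        unsigned long flags;

        percpu_pool = &hpool->percpu_pool[smp_processor_id()];
        spin_lock_irqsave(&percpu_pool->lock, flags);

        /* The page leaves the pool's "used" accounting and joins the per-CPU cache. */
        ClearPagePool(page);
        list_add(&page->lru, &percpu_pool->head_page);
        percpu_pool->free_pages++;
        percpu_pool->used_pages--;

        /* Give a batch back to the hpool once the per-CPU cache grows too large. */
        if (percpu_pool->free_pages > PERCPU_POOL_PAGE_MAX) {
            spin_lock(&hpool->lock);
            reclaim_pages_from_percpu_pool(hpool, percpu_pool, PERCPU_POOL_PAGE_BATCH);
            spin_unlock(&hpool->lock);
        }

        spin_unlock_irqrestore(&percpu_pool->lock, flags);
    }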
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 9 +++++ mm/dynamic_hugetlb.c | 68 +++++++++++++++++++++++++++++++++ mm/page_alloc.c | 7 ++++ 3 files changed, 84 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 9af8ed6ec96b..ebb5a36a4a26 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -90,6 +90,8 @@ void __init dynamic_hugetlb_init(void);
struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order, unsigned int flags); +bool free_page_to_dhugetlb_pool(struct page *page); +void free_page_list_to_dhugetlb_pool(struct list_head *list); int task_has_mem_in_hpool(struct task_struct *tsk);
#else @@ -118,6 +120,13 @@ static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int { return NULL; } +static inline bool free_page_to_dhugetlb_pool(struct page *page) +{ + return false; +} +static inline void free_page_list_to_dhugetlb_pool(struct list_head *list) +{ +} static inline int task_has_mem_in_hpool(struct task_struct *tsk) { return 0; diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index c54c99627994..40f79f3e6aeb 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -378,6 +378,18 @@ static int set_hpool_in_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_poo return 0; }
+static struct dhugetlb_pool *find_hpool_by_dhugetlb_pagelist(struct page *page) +{ + unsigned long idx = hugepage_index(page_to_pfn(page)); + struct dhugetlb_pool *hpool = NULL; + + read_lock(&dhugetlb_pagelist_rwlock); + if (idx < dhugetlb_pagelist_t->count) + hpool = dhugetlb_pagelist_t->hpool[idx]; + read_unlock(&dhugetlb_pagelist_rwlock); + return hpool; +} + static struct dhugetlb_pool *find_hpool_by_task(struct task_struct *tsk) { struct mem_cgroup *memcg; @@ -485,6 +497,62 @@ struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp, unsigned int order, return page; }
+static void __free_page_to_dhugetlb_pool(struct page *page) +{ + struct percpu_pages_pool *percpu_pool; + struct dhugetlb_pool *hpool; + unsigned long flags; + + hpool = find_hpool_by_dhugetlb_pagelist(page); + + if (!get_hpool_unless_zero(hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + percpu_pool = &hpool->percpu_pool[smp_processor_id()]; + spin_lock_irqsave(&percpu_pool->lock, flags); + + ClearPagePool(page); + list_add(&page->lru, &percpu_pool->head_page); + percpu_pool->free_pages++; + percpu_pool->used_pages--; + if (percpu_pool->free_pages > PERCPU_POOL_PAGE_MAX) { + spin_lock(&hpool->lock); + reclaim_pages_from_percpu_pool(hpool, percpu_pool, PERCPU_POOL_PAGE_BATCH); + spin_unlock(&hpool->lock); + } + + spin_unlock_irqrestore(&percpu_pool->lock, flags); + put_hpool(hpool); +} + +bool free_page_to_dhugetlb_pool(struct page *page) +{ + if (!dhugetlb_enabled || !PagePool(page)) + return false; + + if (free_pages_prepare(page, 0, true)) + __free_page_to_dhugetlb_pool(page); + return true; +} + +void free_page_list_to_dhugetlb_pool(struct list_head *list) +{ + struct page *page, *next; + + if (!dhugetlb_enabled) + return; + + list_for_each_entry_safe(page, next, list, lru) { + if (PagePool(page)) { + list_del(&page->lru); + if (free_pages_prepare(page, 0, true)) + __free_page_to_dhugetlb_pool(page); + } + } +} + static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, unsigned long nid, unsigned long nr_pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 403898a3ab1e..e078e3acb3de 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3230,6 +3230,10 @@ void free_unref_page(struct page *page) unsigned long flags; unsigned long pfn = page_to_pfn(page);
+ /* Free dynamic hugetlb page */ + if (free_page_to_dhugetlb_pool(page)) + return; + if (!free_unref_page_prepare(page, pfn)) return;
@@ -3247,6 +3251,9 @@ void free_unref_page_list(struct list_head *list) unsigned long flags, pfn; int batch_count = 0;
+ /* Free dynamic hugetlb page list */ + free_page_list_to_dhugetlb_pool(list); + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add new interface "dhugetlb.disable_normal_pages" to disable the allocation of normal pages from a hpool. This makes dynamic hugetlb more flexible.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 2 ++ mm/dynamic_hugetlb.c | 30 ++++++++++++++++++++++++++++++ mm/memcontrol.c | 6 ++++++ 3 files changed, 38 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index ebb5a36a4a26..2b2c90562bcf 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -81,6 +81,8 @@ ssize_t write_2M_reserved_pages(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); ssize_t write_1G_reserved_pages(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); +int normal_pages_disabled_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val); +u64 normal_pages_disabled_read(struct cgroup_subsys_state *css, struct cftype *cft); ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off); int hugetlb_pool_info_show(struct seq_file *m, void *v); diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 40f79f3e6aeb..423d9624d4f0 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -449,6 +449,8 @@ static struct page *__alloc_page_from_dhugetlb_pool(void) if (!get_hpool_unless_zero(hpool)) return NULL;
+ if (hpool->normal_pages_disabled) + goto out; percpu_pool = &hpool->percpu_pool[smp_processor_id()]; /* * Before we lock percpu_pool, must be sure hpool is unlocked. @@ -474,6 +476,7 @@ static struct page *__alloc_page_from_dhugetlb_pool(void)
unlock: spin_unlock_irqrestore(&percpu_pool->lock, flags); +out: put_hpool(hpool); return page; } @@ -810,6 +813,33 @@ ssize_t write_1G_reserved_pages(struct kernfs_open_file *of, return update_reserved_pages(memcg, buf, HUGE_PAGES_POOL_1G) ?: nbytes; }
+int normal_pages_disabled_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct dhugetlb_pool *hpool = memcg->hpool; + + if (!dhugetlb_enabled || !hpool) + return -EINVAL; + if (!((val == 0) || (val == 1))) + return -EINVAL; + + hpool->normal_pages_disabled = val; + return 0; +} + +u64 normal_pages_disabled_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct dhugetlb_pool *hpool = memcg->hpool; + + if (!dhugetlb_enabled || !hpool) + return 0; + + return hpool->normal_pages_disabled; +} + ssize_t write_hugepage_to_hpool(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f06c7349d9a4..2804fe9d3dae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5212,6 +5212,12 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = write_2M_reserved_pages, .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, }, + { + .name = "dhugetlb.disable_normal_pages", + .read_u64 = normal_pages_disabled_read, + .write_u64 = normal_pages_disabled_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | CFTYPE_NOT_ON_ROOT, + }, #endif #ifdef CONFIG_NUMA {
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
The dynamic hugetlb feature is based on hugetlb. Hugetlb keeps a reserve count to determine at mmap() time whether there are enough free huge pages to satisfy the request, so that the next page fault does not hit SIGBUS. Add a similar count for dhugetlb_pool to avoid the same problem.
References: Documentation/vm/hugetlbfs_reserv.rst
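As a simplified illustration of the accounting idea (the authoritative code is dhugetlb_acct_memory() in the diff below; hpool locking is omitted here): a positive delta is charged at mmap() time and only succeeds while enough unreserved free huge pages remain, and a negative delta releases the reservation again.

    /* Simplified reservation check mirroring dhugetlb_acct_memory() below. */
    static int sketch_hpool_acct(struct huge_pages_pool *hpages_pool, long delta)
    {
        if (delta > 0) {
            /* Reserve at mmap() time: fail early instead of faulting later. */
            if (delta > hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages)
                return -ENOMEM;
            hpages_pool->resv_huge_pages += delta;
        } else {
            /* Unreserve at unmap() or on error paths. */
            hpages_pool->resv_huge_pages -= (unsigned long)(-delta);
        }
        return 0;
    }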
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/hugetlbfs/inode.c | 4 +++ include/linux/dynamic_hugetlb.h | 22 +++++++++++++ include/linux/hugetlb.h | 1 + mm/dynamic_hugetlb.c | 56 +++++++++++++++++++++++++++++++++ mm/hugetlb.c | 11 +++++++ 5 files changed, 94 insertions(+)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 246858ea0a52..6f2943465bff 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -36,6 +36,7 @@ #include <linux/magic.h> #include <linux/migrate.h> #include <linux/uio.h> +#include <linux/dynamic_hugetlb.h>
#include <linux/uaccess.h> #include <linux/sched/mm.h> @@ -1191,6 +1192,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) * private inode. This simplifies hugetlbfs_destroy_inode. */ mpol_shared_policy_init(&p->policy, NULL); + /* Initialize hpool here in case of a quick call to destroy */ + link_hpool(p);
return &p->vfs_inode; } @@ -1204,6 +1207,7 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + unlink_hpool(HUGETLBFS_I(inode)); }
static const struct address_space_operations hugetlbfs_aops = { diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 2b2c90562bcf..8512f509899b 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -96,6 +96,11 @@ bool free_page_to_dhugetlb_pool(struct page *page); void free_page_list_to_dhugetlb_pool(struct list_head *list); int task_has_mem_in_hpool(struct task_struct *tsk);
+void link_hpool(struct hugetlbfs_inode_info *p); +void unlink_hpool(struct hugetlbfs_inode_info *p); +bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p); +int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p); + #else
#define dhugetlb_enabled 0 @@ -134,5 +139,22 @@ static inline int task_has_mem_in_hpool(struct task_struct *tsk) return 0; }
+#ifdef CONFIG_HUGETLBFS +static inline void link_hpool(struct hugetlbfs_inode_info *p) +{ +} +static inline void unlink_hpool(struct hugetlbfs_inode_info *p) +{ +} +static inline bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p) +{ + return false; +} +static inline int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p) +{ + return 0; +} +#endif + #endif /* CONFIG_DYNAMIC_HUGETLB */ #endif /* __LINUX_DYNAMIC_HUGETLB_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a1135c43719e..634630ebc8a7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -446,6 +446,7 @@ struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; unsigned int seals; + struct dhugetlb_pool *hpool; };
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 423d9624d4f0..f8ae9ba90bcb 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -556,6 +556,62 @@ void free_page_list_to_dhugetlb_pool(struct list_head *list) } }
+void link_hpool(struct hugetlbfs_inode_info *p) +{ + if (!dhugetlb_enabled || !p) + return; + + p->hpool = find_hpool_by_task(current); + if (!get_hpool_unless_zero(p->hpool)) + p->hpool = NULL; +} + +void unlink_hpool(struct hugetlbfs_inode_info *p) +{ + if (!dhugetlb_enabled || !p) + return; + + put_hpool(p->hpool); + p->hpool = NULL; +} + +bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p) +{ + if (!dhugetlb_enabled || !p || !p->hpool) + return false; + return true; +} + +int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p) +{ + struct dhugetlb_pool *hpool = p ? p->hpool : NULL; + struct huge_pages_pool *hpages_pool; + int ret = -ENOMEM; + + if (!dhugetlb_enabled || !hpool) + return 0; + + if (delta == 0) + return 0; + + spin_lock(&hpool->lock); + if (hstate_is_gigantic(h)) + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G]; + else + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_2M]; + if (delta > 0 && delta <= hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages) { + hpages_pool->resv_huge_pages += delta; + ret = 0; + } else if (delta < 0) { + hpages_pool->resv_huge_pages -= (unsigned long)(-delta); + WARN_ON(hpages_pool->resv_huge_pages < 0); + ret = 0; + } + spin_unlock(&hpool->lock); + + return ret; +} + static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, unsigned long nid, unsigned long nr_pages) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6049fd4a9050..d26f0a7ca780 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -157,6 +157,10 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, if (!spool) return ret;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (file_has_mem_in_hpool(info)) + return ret; + spin_lock_irq(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */ @@ -203,6 +207,10 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, if (!spool) return delta;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (file_has_mem_in_hpool(info)) + return ret; + spin_lock_irqsave(&spool->lock, flags);
if (spool->max_hpages != -1) /* maximum size accounting */ @@ -3899,6 +3907,9 @@ static int hugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_in { int ret = -ENOMEM;
+ if (file_has_mem_in_hpool(info)) + return dhugetlb_acct_memory(h, delta, info); + spin_lock_irq(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add a function to alloc huge pages from a dhugetlb_pool. When a process is bound to a mem_cgroup configured with a dhugetlb_pool, it is only allowed to alloc huge pages from that dhugetlb_pool. If there are no huge pages left in the dhugetlb_pool, mmap() will fail due to the reserve count introduced in the previous patch.
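Condensed from the alloc_huge_page() hunk in the diff below (fragment only; the cgroup charge/uncharge handling stays as in the real code): files backed by an hpool never fall back to the global hstate pools, and a reservation taken at mmap() time is consumed via need_unreserved.

    /* Fragment: hpool-backed files allocate huge pages only from their hpool. */
    if (file_has_mem_in_hpool(info)) {
        bool need_unreserved = false;

        if (!avoid_reserve && vma_has_reserves(vma, gbl_chg))
            need_unreserved = true;

        page = alloc_huge_page_from_dhugetlb_pool(h, info->hpool, need_unreserved);
        if (!page)
            goto out_uncharge_cgroup;   /* no fallback to the hstate pools */
    }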
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 8 +++++++ mm/dynamic_hugetlb.c | 39 ++++++++++++++++++++++++++++++++- mm/hugetlb.c | 14 ++++++++++++ 3 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 8512f509899b..65d4b5dbf3f6 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -100,6 +100,8 @@ void link_hpool(struct hugetlbfs_inode_info *p); void unlink_hpool(struct hugetlbfs_inode_info *p); bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p); int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p); +struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetlb_pool *hpool, + bool need_unreserved);
#else
@@ -154,6 +156,12 @@ static inline int dhugetlb_acct_memory(struct hstate *h, long delta, struct huge { return 0; } +static inline +struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetlb_pool *hpool, + bool need_unreserved) +{ + return NULL; +} #endif
#endif /* CONFIG_DYNAMIC_HUGETLB */ diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index f8ae9ba90bcb..126b3d9d3754 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -103,7 +103,7 @@ static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) if (!split_page) return -ENOMEM;
- page = list_entry(hpages_pool->hugepage_freelists.next, struct page, lru); + page = list_entry(hpages_pool->hugepage_freelists.prev, struct page, lru); list_del(&page->lru); hpages_pool->free_normal_pages--;
@@ -612,6 +612,43 @@ int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_in return ret; }
+struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetlb_pool *hpool, + bool need_unreserved) +{ + struct huge_pages_pool *hpages_pool; + struct page *page = NULL; + unsigned long flags; + + if (!dhugetlb_enabled) + return NULL; + + spin_lock_irqsave(&hpool->lock, flags); + if (hstate_is_gigantic(h)) + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G]; + else + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_2M]; + + if (hpages_pool->free_huge_pages) { + page = list_entry(hpages_pool->hugepage_freelists.next, struct page, lru); + list_del(&page->lru); + hpages_pool->free_huge_pages--; + hpages_pool->used_huge_pages++; + if (need_unreserved) { + SetHPageRestoreReserve(page); + hpages_pool->resv_huge_pages--; + } + } + if (page) { + INIT_LIST_HEAD(&page->lru); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_refcounted(page); + SetPagePool(page); + } + spin_unlock_irqrestore(&hpool->lock, flags); + + return page; +} + static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, unsigned long nid, unsigned long nr_pages) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d26f0a7ca780..031ad320f10c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2534,6 +2534,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_uncharge_cgroup_reservation;
+ if (file_has_mem_in_hpool(info)) { + bool need_unreserved = false; + + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + need_unreserved = true; + page = alloc_huge_page_from_dhugetlb_pool(h, info->hpool, need_unreserved); + if (!page) + goto out_uncharge_cgroup; + spin_lock_irq(&hugetlb_lock); + list_add(&page->lru, &h->hugepage_activelist); + goto out; + } + spin_lock_irq(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken @@ -2554,6 +2567,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, list_add(&page->lru, &h->hugepage_activelist); /* Fall through */ } +out: hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); /* If allocation is not consuming a reservation, also store the * hugetlb_cgroup pointer on the page.
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add a function to free huge pages back to the dhugetlb_pool.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 5 +++++ mm/dynamic_hugetlb.c | 30 ++++++++++++++++++++++++++++++ mm/hugetlb.c | 13 +++++++++++++ 3 files changed, 48 insertions(+)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 65d4b5dbf3f6..05bc55ef831b 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -102,6 +102,7 @@ bool file_has_mem_in_hpool(struct hugetlbfs_inode_info *p); int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_info *p); struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetlb_pool *hpool, bool need_unreserved); +void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve);
#else
@@ -162,6 +163,10 @@ struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetl { return NULL; } +static inline +void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve) +{ +} #endif
#endif /* CONFIG_DYNAMIC_HUGETLB */ diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 126b3d9d3754..73795d533f7e 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -649,6 +649,36 @@ struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetl return page; }
+void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve) +{ + struct hstate *h = page_hstate(page); + struct huge_pages_pool *hpages_pool; + struct dhugetlb_pool *hpool; + + hpool = find_hpool_by_dhugetlb_pagelist(page); + + if (!get_hpool_unless_zero(hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + spin_lock(&hpool->lock); + ClearPagePool(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (hstate_is_gigantic(h)) + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_1G]; + else + hpages_pool = &hpool->hpages_pool[HUGE_PAGES_POOL_2M]; + + list_add(&page->lru, &hpages_pool->hugepage_freelists); + hpages_pool->free_huge_pages++; + hpages_pool->used_huge_pages--; + if (restore_reserve) + hpages_pool->resv_huge_pages++; + spin_unlock(&hpool->lock); + put_hpool(hpool); +} + static int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, unsigned long nid, unsigned long nr_pages) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 031ad320f10c..3b787cb56699 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1583,6 +1583,19 @@ void free_huge_page(struct page *page) restore_reserve = HPageRestoreReserve(page); ClearHPageRestoreReserve(page);
+ if (dhugetlb_enabled && PagePool(page)) { + spin_lock(&hugetlb_lock); + ClearHPageMigratable(page); + list_del(&page->lru); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h), + pages_per_huge_page(h), page); + spin_unlock(&hugetlb_lock); + free_huge_page_to_dhugetlb_pool(page, restore_reserve); + return; + } + /* * If HPageRestoreReserve was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add tracepoints for dynamic_hugetlb to track the process of page split, page merge, page migration, page allocation and page free.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/events/dynamic_hugetlb.h | 121 +++++++++++++++++++++++++ mm/dynamic_hugetlb.c | 22 ++++- 2 files changed, 142 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/dynamic_hugetlb.h
diff --git a/include/trace/events/dynamic_hugetlb.h b/include/trace/events/dynamic_hugetlb.h new file mode 100644 index 000000000000..1de0df5df793 --- /dev/null +++ b/include/trace/events/dynamic_hugetlb.h @@ -0,0 +1,121 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dynamic_hugetlb + +#if !defined(_TRACE_DHUGETLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DHUGETLB_H + +#include <linux/tracepoint.h> +#include <trace/events/mmflags.h> + +#define DHUGETLB_SPLIT 0x01u +#define DHUGETLB_MERGE 0x02u +#define DHUGETLB_MIGRATE 0x04u +#define DHUGETLB_RESV 0x08u +#define DHUGETLB_UNRESV 0x10u +#define DHUGETLB_ALLOC 0x20u +#define DHUGETLB_FREE 0x40u + +#define __def_action_names \ + {(unsigned long)DHUGETLB_SPLIT, "split page"}, \ + {(unsigned long)DHUGETLB_MERGE, "merge page"}, \ + {(unsigned long)DHUGETLB_MIGRATE, "migrate page"}, \ + {(unsigned long)DHUGETLB_RESV, "resv page"}, \ + {(unsigned long)DHUGETLB_UNRESV, "unresv page"}, \ + {(unsigned long)DHUGETLB_ALLOC, "alloc page"}, \ + {(unsigned long)DHUGETLB_FREE, "free page"} + +#define show_action(action) \ + (action) ? __print_flags(action, "", \ + __def_action_names \ + ) : "none" + +TRACE_EVENT(dynamic_hugetlb_split_merge, + + TP_PROTO(const void *hpool, struct page *page, unsigned long action, unsigned long size), + + TP_ARGS(hpool, page, action, size), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, action ) + __field( unsigned long, size ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->action = action; + __entry->size = size; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s size=%lu", + __entry->hpool, + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + show_action(__entry->action), + __entry->size) +); + +TRACE_EVENT(dynamic_hugetlb_acct_memory, + + TP_PROTO(const void *hpool, unsigned long count, unsigned long action, unsigned long size), + + TP_ARGS(hpool, count, action, size), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, count ) + __field( unsigned long, action ) + __field( unsigned long, size ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->size = size; + __entry->count = count; + __entry->action = action; + ), + + TP_printk("hpool=%p action=%s size = %lu mmap_count=%lu", + __entry->hpool, + show_action(__entry->action), + __entry->size, + __entry->count) +); + +TRACE_EVENT(dynamic_hugetlb_alloc_free, + + TP_PROTO(const void *hpool, struct page *page, unsigned long count, unsigned long action, unsigned long size), + + TP_ARGS(hpool, page, count, action, size), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, count ) + __field( unsigned long, action ) + __field( unsigned long, size ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->count = count; + __entry->action = action; + __entry->size = size; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s size = %lu free_count=%lu", + __entry->hpool, + __entry->pfn != -1UL ? 
pfn_to_page(__entry->pfn) : NULL, + __entry->pfn, + show_action(__entry->action), + __entry->size, + __entry->count) +); + +#endif /* _TRACE_DHUGETLB_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 73795d533f7e..1faac3b4572b 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -10,6 +10,11 @@
#include "internal.h"
+#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__) +#define CREATE_TRACE_POINTS +#include <trace/events/dynamic_hugetlb.h> +#endif + static bool enable_dhugetlb = false; DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);
@@ -110,6 +115,7 @@ static int hpool_split_page(struct dhugetlb_pool *hpool, int hpages_pool_idx) split_page->start_pfn = page_to_pfn(page); list_add(&split_page->head_pages, &hpages_pool->hugepage_splitlists); hpages_pool->split_normal_pages++; + trace_dynamic_hugetlb_split_merge(hpool, page, DHUGETLB_SPLIT, page_size(page));
switch (hpages_pool_idx) { case HUGE_PAGES_POOL_1G: @@ -245,6 +251,7 @@ static int hpool_merge_page(struct dhugetlb_pool *hpool, int hpages_pool_idx, bo src_hpages_pool->free_normal_pages--; } add_new_page_to_pool(hpool, page, hpages_pool_idx); + trace_dynamic_hugetlb_split_merge(hpool, page, DHUGETLB_MERGE, page_size(page)); return 0; next: continue; @@ -602,10 +609,14 @@ int dhugetlb_acct_memory(struct hstate *h, long delta, struct hugetlbfs_inode_in if (delta > 0 && delta <= hpages_pool->free_huge_pages - hpages_pool->resv_huge_pages) { hpages_pool->resv_huge_pages += delta; ret = 0; + trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages, + DHUGETLB_RESV, huge_page_size(h)); } else if (delta < 0) { hpages_pool->resv_huge_pages -= (unsigned long)(-delta); WARN_ON(hpages_pool->resv_huge_pages < 0); ret = 0; + trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages, + DHUGETLB_UNRESV, huge_page_size(h)); } spin_unlock(&hpool->lock);
@@ -636,7 +647,11 @@ struct page *alloc_huge_page_from_dhugetlb_pool(struct hstate *h, struct dhugetl if (need_unreserved) { SetHPageRestoreReserve(page); hpages_pool->resv_huge_pages--; + trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages, + DHUGETLB_UNRESV, huge_page_size(h)); } + trace_dynamic_hugetlb_alloc_free(hpool, page, hpages_pool->free_huge_pages, + DHUGETLB_ALLOC, huge_page_size(h)); } if (page) { INIT_LIST_HEAD(&page->lru); @@ -673,8 +688,13 @@ void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve) list_add(&page->lru, &hpages_pool->hugepage_freelists); hpages_pool->free_huge_pages++; hpages_pool->used_huge_pages--; - if (restore_reserve) + if (restore_reserve) { hpages_pool->resv_huge_pages++; + trace_dynamic_hugetlb_acct_memory(hpool, hpages_pool->resv_huge_pages, + DHUGETLB_RESV, huge_page_size(h)); + } + trace_dynamic_hugetlb_alloc_free(hpool, page, hpages_pool->free_huge_pages, + DHUGETLB_FREE, huge_page_size(h)); spin_unlock(&hpool->lock); put_hpool(hpool); }
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
When THP is enabled, the allocation of a page (order=0) may be converted to an allocation of pages (order>0). In this case, the allocation skips the dhugetlb_pool. So when we want to use the dynamic hugetlb feature, we have to disable THP for now.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/dynamic_hugetlb.h | 2 ++ mm/dynamic_hugetlb.c | 2 +- mm/huge_memory.c | 13 +++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/linux/dynamic_hugetlb.h b/include/linux/dynamic_hugetlb.h index 05bc55ef831b..237a7329ff64 100644 --- a/include/linux/dynamic_hugetlb.h +++ b/include/linux/dynamic_hugetlb.h @@ -9,6 +9,7 @@
#ifdef CONFIG_DYNAMIC_HUGETLB
+extern bool enable_dhugetlb; extern struct static_key_false dhugetlb_enabled_key; #define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key))
@@ -106,6 +107,7 @@ void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve);
#else
+#define enable_dhugetlb 0 #define dhugetlb_enabled 0
struct dhugetlb_pool {}; diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index 1faac3b4572b..e9f5348bcccd 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -15,7 +15,7 @@ #include <trace/events/dynamic_hugetlb.h> #endif
-static bool enable_dhugetlb = false; +bool enable_dhugetlb = false; DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);
#define hugepage_index(pfn) ((pfn) >> (PUD_SHIFT - PAGE_SHIFT)) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index acb2e2c9e043..37704a21b3dc 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include <linux/oom.h> #include <linux/numa.h> #include <linux/page_owner.h> +#include <linux/dynamic_hugetlb.h>
#include <asm/tlb.h> #include <asm/pgalloc.h> @@ -392,6 +393,18 @@ static int __init hugepage_init(void) return -EINVAL; }
+ /* + * When we alloc some pages (order = 0), the system may instead alloc a page (order > 0) + * due to transparent hugepage. This causes dynamic hugetlb to be skipped. + * Actually, using dynamic hugetlb means we have already optimized the program, so we + * should not use transparent hugepage in addition (it may result in negative optimization). + */ + if (enable_dhugetlb) { + transparent_hugepage_flags = 0; + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); + return -EINVAL; + } + /* * hugepages can't be allocated by the buddy allocator */
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
The dynamic_hugetlb feature needs to split and merge pages frequently. hugetlb_vmemmap affects the performance of page split and merge. If you want to use dynamic hugetlb, please disable hugetlb_vmemmap.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/dynamic_hugetlb.c | 11 +++++++++++ 1 file changed, 11 insertions(+)
diff --git a/mm/dynamic_hugetlb.c b/mm/dynamic_hugetlb.c index e9f5348bcccd..f20e654cc856 100644 --- a/mm/dynamic_hugetlb.c +++ b/mm/dynamic_hugetlb.c @@ -1081,6 +1081,17 @@ void __init dynamic_hugetlb_init(void) if (!enable_dhugetlb) return;
+ /* + * The dynamic_hugetlb feature needs to split and merge pages frequently. + * hugetlb_vmemmap affects the performance of page split and merge. + * If you want to use dynamic hugetlb, please disable hugetlb_vmemmap. + */ + if (hugetlb_free_vmemmap_enabled) { + enable_dhugetlb = false; + pr_info("Please set hugetlb_free_vmemmap=off if you want to enable dynamic hugetlb\n"); + return; + } + count = max(hugepage_index(max_pfn), (unsigned long)DEFAULT_PAGELIST_COUNT); size = sizeof(struct dhugetlb_pagelist) + count * sizeof(struct dhugetlb_pool *); dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Add a document describing the dynamic hugetlb feature, including its conflicts with other features, usage, and interfaces.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/vm/dynamic_hugetlb.rst | 109 +++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 Documentation/vm/dynamic_hugetlb.rst
diff --git a/Documentation/vm/dynamic_hugetlb.rst b/Documentation/vm/dynamic_hugetlb.rst new file mode 100644 index 000000000000..77231d570792 --- /dev/null +++ b/Documentation/vm/dynamic_hugetlb.rst @@ -0,0 +1,109 @@ +.. _dynamic_hugetlb: + +=============== +Dynamic Hugetlb +=============== + +Overview +======== + +Dynamic hugetlb is a self-developed feature based on the hugetlb and memcontrol. +It supports splitting huge pages dynamically in a memory cgroup. There is a new structure +dhugetlb_pool in every mem_cgroup to manage the pages configured to the mem_cgroup. +For the mem_cgroup configured with dhugetlb_pool, processes in the mem_cgroup will +preferentially use the pages in dhugetlb_pool. + +Dynamic hugetlb supports three types of pages, including 1G/2M huge pages and 4K pages. +For the mem_cgroup configured with dhugetlb_pool, processes are limited to allocating +1G/2M huge pages only from dhugetlb_pool. But there is no such constraint for 4K pages. +If there are insufficient 4K pages in the dhugetlb_pool, pages can also be allocated from +the buddy system. So before using dynamic hugetlb, users must know how many huge pages they +need. + +Conflict +======== + +1. Conflict with THP +-------------------- + +When THP is enabled, the allocation of a page (order=0) may be converted to +an allocation of pages (order>0). In this case, the allocation will skip the +dhugetlb_pool. When we want to use the dynamic hugetlb feature, we have to +disable THP for now. + +2. Conflict with hugetlb_vmemmap +-------------------------------- + +The dynamic_hugetlb feature needs to split and merge pages frequently. +hugetlb_vmemmap affects the performance of page split and merge. +If you want to use dynamic hugetlb, please disable hugetlb_vmemmap. + +Usage +===== + +1) Add 'dynamic_hugetlb=on' to the cmdline to enable the dynamic hugetlb feature. + +2) Preallocate some 1G hugepages through hugetlb. + +3) Create a mem_cgroup and configure a dhugetlb_pool for the mem_cgroup. + +4) Configure the count of 1G/2M hugepages; the remaining pages in the dhugetlb_pool will + be used as normal pages. + +5) Bind the process to the mem_cgroup. Then its memory will be allocated from the dhugetlb_pool. + +User control +============ + +1. dynamic_hugetlb= +------------------- + +Add ``dynamic_hugetlb=on`` to the cmdline to enable the dynamic hugetlb feature. +By default, the feature is disabled. + +2. dhugetlb.nr_pages +-------------------- + +In each memory cgroup, there is a ``dhugetlb.nr_pages`` interface used to create and configure dynamic +hugetlb. If this interface is not configured, the original functions are not affected. If configured, +then the memory used by processes in this memory cgroup will be allocated from the corresponding hpool. + +Usage: + ``echo <nid> <nr_pages> > /sys/fs/cgroup/memory/<memory cgroup>/dhugetlb.nr_pages``: + + Create a dynamic hugetlb pool and add <nr_pages> 1G hugepages from numa node <nid> to the pool. + + ``cat /sys/fs/cgroup/memory/<memory cgroup>/dhugetlb.nr_pages``: + + Read the memory information of the hpool, including the free and used amounts of huge pages and + normal pages. + +3. dhugetlb.1G.reserved_pages +----------------------------- + +In each memory cgroup, there is a ``dhugetlb.1G.reserved_pages`` interface used to reserve 1G huge pages. +By default, all memory configured to a dynamic hugetlb pool can only be used as normal pages. If you want to use +it as 1G huge pages, you need to configure the number of 1G huge pages through this interface first.
+ +Usage: + ``echo <nr_pages> > /sys/fs/cgroup/memory/<memory cgroup>/dhugetlb.1G.reserved_pages`` + +4. dhugetlb.2M.reserved_pages +----------------------------- + +Similar to the previous interface, this is used to configure the number of 2M huge pages. + +Usage: + ``echo <nr_pages> > /sys/fs/cgroup/memory/<memory cgroup>/dhugetlb.2M.reserved_pages`` + +5. dhugetlb.normal_pages_disabled +--------------------------------- + +The dynamic hugetlb pool can be used as just a huge pages pool. This interface is used to disable allocation +of normal pages from the dynamic hugetlb pool. + +Usage: + ``echo 1 > /sys/fs/cgroup/memory/<memory cgroup>/dhugetlb.normal_pages_disabled`` + +--- +Liu Shixin, Jan 2022
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4QSHG CVE: NA
--------------------------------
Enable CONFIG_DYNAMIC_HUGETLB for x86 by default.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/configs/openeuler_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 0200c289ccec..fc7be06e8054 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7547,6 +7547,7 @@ CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y +CONFIG_DYNAMIC_HUGETLB=y CONFIG_MEMFD_CREATE=y CONFIG_ARCH_HAS_GIGANTIC_PAGE=y CONFIG_CONFIGFS_FS=y
From: jinyiting jinyiting@huawei.com
mainline-inclusion from mainline-v5.16-rc7 commit 83d686a6822322c4981b745dc1d7185f1f40811b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RADY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
------------------------------------------------- The bond works in mode 4, and down/up operations are performed on the bond while it is normally negotiated. There is a probability that bond->slave_arr becomes NULL.
Test commands:
ifconfig bond1 down
ifconfig bond1 up
The conflict occurs in the following process:
__dev_open (CPU A)
--bond_open
  --queue_delayed_work(bond->wq,&bond->ad_work,0);
  --bond_update_slave_arr
    --bond_3ad_get_active_agg_info

ad_work(CPU B)
--bond_3ad_state_machine_handler
  --ad_agg_selection_logic
ad_work runs on CPU B. In ad_agg_selection_logic(), all agg->is_active flags are cleared. If bond_3ad_get_active_agg_info() runs on CPU A before the new active aggregator is selected on CPU B, it fails and bond->slave_arr is set to NULL. The best aggregator in ad_agg_selection_logic() has not changed, so there is no need to update the slave array.
The conflict occurred in that ad_agg_selection_logic clears agg->is_active under mode_lock, but bond_open -> bond_update_slave_arr is inspecting agg->is_active outside the lock.
Also, it is normal for bond_update_slave_arr() to potentially sleep when allocating memory, so replace the WARN_ON with a call to might_sleep().
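In short, the fix takes mode_lock around the aggregator lookup in bond_update_slave_arr(), so ad_agg_selection_logic() on the other CPU cannot clear agg->is_active in the middle of the check. A rough outline (the flattened diff below is the authoritative change):

    /* Outline of the locking change in bond_update_slave_arr(). */
    if (BOND_MODE(bond) == BOND_MODE_8023AD) {
        struct ad_info ad_info;

        spin_lock_bh(&bond->mode_lock);
        if (bond_3ad_get_active_agg_info(bond, &ad_info)) {
            spin_unlock_bh(&bond->mode_lock);
            /* No active aggregator: the previous array is unsafe, reset it. */
            bond_reset_slave_arr(bond);
            goto out;
        }
        spin_unlock_bh(&bond->mode_lock);
        agg_id = ad_info.aggregator_id;
    }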
Signed-off-by: jinyiting jinyiting@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Aichun Li liaichun@huawei.com Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: wuchangye wuchangye@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/bonding/bond_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 645c7cabcbe4..4f0894748347 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4458,9 +4458,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) int agg_id = 0; int ret = 0;
-#ifdef CONFIG_LOCKDEP - WARN_ON(lockdep_is_held(&bond->mode_lock)); -#endif + might_sleep();
usable_slaves = kzalloc(struct_size(usable_slaves, arr, bond->slave_cnt), GFP_KERNEL); @@ -4473,7 +4471,9 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) if (BOND_MODE(bond) == BOND_MODE_8023AD) { struct ad_info ad_info;
+ spin_lock_bh(&bond->mode_lock); if (bond_3ad_get_active_agg_info(bond, &ad_info)) { + spin_unlock_bh(&bond->mode_lock); pr_debug("bond_3ad_get_active_agg_info failed\n"); /* No active aggragator means it's not safe to use * the previous array. @@ -4481,6 +4481,7 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave) bond_reset_slave_arr(bond); goto out; } + spin_unlock_bh(&bond->mode_lock); agg_id = ad_info.aggregator_id; } bond_for_each_slave(bond, slave, iter) {
From: Di Zhu zhudi21@huawei.com
mainline-inclusion from mainline-v5.16-rc7 commit 3c9ef511b9fa128a4c62e3aa0aac4c6b190f0d55 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RADY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The following steps will definitely cause the kernel to crash:
ip link add vrf1 type vrf table 1
modprobe bonding.ko max_bonds=1
echo "+vrf1" >/sys/class/net/bond0/bonding/slaves
rmmod bonding
The root cause is that when the VRF device is enslaved to the bond, the operation fails and some cleanup work is done. Because the VRF device has the IFF_MASTER flag, the cleanup process does not clear the IFF_BONDING flag. Then, when we unload the bonding module, unregister_netdevice_notifier() treats the VRF device as a bond master device and treats its netdev_priv() as struct bonding{}, which is actually struct net_vrf{}.
By analyzing the processing logic of bond_enslave(), it seems that it is not allowed to add the slave device with the IFF_MASTER flag, so we need to add a code check for this situation.
Signed-off-by: Di Zhu zhudi21@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Aichun Li liaichun@huawei.com Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: wuchangye wuchangye@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/bonding/bond_main.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 4f0894748347..97b9187bc8af 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1700,6 +1700,12 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, int link_reporting; int res = 0, i;
+ if (slave_dev->flags & IFF_MASTER) { + netdev_err(bond_dev, + "Error: Device with IFF_MASTER cannot be enslaved\n"); + return -EPERM; + } + if (!bond->params.use_carrier && slave_dev->ethtool_ops->get_link == NULL && slave_ops->ndo_do_ioctl == NULL) {
From: Di Zhu zhudi21@huawei.com
mainline-inclusion from mainline-v5.16-rc7 commit 4d293fe1c69c157c15ac06918a805e5fef036682 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RADY CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Commit 3c9ef511b9fa ("bonding: avoid adding slave device with IFF_MASTER flag") fixed a crash when adding a slave device with IFF_MASTER, but it also rejects the scenario of nested bonding devices.
As Eric Dumazet described, since there indeed is a usage scenario for nested bonding, we should not break it.
So add a new condition to allow nesting of bonding devices.
Fixes: 3c9ef511b9fa ("bonding: avoid adding slave device with IFF_MASTER flag") Suggested-by: Jay Vosburgh jay.vosburgh@canonical.com Signed-off-by: Di Zhu zhudi21@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Aichun Li liaichun@huawei.com Signed-off-by: Aichun Li liaichun@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: wuchangye wuchangye@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/bonding/bond_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 97b9187bc8af..cb823e2da910 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1700,7 +1700,9 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, int link_reporting; int res = 0, i;
- if (slave_dev->flags & IFF_MASTER) { + if (slave_dev->flags & IFF_MASTER && + !netif_is_bond_master(slave_dev)) { + NL_SET_ERR_MSG(extack, "Device with IFF_MASTER cannot be enslaved"); netdev_err(bond_dev, "Error: Device with IFF_MASTER cannot be enslaved\n"); return -EPERM;
From: nifujia nifujia1@hisilicon.com
mainline inclusion from mainline-v5.16-rc1 commit 00aeaf329a3a category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4QKP3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/dr...
-----------------------------------------------------------------------------------
Export sas_phy_enable() so LLDDs can directly use it to control remote phys.
We already do this for companion function sas_phy_reset().
Link: https://lore.kernel.org/r/1634041588-74824-4-git-send-email-john.garry@huawe... Signed-off-by: Luo Jiaxing luojiaxing@huawei.com Signed-off-by: John Garry john.garry@huawei.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Reviewed-by: Ouyangdelong ouyangdelong@huawei.com Signed-off-by: Nifujia nifujia1@hisilicon.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/libsas/sas_init.c | 3 ++- include/scsi/libsas.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/libsas/sas_init.c b/drivers/scsi/libsas/sas_init.c index 2b0f98ca6ec3..23f8d34ccb0d 100644 --- a/drivers/scsi/libsas/sas_init.c +++ b/drivers/scsi/libsas/sas_init.c @@ -252,7 +252,7 @@ static int transport_sas_phy_reset(struct sas_phy *phy, int hard_reset) } }
-static int sas_phy_enable(struct sas_phy *phy, int enable) +int sas_phy_enable(struct sas_phy *phy, int enable) { int ret; enum phy_func cmd; @@ -284,6 +284,7 @@ static int sas_phy_enable(struct sas_phy *phy, int enable) } return ret; } +EXPORT_SYMBOL_GPL(sas_phy_enable);
int sas_phy_reset(struct sas_phy *phy, int hard_reset) { diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index 6fe125a71b60..79e4903bd414 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -664,6 +664,7 @@ extern void sas_suspend_ha(struct sas_ha_struct *sas_ha);
int sas_set_phy_speed(struct sas_phy *phy, struct sas_phy_linkrates *rates); int sas_phy_reset(struct sas_phy *phy, int hard_reset); +int sas_phy_enable(struct sas_phy *phy, int enable); extern int sas_queuecommand(struct Scsi_Host *, struct scsi_cmnd *); extern int sas_target_alloc(struct scsi_target *); extern int sas_slave_configure(struct scsi_device *);
From: nifujia nifujia1@hisilicon.com
mainline inclusion from mainline-v5.16-rc1 commit 21c7e972475e6a975fbe97f8974c96fe4713077c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4QKP3 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/dr...
----------------------------------------------------------------------------------
If the softreset fails in the I_T reset, libsas will then continue to issue a controller reset to try to recover.
However, a faulty disk may cause the softreset to fail, and resetting the controller will not help in this scenario. Indeed, we will just continue the cycle of error handling trying to recover.
So if the softreset fails upon certain conditions, just disable the phy associated with the disk. The user needs to handle this problem.
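A rough sketch of that policy (simplified from the hisi_sas_I_T_nexus_reset() change in the diff below; softreset_rc stands in for the value returned by hisi_sas_softreset_ata_disk(), and sas_phy_enable() is the helper exported in the previous patch):

    /* Sketch: on an unrecoverable softreset failure, disable the local phy. */
    if (softreset_rc == TMF_RESP_FUNC_FAILED) {
        struct sas_phy *local_phy = sas_get_local_phy(device);

        if (!sas_phy_enable(local_phy, 0)) {
            local_phy->enabled = 0;
            dev_err(dev, "Disabled local phy of ATA disk %016llx due to softreset fail\n",
                    SAS_ADDR(device->sas_addr));
        }
        sas_put_local_phy(local_phy);
    }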
Link: https://lore.kernel.org/r/1634041588-74824-5-git-send-email-john.garry@huawe... Signed-off-by: Luo Jiaxing luojiaxing@huawei.com Signed-off-by: John Garry john.garry@huawei.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Reviewed-by: Ouyangdelong ouyangdelong@huawei.com Signed-off-by: Nifujia nifujia1@hisilicon.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/scsi/hisi_sas/hisi_sas_main.c | 29 ++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-)
diff --git a/drivers/scsi/hisi_sas/hisi_sas_main.c b/drivers/scsi/hisi_sas/hisi_sas_main.c index 3a903e8e0384..98a1754907d1 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_main.c +++ b/drivers/scsi/hisi_sas/hisi_sas_main.c @@ -1839,14 +1839,33 @@ static int hisi_sas_I_T_nexus_reset(struct domain_device *device) } hisi_sas_dereg_device(hisi_hba, device);
- if (dev_is_sata(device)) { + rc = hisi_sas_debug_I_T_nexus_reset(device); + if (rc == TMF_RESP_FUNC_COMPLETE && dev_is_sata(device)) { + struct sas_phy *local_phy; + rc = hisi_sas_softreset_ata_disk(device); - if (rc == TMF_RESP_FUNC_FAILED) - return TMF_RESP_FUNC_FAILED; + switch (rc) { + case -ECOMM: + rc = -ENODEV; + break; + case TMF_RESP_FUNC_FAILED: + case -EMSGSIZE: + case -EIO: + local_phy = sas_get_local_phy(device); + rc = sas_phy_enable(local_phy, 0); + if (!rc) { + local_phy->enabled = 0; + dev_err(dev, "Disabled local phy of ATA disk %016llx due to softreset fail (%d)\n", + SAS_ADDR(device->sas_addr), rc); + rc = -ENODEV; + } + sas_put_local_phy(local_phy); + break; + default: + break; + } }
- rc = hisi_sas_debug_I_T_nexus_reset(device); - if ((rc == TMF_RESP_FUNC_COMPLETE) || (rc == -ENODEV)) hisi_sas_release_task(hisi_hba, device);