From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME
CVE: NA
--------------------------------
This feature is already supported on x86_64; the original description follows:
Dynamic hugetlb, which is built on top of hugetlb, allows hugepages to be split dynamically within a specified cgroup. We add a hugetlb_pool to the mem_cgroup to manage dynamic hugetlb for the corresponding cgroup. After dynamic hugepages are allocated for a cgroup, these hugepages can be used as 1G/2M/4K pages via split/merge operations.
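For illustration, a minimal userspace sketch of the intended flow, assuming the feature was enabled at boot with dynamic_1G_hugepage=on; the memcg control file path below is a placeholder, since the cftype names are not visible in this diff:

  #include <stdio.h>

  /*
   * memcg_write_dhugetlb() in this patch parses "<nid> <size>", where
   * size is the number of 1G hugepages to move from the static hugetlb
   * pool on node <nid> into this memcg's dhugetlb pool. The file name
   * used here is hypothetical.
   */
  int main(void)
  {
          FILE *f = fopen("/sys/fs/cgroup/memory/test/dhugetlb.nr_pages", "w");

          if (!f)
                  return 1;
          fprintf(f, "0 4");      /* node 0, four 1G hugepages */
          return fclose(f) ? 1 : 0;
  }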
It is now supported on arm64 as well. On arm64 the feature depends on ARM64_4K_PAGES and does not support cont-bits hugepages. We merge the previous patches into one patch, patch[1]. While merging the code, we found some code that can be isolated behind CONFIG_DYNAMIC_HUGETLB, so patch[2] re-isolates it. Patch[3] restricts the feature to the limits mentioned above. Patch[4] skips dissolving hugepages, which could otherwise conflict with memory hotplug and memory failure handling. Patch[5] sets DYNAMIC_HUGETLB to y in hulk_defconfig to enable the feature by default.
This patch squashes all of the previous patches; the patch list is recorded in the bugzilla entry.
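A note on the per-page lookup in the diff: get_dhugetlb_pool_from_dhugetlb_pagelist() indexes dhugetlb_pagelist_t by pfn >> (PUD_SHIFT - PAGE_SHIFT), i.e. one slot per 1G-aligned physical region on a 4K-page kernel. A standalone sketch of that arithmetic (constants are spelled out here purely for illustration):

  #include <assert.h>

  #define PAGE_SHIFT 12   /* 4K base pages, as required by this series */
  #define PUD_SHIFT  30   /* a PUD entry covers 1G with 4K pages */

  /* Mirrors idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT) */
  static unsigned long pfn_to_hpool_idx(unsigned long pfn)
  {
          return pfn >> (PUD_SHIFT - PAGE_SHIFT);
  }

  int main(void)
  {
          assert(pfn_to_hpool_idx(0x3ffff) == 0); /* last pfn of 1st 1G region */
          assert(pfn_to_hpool_idx(0x40000) == 1); /* first pfn of 2nd 1G region */
          return 0;
  }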
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/Kconfig                      |    9 +
 fs/hugetlbfs/inode.c            |    4 +
 include/linux/gfp.h             |    4 +-
 include/linux/hugetlb.h         |   97 +++
 include/linux/memcontrol.h      |   15 +
 include/linux/page-flags.h      |    3 +
 include/trace/events/dhugetlb.h |  123 ++++
 include/trace/events/mmflags.h  |    1 +
 kernel/cgroup/cgroup.c          |    6 +
 mm/huge_memory.c                |   16 +-
 mm/hugetlb.c                    | 1188 ++++++++++++++++++++++++++++++-
 mm/internal.h                   |    1 +
 mm/memcontrol.c                 |  391 ++++++++++
 mm/page_alloc.c                 |   33 +-
 14 files changed, 1862 insertions(+), 29 deletions(-)
 create mode 100644 include/trace/events/dhugetlb.h
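For reviewers, a standalone model of the clamping in dhugetlb_reserve_hugepages() below: growing a reservation is limited by hugepages that are free and not yet reserved, while shrinking is limited by reserved hugepages not currently pinned by mmap-time reservations. This is an illustrative re-implementation, not kernel code:

  #include <stdio.h>

  struct pool {
          unsigned long total_reserved, free_reserved;
          unsigned long free_unreserved, mmap_reserved;
  };

  /* Mirrors the 2M/1G branches of dhugetlb_reserve_hugepages() */
  static void reserve(struct pool *p, unsigned long count)
  {
          unsigned long delta;

          if (count > p->total_reserved) {
                  delta = count - p->total_reserved;
                  if (delta > p->free_unreserved)
                          delta = p->free_unreserved;
                  p->total_reserved += delta;
                  p->free_reserved += delta;
                  p->free_unreserved -= delta;
          } else {
                  delta = p->total_reserved - count;
                  if (delta > p->free_reserved - p->mmap_reserved)
                          delta = p->free_reserved - p->mmap_reserved;
                  p->total_reserved -= delta;
                  p->free_reserved -= delta;
                  p->free_unreserved += delta;
          }
  }

  int main(void)
  {
          struct pool p = { .free_unreserved = 3 };

          reserve(&p, 10);        /* asks for 10, only 3 free: clamps to 3 */
          printf("total_reserved=%lu\n", p.total_reserved);
          return 0;
  }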
diff --git a/fs/Kconfig b/fs/Kconfig index 5921bfbebee4..e8800d8a73b3 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,15 @@ config TMPFS_INODE64
If unsure, say N.
+config DYNAMIC_HUGETLB + bool "Dynamic HugeTLB" + depends on HUGETLB_PAGE + depends on MEMCG + depends on CGROUP_HUGETLB + help + Dynamic hugepages are used in a memcg and can be split into small pages + automatically. Tasks in the memcg prefer to allocate dynamic hugepages. + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 005e05c442c5..30a29936372c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1164,6 +1164,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) * private inode. This simplifies hugetlbfs_destroy_inode. */ mpol_shared_policy_init(&p->policy, NULL); + /* Initialize hpool here in case of a quick call to destroy */ + p->hpool = get_dhugetlb_pool_from_task(current);
return &p->vfs_inode; } @@ -1178,6 +1180,8 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + dhugetlb_pool_put(HUGETLBFS_I(inode)->hpool); + HUGETLBFS_I(inode)->hpool = NULL; call_rcu(&inode->i_rcu, hugetlbfs_i_callback); }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 152cb9bdf436..74b0375d7d2b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,7 +501,9 @@ static inline void arch_alloc_page(struct page *page, int order) { } struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); - +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags); +bool free_pages_prepare(struct page *page, unsigned int order, bool check_free); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2d2b06b36bd0..3a82ea9283ec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -289,6 +289,7 @@ struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; unsigned int seals; + struct dhugetlb_pool *hpool; };
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) @@ -655,6 +656,102 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_DYNAMIC_HUGETLB +/* The number of small_page_pool for a dhugetlb_pool */ +#define NR_SMPOOL num_possible_cpus() +/* The max page number in a small_page_pool */ +#define MAX_SMPOOL_PAGE 1024 +/* number to move between list */ +#define BATCH_SMPOOL_PAGE (MAX_SMPOOL_PAGE >> 2) +/* We don't need to try 5 times, or we can't migrate the pages. */ +#define HPOOL_RECLAIM_RETRIES 5 + +extern struct static_key_false dhugetlb_enabled_key; +#define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key)) + +#define DEFAULT_PAGESIZE 4096 +extern rwlock_t dhugetlb_pagelist_rwlock; +struct dhugetlb_pagelist { + unsigned long count; + struct dhugetlb_pool *hpool[0]; +}; +extern struct dhugetlb_pagelist *dhugetlb_pagelist_t; + +struct split_pages { + struct list_head list; + unsigned long start_pfn; + unsigned long free_pages; +}; + +struct small_page_pool { + spinlock_t lock; + unsigned long free_pages; + long used_pages; + struct list_head head_page; +}; + +struct dhugetlb_pool { + int nid; + spinlock_t lock; + spinlock_t reserved_lock; + atomic_t refcnt; + + struct mem_cgroup *attach_memcg; + + struct list_head dhugetlb_1G_freelists; + struct list_head dhugetlb_2M_freelists; + struct list_head dhugetlb_4K_freelists; + + struct list_head split_1G_freelists; + struct list_head split_2M_freelists; + + unsigned long total_nr_pages; + + unsigned long total_reserved_1G; + unsigned long free_reserved_1G; + unsigned long mmap_reserved_1G; + unsigned long used_1G; + unsigned long free_unreserved_1G; + unsigned long nr_split_1G; + + unsigned long total_reserved_2M; + unsigned long free_reserved_2M; + unsigned long mmap_reserved_2M; + unsigned long used_2M; + unsigned long free_unreserved_2M; + unsigned long nr_split_2M; + + unsigned long free_pages; + struct small_page_pool smpool[0]; +}; + +bool dhugetlb_pool_get(struct dhugetlb_pool *hpool); +void dhugetlb_pool_put(struct dhugetlb_pool *hpool); +struct dhugetlb_pool *hpool_alloc(unsigned long nid); +int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, + unsigned long nid, unsigned long size); +bool free_dhugetlb_pool(struct dhugetlb_pool *hpool); +int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool); +struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( + struct page *page); +struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk); +bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool); +void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool); +void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool, + unsigned long count, bool gigantic); +#else +#define dhugetlb_enabled 0 +struct dhugetlb_pool {}; +static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task( + struct task_struct *tsk) +{ + return NULL; +} +static inline void dhugetlb_pool_put(struct dhugetlb_pool *hpool) { return; } +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4517d132d1e2..22f40d5e0e8b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -326,6 +326,7 @@ struct mem_cgroup { };
struct mem_cgroup_extension { + struct dhugetlb_pool *hpool; #ifdef CONFIG_MEMCG_QOS /* Currently support 0 and -1. * in the future it can expand to other value. @@ -1406,4 +1407,18 @@ static inline void memcg_put_cache_ids(void)
#endif /* CONFIG_MEMCG_KMEM */
+#ifdef CONFIG_DYNAMIC_HUGETLB +struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg); +struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask); +void free_page_to_dhugetlb_pool(struct page *page); +int dhugetlb_pool_force_empty(struct mem_cgroup *memcg); +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css); +#else +static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask) +{ + return NULL; +} +static inline void free_page_to_dhugetlb_pool(struct page *page) {} +#endif + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0c5d1c4c71e6..fd6cd68e00a2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -102,6 +102,7 @@ enum pageflags { PG_idle, #endif PG_percpu_ref, + PG_pool, __NR_PAGEFLAGS,
/* Filesystems */ @@ -284,6 +285,7 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ +PAGEFLAG(Pool, pool, PF_NO_TAIL)
/* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) @@ -770,6 +772,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ + 1UL << PG_pool | \ 1UL << PG_unevictable | __PG_MLOCKED)
/* diff --git a/include/trace/events/dhugetlb.h b/include/trace/events/dhugetlb.h new file mode 100644 index 000000000000..20b3a54589d1 --- /dev/null +++ b/include/trace/events/dhugetlb.h @@ -0,0 +1,123 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dhugetlb + +#if !defined(_TRACE_DHUGETLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DHUGETLB_H + +#include <linux/tracepoint.h> +#include <trace/events/mmflags.h> + +#define DHUGETLB_SPLIT_1G 0x01u +#define DHUGETLB_SPLIT_2M 0x02u +#define DHUGETLB_MERGE_4K 0x04u +#define DHUGETLB_MIGRATE_4K 0x08u +#define DHUGETLB_RESV_1G 0x10u +#define DHUGETLB_UNRESV_1G 0x20u +#define DHUGETLB_RESV_2M 0x40u +#define DHUGETLB_UNRESV_2M 0x80u +#define DHUGETLB_ALLOC_1G 0x100u +#define DHUGETLB_FREE_1G 0x200u +#define DHUGETLB_ALLOC_2M 0x400u +#define DHUGETLB_FREE_2M 0x800u + +#define __def_action_names \ + {(unsigned long)DHUGETLB_SPLIT_1G, "split_1G_to_2M"}, \ + {(unsigned long)DHUGETLB_SPLIT_2M, "split_2M_to_4K"}, \ + {(unsigned long)DHUGETLB_MERGE_4K, "merge_4K_to_2M"}, \ + {(unsigned long)DHUGETLB_MIGRATE_4K, "migrate_4K_to_2M"}, \ + {(unsigned long)DHUGETLB_RESV_1G, "resv_1G_page"}, \ + {(unsigned long)DHUGETLB_UNRESV_1G, "unresv_1G_page"}, \ + {(unsigned long)DHUGETLB_RESV_2M, "resv_2M_page"}, \ + {(unsigned long)DHUGETLB_UNRESV_2M, "unresv_2M_page"}, \ + {(unsigned long)DHUGETLB_ALLOC_1G, "alloc_1G_page"}, \ + {(unsigned long)DHUGETLB_FREE_1G, "free_1G_page"}, \ + {(unsigned long)DHUGETLB_ALLOC_2M, "alloc_2M_page"}, \ + {(unsigned long)DHUGETLB_FREE_2M, "free_2M_page"} + +#define show_action(action) \ + (action) ? __print_flags(action, "", \ + __def_action_names \ + ) : "none" + +TRACE_EVENT(dhugetlb_split_merge, + + TP_PROTO(const void *hpool, struct page *page, unsigned long action), + + TP_ARGS(hpool, page, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->action = action; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s", + __entry->hpool, + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn != -1UL ? __entry->pfn : 0, + show_action(__entry->action)) +); + +TRACE_EVENT(dhugetlb_acct_memory, + + TP_PROTO(const void *hpool, unsigned long count, unsigned long action), + + TP_ARGS(hpool, count, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, count ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->count = count; + __entry->action = action; + ), + + TP_printk("hpool=%p action=%s, mmap_count=%lu", + __entry->hpool, + show_action(__entry->action), + __entry->count) +); + +TRACE_EVENT(dhugetlb_alloc_free, + + TP_PROTO(const void *hpool, struct page *page, unsigned long count, + unsigned long action), + + TP_ARGS(hpool, page, count, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, count ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->count = count; + __entry->action = action; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s free_count=%lu", + __entry->hpool, + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn != -1UL ? 
__entry->pfn : 0, + show_action(__entry->action), + __entry->count) +); + +#endif /* _TRACE_DHUGETLB_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index b817bf1885a0..4d06b47129f3 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -81,6 +81,7 @@
#define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ + {1UL << PG_pool, "pool" }, \ {1UL << PG_waiters, "waiters" }, \ {1UL << PG_error, "error" }, \ {1UL << PG_referenced, "referenced" }, \ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7456882e1a0f..b01490b71f32 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -65,6 +65,7 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css); /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -5280,6 +5281,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (css_has_online_children(&cgrp->self)) return -EBUSY;
+#ifdef CONFIG_MEMCG + /* If we use dynamic hugetlb, make sure dhugetlb_pool is free */ + if (!dhugetlb_pool_is_free(cgrp->subsys[memory_cgrp_id])) + return -EBUSY; +#endif /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8319265c1cf..484ffdbf5f45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -396,6 +396,20 @@ static int __init hugepage_init(void) return -EINVAL; }
+ /* + * When we allocate some pages (order = 0), the system may give us + * a higher-order page (order > 0) due to transparent hugepage, which + * causes dynamic hugetlb to be skipped. Using dynamic hugetlb means + * the program has already been optimized, so transparent hugepage + * should not be used in addition (it may even be a negative + * optimization). + */ + if (dhugetlb_enabled) { + transparent_hugepage_flags = 0; + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); + return -EINVAL; + } + /* * hugepages can't be allocated by the buddy allocator */ @@ -2946,9 +2960,9 @@ static unsigned long deferred_split_count(struct shrinker *shrink, { struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long *split_queue_len = &pgdata->split_queue_len; +#ifdef CONFIG_MEMCG struct mem_cgroup_extension *memcg_ext;
-#ifdef CONFIG_MEMCG if (sc->memcg) { memcg_ext = container_of(sc->memcg, struct mem_cgroup_extension, memcg); split_queue_len = &memcg_ext->split_queue_len; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 495d8b5b38fc..4c8c91acd6d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,12 @@ #include <linux/jhash.h> #include <linux/mman.h> #include <linux/share_pool.h> +#include <linux/kthread.h> +#include <linux/cpuhotplug.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h>
#include <asm/page.h> #include <asm/pgtable.h> @@ -39,8 +45,14 @@ #include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include <linux/share_pool.h> +#include <linux/memblock.h> #include "internal.h"
+#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__) +#define CREATE_TRACE_POINTS +#include <trace/events/dhugetlb.h> +#endif + int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -89,7 +101,8 @@ static inline void ClearPageHugeFreed(struct page *head) }
/* Forward declaration */ -static int hugetlb_acct_memory(struct hstate *h, long delta); +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { @@ -103,7 +116,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) if (free) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, - -spool->min_hpages); + -spool->min_hpages, NULL); kfree(spool); } } @@ -123,7 +136,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, spool->hstate = h; spool->min_hpages = min_hpages;
- if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) { kfree(spool); return NULL; } @@ -149,13 +162,17 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * a subpool minimum size must be maintained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct dhugetlb_pool *hpool) { long ret = delta;
if (!spool) return ret;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (dhugetlb_enabled && hpool) + return ret; + spin_lock(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */ @@ -194,13 +211,17 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct dhugetlb_pool *hpool) { long ret = delta;
if (!spool) return delta;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (dhugetlb_enabled && hpool) + return ret; + spin_lock(&spool->lock);
if (spool->max_hpages != -1) /* maximum size accounting */ @@ -594,12 +615,13 @@ void hugetlb_fix_reserve_counts(struct inode *inode) struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
- rsv_adjust = hugepage_subpool_get_pages(spool, 1); + rsv_adjust = hugepage_subpool_get_pages(spool, 1, hpool); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode);
- if (!hugetlb_acct_memory(h, 1)) + if (!hugetlb_acct_memory(h, 1, hpool)) reserved = true; } else if (!rsv_adjust) { reserved = true; @@ -1300,6 +1322,56 @@ static inline void ClearPageHugeTemporary(struct page *page) page[2].mapping = NULL; }
+#ifdef CONFIG_DYNAMIC_HUGETLB +static void free_huge_page_to_dhugetlb_pool(struct page *page, + bool restore_reserve) +{ + struct hstate *h = page_hstate(page); + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page); + if (unlikely(!hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + spin_lock(&hpool->lock); + ClearPagePool(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (!hstate_is_gigantic(h)) { + list_add(&page->lru, &hpool->dhugetlb_2M_freelists); + hpool->free_reserved_2M++; + hpool->used_2M--; + if (restore_reserve) { + hpool->mmap_reserved_2M++; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_RESV_2M); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M, + DHUGETLB_FREE_2M); + } else { + list_add(&page->lru, &hpool->dhugetlb_1G_freelists); + hpool->free_reserved_1G++; + hpool->used_1G--; + if (restore_reserve) { + hpool->mmap_reserved_1G++; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_RESV_1G); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G, + DHUGETLB_FREE_1G); + } + spin_unlock(&hpool->lock); + dhugetlb_pool_put(hpool); +} +#else +void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve) +{ +} +#endif + void free_huge_page(struct page *page) { /* @@ -1320,6 +1392,17 @@ void free_huge_page(struct page *page) restore_reserve = PagePrivate(page); ClearPagePrivate(page);
+ if (dhugetlb_enabled && PagePool(page)) { + spin_lock(&hugetlb_lock); + clear_page_huge_active(page); + list_del(&page->lru); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + spin_unlock(&hugetlb_lock); + free_huge_page_to_dhugetlb_pool(page, restore_reserve); + return; + } + /* * If PagePrivate() was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there @@ -1335,7 +1418,7 @@ void free_huge_page(struct page *page) * after page is free. Therefore, force restore_reserve * operation. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) + if (hugepage_subpool_put_pages(spool, 1, NULL) == 0) restore_reserve = true; }
@@ -2211,6 +2294,81 @@ static void restore_reserve_on_error(struct hstate *h, } }
+#ifdef CONFIG_DYNAMIC_HUGETLB +static struct page *__alloc_huge_page_from_dhugetlb_pool( + struct dhugetlb_pool *hpool, int idx, bool need_unreserved) +{ + unsigned long flags; + struct page *page = NULL; + + spin_lock_irqsave(&hpool->lock, flags); + if (hstate_is_gigantic(&hstates[idx]) && hpool->free_reserved_1G) { + page = list_entry(hpool->dhugetlb_1G_freelists.next, + struct page, lru); + list_del(&page->lru); + hpool->free_reserved_1G--; + hpool->used_1G++; + if (need_unreserved) { + SetPagePrivate(page); + hpool->mmap_reserved_1G--; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_UNRESV_1G); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G, + DHUGETLB_ALLOC_1G); + } else if (!hstate_is_gigantic(&hstates[idx]) && + hpool->free_reserved_2M) { + page = list_entry(hpool->dhugetlb_2M_freelists.next, + struct page, lru); + list_del(&page->lru); + hpool->free_reserved_2M--; + hpool->used_2M++; + if (need_unreserved) { + SetPagePrivate(page); + hpool->mmap_reserved_2M--; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_UNRESV_2M); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M, + DHUGETLB_ALLOC_2M); + } + if (page) { + INIT_LIST_HEAD(&page->lru); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_refcounted(page); + SetPagePool(page); + } + spin_unlock_irqrestore(&hpool->lock, flags); + + return page; +} + +static struct page *alloc_huge_page_from_dhugetlb_pool( + struct vm_area_struct *vma, int idx, int avoid_reserve, + long gbl_chg, struct dhugetlb_pool *hpool) +{ + struct page *page; + bool need_unreserved = false; + + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + need_unreserved = true; + + page = __alloc_huge_page_from_dhugetlb_pool(hpool, idx, + need_unreserved); + + return page; +} +#else +static inline struct page *alloc_huge_page_from_dhugetlb_pool( + struct vm_area_struct *vma, int idx, int avoid_reserve, + long gbl_chg, struct dhugetlb_pool *hpool) +{ + return NULL; +} +#endif + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2221,6 +2379,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; + struct dhugetlb_pool *hpool = + HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
idx = hstate_index(h); /* @@ -2240,7 +2400,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * checked against any subpool limit. */ if (map_chg || avoid_reserve) { - gbl_chg = hugepage_subpool_get_pages(spool, 1); + gbl_chg = hugepage_subpool_get_pages(spool, 1, hpool); if (gbl_chg < 0) { vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); @@ -2262,6 +2422,26 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_subpool_put;
+ if (dhugetlb_enabled && hpool) { + page = alloc_huge_page_from_dhugetlb_pool(vma, idx, + avoid_reserve, + gbl_chg, hpool); + if (page) { + /* + * Use hugetlb_lock to manage the account of + * hugetlb cgroup. + */ + spin_lock(&hugetlb_lock); + list_add(&page->lru, &h->hugepage_activelist); + hugetlb_cgroup_commit_charge(idx, + pages_per_huge_page(hstate_vma(vma)), + h_cg, page); + spin_unlock(&hugetlb_lock); + goto out; + } + goto out_uncharge_cgroup; + } + spin_lock(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken @@ -2284,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); spin_unlock(&hugetlb_lock); - +out: set_page_private(page, (unsigned long)spool);
map_commit = vma_commit_reservation(h, vma, addr); @@ -2300,8 +2480,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, hpool); + hugetlb_acct_memory(h, -rsv_adjust, hpool); } return page;
@@ -2309,7 +2489,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_subpool_put: if (map_chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); + hugepage_subpool_put_pages(spool, 1, hpool); vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3098,6 +3278,932 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_DYNAMIC_HUGETLB +static bool enable_dhugetlb; +DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key); +DEFINE_RWLOCK(dhugetlb_pagelist_rwlock); +struct dhugetlb_pagelist *dhugetlb_pagelist_t; + +bool dhugetlb_pool_get(struct dhugetlb_pool *hpool) +{ + if (!hpool) + return false; + + return atomic_inc_not_zero(&hpool->refcnt); +} + +void dhugetlb_pool_put(struct dhugetlb_pool *hpool) +{ + if (!dhugetlb_enabled || !hpool) + return; + + if (atomic_dec_and_test(&hpool->refcnt)) { + css_put(&hpool->attach_memcg->css); + kfree(hpool); + } +} + +struct dhugetlb_pool *hpool_alloc(unsigned long nid) +{ + int i; + struct dhugetlb_pool *hpool; + + hpool = kzalloc(sizeof(struct dhugetlb_pool) + + NR_SMPOOL * sizeof(struct small_page_pool), GFP_KERNEL); + if (!hpool) + return NULL; + + spin_lock_init(&hpool->lock); + spin_lock_init(&hpool->reserved_lock); + hpool->nid = nid; + atomic_set(&hpool->refcnt, 1); + INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists); + INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists); + INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists); + INIT_LIST_HEAD(&hpool->split_1G_freelists); + INIT_LIST_HEAD(&hpool->split_2M_freelists); + + for (i = 0; i < NR_SMPOOL; i++) { + spin_lock_init(&hpool->smpool[i].lock); + INIT_LIST_HEAD(&hpool->smpool[i].head_page); + } + + return hpool; +} + +int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, + unsigned long nid, unsigned long size) +{ + int ret; + struct page *page, *next; + unsigned long idx; + unsigned long i = 0; + struct hstate *h = size_to_hstate(PUD_SIZE); + + if (!h) + return -ENOMEM; + + spin_lock(&hpool->lock); + spin_lock(&hugetlb_lock); + if (h->free_huge_pages_node[nid] < size) { + ret = -ENOMEM; + goto out_unlock; + } + + list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) { + idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + ret = update_dhugetlb_pagelist(idx, hpool); + if (ret) + continue; + ClearPageHugeFreed(page); + list_move_tail(&page->lru, &hpool->dhugetlb_1G_freelists); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + hpool->total_nr_pages++; + hpool->free_unreserved_1G++; + if (++i == size) + break; + } + ret = 0; +out_unlock: + spin_unlock(&hugetlb_lock); + spin_unlock(&hpool->lock); + return ret; +} + +/* + * When we assign a hugepage to a dhugetlb_pool, we need to record it in + * dhugetlb_pagelist_t. In this situation, we just need read_lock because + * there is no conflict when writing to dhugetlb_pagelist_t->hpool. + * + * If the page's pfn is greater than dhugetlb_pagelist_t->count (which may + * occur due to memory hotplug), we need to realloc enough memory so that + * pfn = dhugetlb_pagelist_t->count - 1 and then record it. + * In this situation, we need write_lock because while we are reallocating, + * the read request should wait.
+ */ +int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool) +{ + read_lock(&dhugetlb_pagelist_rwlock); + if (idx >= dhugetlb_pagelist_t->count) { + unsigned long size; + struct dhugetlb_pagelist *tmp; + + read_unlock(&dhugetlb_pagelist_rwlock); + write_lock(&dhugetlb_pagelist_rwlock); + + size = sizeof(struct dhugetlb_pagelist) + + (idx + 1) * sizeof(struct dhugetlb_pool *); + tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC); + if (!tmp) { + write_unlock(&dhugetlb_pagelist_rwlock); + return -ENOMEM; + } + tmp->count = idx + 1; + dhugetlb_pagelist_t = tmp; + + write_unlock(&dhugetlb_pagelist_rwlock); + read_lock(&dhugetlb_pagelist_rwlock); + } + dhugetlb_pagelist_t->hpool[idx] = hpool; + read_unlock(&dhugetlb_pagelist_rwlock); + return 0; +} + +struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( + struct page *page) +{ + struct dhugetlb_pool *hpool = NULL; + unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + + read_lock(&dhugetlb_pagelist_rwlock); + if (idx < dhugetlb_pagelist_t->count) + hpool = dhugetlb_pagelist_t->hpool[idx]; + read_unlock(&dhugetlb_pagelist_rwlock); + if (dhugetlb_pool_get(hpool)) + return hpool; + return NULL; +} + +struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk) +{ + struct mem_cgroup *memcg; + struct dhugetlb_pool *hpool; + + if (!dhugetlb_enabled) + return NULL; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(tsk); + rcu_read_unlock(); + + hpool = get_dhugetlb_pool_from_memcg(memcg); + + return hpool; +} + +static void add_new_huge_page_to_pool(struct dhugetlb_pool *hpool, + struct page *page, bool gigantic) +{ + lockdep_assert_held(&hpool->lock); + VM_BUG_ON_PAGE(page_mapcount(page), page); + INIT_LIST_HEAD(&page->lru); + + if (gigantic) { + prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT); + list_add_tail(&page->lru, &hpool->dhugetlb_1G_freelists); + hpool->free_unreserved_1G++; + } else { + prep_new_page(page, PMD_SHIFT - PAGE_SHIFT, __GFP_COMP, 0); + set_page_count(page, 0); + list_add_tail(&page->lru, &hpool->dhugetlb_2M_freelists); + hpool->free_unreserved_2M++; + } + set_page_private(page, 0); + page->mapping = NULL; + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_hugetlb_cgroup(page, NULL); +} + +static void free_dhugetlb_pcpool(struct dhugetlb_pool *hpool) +{ + int i; + struct small_page_pool *smpool; + + for (i = 0; i < NR_SMPOOL; i++) { + smpool = &hpool->smpool[i]; + list_splice(&smpool->head_page, &hpool->dhugetlb_4K_freelists); + smpool->free_pages = 0; + smpool->used_pages = 0; + INIT_LIST_HEAD(&smpool->head_page); + } +} + +static void __free_dhugetlb_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + struct split_pages *split_huge, *split_next; + + if (list_empty(&hpool->dhugetlb_4K_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_4K_freelists, lru) { + list_del(&page->lru); + add_new_huge_page_to_pool(hpool, page, false); + } + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + } + + hpool->free_pages = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists); +} + +static void free_dhugetlb_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + lockdep_assert_held(&hpool->lock); + if (list_empty(&hpool->dhugetlb_4K_freelists)) + return; + + list_for_each_entry_safe(page, next, + 
&hpool->dhugetlb_4K_freelists, lru) { + if (page_to_pfn(page) % nr_pages != 0) + list_del(&page->lru); + } + + __free_dhugetlb_small_page(hpool); +} + +static void __free_dhugetlb_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + struct split_pages *split_giga, *split_next; + + if (list_empty(&hpool->dhugetlb_2M_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_2M_freelists, lru) { + list_del(&page->lru); + add_new_huge_page_to_pool(hpool, page, true); + } + list_for_each_entry_safe(split_giga, split_next, + &hpool->split_1G_freelists, list) { + list_del(&split_giga->list); + kfree(split_giga); + hpool->nr_split_1G--; + } + + hpool->total_reserved_2M = 0; + hpool->free_reserved_2M = 0; + hpool->free_unreserved_2M = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists); +} + +static void free_dhugetlb_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + unsigned long nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT); + unsigned long block_size = 1 << (PMD_SHIFT - PAGE_SHIFT); + int i; + + lockdep_assert_held(&hpool->lock); + if (list_empty(&hpool->dhugetlb_2M_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_2M_freelists, lru) { + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + atomic_set(compound_mapcount_ptr(page), 0); + for (i = 1; i < block_size; i++) + clear_compound_head(&page[i]); + set_compound_order(page, 0); + __ClearPageHead(page); + if (page_to_pfn(page) % nr_pages != 0) + list_del(&page->lru); + } + __free_dhugetlb_huge_page(hpool); +} + +static int try_migrate_page(struct page *page, unsigned long nid) +{ + unsigned long pfn = page_to_pfn(page); + int ret = 0; + + LIST_HEAD(source); + + if (!pfn_valid(pfn)) + return 0; + BUG_ON(PageHuge(page) || PageTransHuge(page)); + /* + * HWPoison pages have elevated reference counts so the migration + * would fail on them. It also doesn't make any sense to migrate them + * in the first place. Still try to unmap such a page in case it is + * still mapped(e.g. current hwpoison implementation doesn't unmap + * KSM pages but keep the unmap as the catch all safety net). + */ + if (PageHWPoison(page)) { + if (WARN_ON(PageLRU(page))) + isolate_lru_page(page); + if (page_mapped(page)) + try_to_unmap(page, + TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + return 0; + } + + if (!get_page_unless_zero(page)) + return 0; + /* + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. 
+ */ + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); + put_page(page); + if (ret) { + if (page_count(page)) + ret = -EBUSY; + return ret; + } + list_add_tail(&page->lru, &source); + if (!__PageMovable(page)) + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); + + ret = migrate_pages(&source, alloc_new_node_page, NULL, nid, + MIGRATE_SYNC_LIGHT, MR_COMPACTION); + if (ret) + putback_movable_pages(&source); + return ret; +} + +static void try_migrate_pages(struct dhugetlb_pool *hpool) +{ + int i, j; + unsigned long nr_free_pages; + struct split_pages *split_giga, *next; + unsigned int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + struct page *page; + int sleep_interval = 100; /* wait for the migration */ + + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + msleep(sleep_interval); + dhugetlb_pool_force_empty(hpool->attach_memcg); + + spin_lock(&hpool->lock); + nr_free_pages = hpool->free_pages; + spin_unlock(&hpool->lock); + for (i = 0; i < NR_SMPOOL; i++) { + spin_lock(&hpool->smpool[i].lock); + nr_free_pages += hpool->smpool[i].free_pages; + spin_unlock(&hpool->smpool[i].lock); + } + + if (nr_free_pages >> HUGETLB_PAGE_ORDER < hpool->nr_split_2M) { + list_for_each_entry_safe(split_giga, next, + &hpool->split_1G_freelists, list) { + for (i = 0; i < nr_pages; i++) { + if (PageCompound(pfn_to_page( + split_giga->start_pfn + i * nr_pages))) + continue; + page = pfn_to_page(split_giga->start_pfn + + i * nr_pages); + for (j = 0; j < nr_pages; j++) { + if (PagePool(page + j)) + try_migrate_page(page + j, + hpool->nid); + } + } + } + } + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); +} + +/* + * If some pages are still in use, we will try to reclaim/migrate them. + * After trying at most HPOOL_RECLAIM_RETRIES times, we may succeed. + * Otherwise we will print the failure information and return false.
+ */ +static bool free_dhugetlb_pages(struct dhugetlb_pool *hpool) +{ + int i; + long used_pages; + int try_count = 0; + +retry: + used_pages = 0; + for (i = 0; i < NR_SMPOOL; i++) + used_pages += hpool->smpool[i].used_pages; + + if (try_count < HPOOL_RECLAIM_RETRIES && + (used_pages || hpool->used_2M || hpool->used_1G)) { + try_migrate_pages(hpool); + try_count++; + goto retry; + } + + if (used_pages) + pr_err("dhugetlb: some 4K pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else if (hpool->used_2M) + pr_err("dhugetlb: some 2M pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else if (hpool->used_1G) + pr_err("dhugetlb: some 1G pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else { + free_dhugetlb_pcpool(hpool); + free_dhugetlb_small_page(hpool); + free_dhugetlb_huge_page(hpool); + return true; + } + return false; +} + +static void free_back_hugetlb(struct dhugetlb_pool *hpool) +{ + int nid; + unsigned int nr_pages; + unsigned long pfn, idx; + struct page *page, *page_next, *p; + struct hstate *h = size_to_hstate(PUD_SIZE); + + if (!h) + return; + + spin_lock(&hugetlb_lock); + list_for_each_entry_safe(page, page_next, + &hpool->dhugetlb_1G_freelists, lru) { + nr_pages = 1 << huge_page_order(h); + pfn = page_to_pfn(page); + for (; nr_pages--; pfn++) { + p = pfn_to_page(pfn); + p->mapping = NULL; + } + SetPageHugeFreed(page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + nid = page_to_nid(page); + BUG_ON(nid >= MAX_NUMNODES); + list_move(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages_node[nid]++; + read_lock(&dhugetlb_pagelist_rwlock); + idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + if (idx < dhugetlb_pagelist_t->count) + dhugetlb_pagelist_t->hpool[idx] = NULL; + read_unlock(&dhugetlb_pagelist_rwlock); + } + h->free_huge_pages += hpool->total_nr_pages; + hpool->total_nr_pages = 0; + hpool->free_unreserved_1G = 0; + hpool->free_reserved_1G = 0; + hpool->total_reserved_1G = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists); + spin_unlock(&hugetlb_lock); +} + +bool free_dhugetlb_pool(struct dhugetlb_pool *hpool) +{ + int i; + bool ret = false; + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + ret = free_dhugetlb_pages(hpool); + if (!ret) + goto out_unlock; + + free_back_hugetlb(hpool); + +out_unlock: + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + if (ret) + dhugetlb_pool_put(hpool); + return ret; +} + +static void __split_free_huge_page(struct dhugetlb_pool *hpool, + struct page *page) +{ + int i; + int order_h = PUD_SHIFT - PAGE_SHIFT; + int order_m = PMD_SHIFT - PAGE_SHIFT; + int blocks = 1 << (order_h - order_m); + struct page *p = page + 1; + + lockdep_assert_held(&hpool->lock); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + atomic_set(compound_mapcount_ptr(page), 0); + for (i = 1; i < (1 << order_h); i++, p = mem_map_next(p, page, i)) + clear_compound_head(p); + + set_compound_order(page, 0); + __ClearPageHead(page); + + /* make it be 2M huge pages and put it to huge pool */ + for (i = 0; i < blocks; i++, page += (1 << order_m)) + add_new_huge_page_to_pool(hpool, page, false); +} + +static void __split_free_small_page(struct dhugetlb_pool *hpool, + struct page *page) +{ + int i; + int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + lockdep_assert_held(&hpool->lock); + set_compound_page_dtor(page, 
NULL_COMPOUND_DTOR); + set_compound_order(page, 0); + for (i = 0; i < nr_pages; i++) { + if (i != 0) { + page[i].mapping = NULL; + clear_compound_head(&page[i]); + } else + __ClearPageHead(page); + + /* + * If a hugepage is mapped in private mode, the PG_uptodate bit + * will not be cleared when the hugepage is freed. Clear the + * hugepage using free_pages_prepare() here. + */ + free_pages_prepare(&page[i], 0, false); + hpool->free_pages++; + list_add_tail(&page[i].lru, &hpool->dhugetlb_4K_freelists); + } +} + +static bool split_free_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page; + struct split_pages *split_page; + + lockdep_assert_held(&hpool->lock); + + if (!hpool->free_unreserved_1G) + return false; + + split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC); + if (!split_page) + return false; + + page = list_entry(hpool->dhugetlb_1G_freelists.next, struct page, lru); + list_del(&page->lru); + hpool->free_unreserved_1G--; + + split_page->start_pfn = page_to_pfn(page); + list_add(&split_page->list, &hpool->split_1G_freelists); + hpool->nr_split_1G++; + + trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_1G); + + __split_free_huge_page(hpool, page); + return true; +} + +static bool split_free_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page; + struct split_pages *split_page; + + lockdep_assert_held(&hpool->lock); + + if (!hpool->free_unreserved_2M && !split_free_huge_page(hpool)) + return false; + + split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC); + if (!split_page) + return false; + + page = list_entry(hpool->dhugetlb_2M_freelists.next, struct page, lru); + list_del(&page->lru); + hpool->free_unreserved_2M--; + + split_page->start_pfn = page_to_pfn(page); + list_add(&split_page->list, &hpool->split_2M_freelists); + hpool->nr_split_2M++; + + trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_2M); + + __split_free_small_page(hpool, page); + return true; +} + +bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool) +{ + int i = 0; + struct page *page, *next; + + if (!hpool->free_pages && !split_free_small_page(hpool)) + return false; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_4K_freelists, lru) { + list_del(&page->lru); + hpool->free_pages--; + list_add_tail(&page->lru, &smpool->head_page); + smpool->free_pages++; + if (++i == BATCH_SMPOOL_PAGE) + break; + } + return true; +} + +void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool) +{ + int i = 0; + struct page *page, *next; + + list_for_each_entry_safe(page, next, &smpool->head_page, lru) { + list_del(&page->lru); + smpool->free_pages--; + list_add(&page->lru, &hpool->dhugetlb_4K_freelists); + hpool->free_pages++; + if (++i == BATCH_SMPOOL_PAGE) + break; + } +} + +static unsigned long list_len(struct list_head *head) +{ + unsigned long len = 0; + struct page *page; + + list_for_each_entry(page, head, lru) + len++; + + return len; +} + +static void hugetlb_migrate_pages(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i, try; + struct page *page; + struct split_pages *split_huge, *split_next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + LIST_HEAD(wait_page_list); + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + /* + * Isolate the free pages first because we don't want them to + * be allocated.
+ */ + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) + list_move(&page->lru, &wait_page_list); + } + + for (try = 0; try < HPOOL_RECLAIM_RETRIES; try++) { + /* + * Unlock and try migration, after migration we need + * to lock back. + */ + for (i = 0; i < NR_SMPOOL; i++) + hpool->smpool[i].free_pages = + list_len(&hpool->smpool[i].head_page); + hpool->free_pages = + list_len(&hpool->dhugetlb_4K_freelists); + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (PagePool(page)) + try_migrate_page(page, hpool->nid); + } + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + /* + * Isolate free page. If all page in the split_huge + * is free, return it. + */ + split_huge->free_pages = 0; + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) { + list_move(&page->lru, &wait_page_list); + split_huge->free_pages++; + } + } + if (split_huge->free_pages == nr_pages) + break; + } + if (split_huge->free_pages == nr_pages) { + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + list_del(&page->lru); + } + INIT_LIST_HEAD(&wait_page_list); + page = pfn_to_page(split_huge->start_pfn); + add_new_huge_page_to_pool(hpool, page, false); + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + + trace_dhugetlb_split_merge(hpool, page, + DHUGETLB_MIGRATE_4K); + + if (--count == 0) + return; + } else { + /* Failed, put back the isolate pages */ + list_splice(&wait_page_list, + &hpool->dhugetlb_4K_freelists); + INIT_LIST_HEAD(&wait_page_list); + } + } +} + +static unsigned long merge_free_split_huge(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + struct page *page; + struct split_pages *split_huge, *split_next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + split_huge->free_pages = 0; + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) + split_huge->free_pages++; + } + if (split_huge->free_pages == nr_pages) { + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + list_del(&page->lru); + } + page = pfn_to_page(split_huge->start_pfn); + add_new_huge_page_to_pool(hpool, page, false); + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + + trace_dhugetlb_split_merge(hpool, page, + DHUGETLB_MERGE_4K); + + if (--count == 0) + return 0; + } + } + return count; +} + +static void merge_free_small_page(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + unsigned long need_migrate; + + if (!hpool->nr_split_2M) + return; + + need_migrate = merge_free_split_huge(hpool, count); + if (need_migrate) + hugetlb_migrate_pages(hpool, need_migrate); + + for (i = 0; i < NR_SMPOOL; i++) + hpool->smpool[i].free_pages = + list_len(&hpool->smpool[i].head_page); + hpool->free_pages = list_len(&hpool->dhugetlb_4K_freelists); +} + +static void dhugetlb_collect_2M_pages(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + + while (hpool->free_unreserved_1G && + count > hpool->free_unreserved_2M) + split_free_huge_page(hpool); + + /* + * If we try to merge 4K pages to 2M, we need to unlock hpool->lock + * first, and then try to lock every 
lock in order to avoid deadlock. + */ + if (count > hpool->free_unreserved_2M) { + spin_unlock(&hpool->lock); + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + merge_free_small_page(hpool, count - hpool->free_unreserved_2M); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + } +} + +/* + * Parameter gigantic: true means reserve 1G pages and false means reserve + * 2M pages. When we want to reserve 2M pages more than + * hpool->free_unreserved_2M, we have to try split/merge. Still, we can't + * guarantee success. + */ +void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool, + unsigned long count, bool gigantic) +{ + unsigned long delta; + + spin_lock(&hpool->lock); + if (gigantic) { + if (count > hpool->total_reserved_1G) { + delta = min(count - hpool->total_reserved_1G, + hpool->free_unreserved_1G); + hpool->total_reserved_1G += delta; + hpool->free_reserved_1G += delta; + hpool->free_unreserved_1G -= delta; + } else { + delta = min(hpool->total_reserved_1G - count, + hpool->free_reserved_1G - + hpool->mmap_reserved_1G); + hpool->total_reserved_1G -= delta; + hpool->free_reserved_1G -= delta; + hpool->free_unreserved_1G += delta; + } + } else { + if (count > hpool->total_reserved_2M) { + delta = count - hpool->total_reserved_2M; + if (delta > hpool->free_unreserved_2M) + dhugetlb_collect_2M_pages(hpool, delta); + delta = min(count - hpool->total_reserved_2M, + hpool->free_unreserved_2M); + hpool->total_reserved_2M += delta; + hpool->free_reserved_2M += delta; + hpool->free_unreserved_2M -= delta; + } else { + delta = min(hpool->total_reserved_2M - count, + hpool->free_reserved_2M - + hpool->mmap_reserved_2M); + hpool->total_reserved_2M -= delta; + hpool->free_reserved_2M -= delta; + hpool->free_unreserved_2M += delta; + } + } + spin_unlock(&hpool->lock); +} + +static int dhugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) +{ + int ret = -ENOMEM; + + if (delta == 0) + return 0; + + spin_lock(&hpool->lock); + if (hstate_is_gigantic(h)) { + if (delta > 0 && delta <= hpool->free_reserved_1G - + hpool->mmap_reserved_1G) { + hpool->mmap_reserved_1G += delta; + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_RESV_1G); + } else if (delta < 0) { + hpool->mmap_reserved_1G -= (unsigned long)(-delta); + WARN_ON(hpool->mmap_reserved_1G < 0); + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_UNRESV_1G); + } + } else { + if (delta > 0 && delta <= hpool->free_reserved_2M - + hpool->mmap_reserved_2M) { + hpool->mmap_reserved_2M += delta; + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_RESV_2M); + } else if (delta < 0) { + hpool->mmap_reserved_2M -= (unsigned long)(-delta); + WARN_ON(hpool->mmap_reserved_2M < 0); + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_UNRESV_2M); + } + } + spin_unlock(&hpool->lock); + + return ret; +} +#else +static int dhugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static int __init hugetlb_init(void) { int i; @@ -3134,6 +4240,23 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init();
+#ifdef CONFIG_DYNAMIC_HUGETLB + if (enable_dhugetlb) { + unsigned long count = max(max_pfn >> (PUD_SHIFT - PAGE_SHIFT), + (unsigned long)DEFAULT_PAGESIZE); + unsigned long size = sizeof(struct dhugetlb_pagelist) + + count * sizeof(struct dhugetlb_pool *); + dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); + if (dhugetlb_pagelist_t) { + dhugetlb_pagelist_t->count = count; + static_branch_enable(&dhugetlb_enabled_key); + pr_info("Dynamic 1G hugepage enabled\n"); + } else + pr_info("Dynamic 1G hugepage disabled due to out of memory, need %lu\n", + size); + } +#endif + #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else @@ -3270,6 +4393,16 @@ static int __init hugetlb_nrpages_setup(char *s) } __setup("hugepages=", hugetlb_nrpages_setup);
+#ifdef CONFIG_DYNAMIC_HUGETLB +static int __init dhugetlb_setup(char *s) +{ + if (!strcmp(s, "on")) + enable_dhugetlb = true; + return 1; +} +__setup("dynamic_1G_hugepage=", dhugetlb_setup); +#endif + static int __init hugetlb_default_setup(char *s) { default_hstate_size = memparse(s, &s); @@ -3471,10 +4604,14 @@ unsigned long hugetlb_total_pages(void) return nr_total_pages; }
-static int hugetlb_acct_memory(struct hstate *h, long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) { int ret = -ENOMEM;
+ if (dhugetlb_enabled && hpool) + return dhugetlb_acct_memory(h, delta, hpool); + spin_lock(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page @@ -3535,6 +4672,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + struct dhugetlb_pool *hpool = + HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -3551,8 +4690,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ - gbl_reserve = hugepage_subpool_put_pages(spool, reserve); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, reserve, hpool); + hugetlb_acct_memory(h, -gbl_reserve, hpool); } }
@@ -4934,6 +6073,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; long gbl_reserve; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* This should never happen */ if (from > to) { @@ -4986,7 +6126,7 @@ int hugetlb_reserve_pages(struct inode *inode, * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ - gbl_reserve = hugepage_subpool_get_pages(spool, chg); + gbl_reserve = hugepage_subpool_get_pages(spool, chg, hpool); if (gbl_reserve < 0) { ret = -ENOSPC; goto out_err; @@ -4996,10 +6136,10 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, gbl_reserve); + ret = hugetlb_acct_memory(h, gbl_reserve, hpool); if (ret < 0) { /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + (void)hugepage_subpool_put_pages(spool, chg, hpool); goto out_err; }
@@ -5028,8 +6168,9 @@ int hugetlb_reserve_pages(struct inode *inode, long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool, - chg - add); - hugetlb_acct_memory(h, -rsv_adjust); + chg - add, + hpool); + hugetlb_acct_memory(h, -rsv_adjust, hpool); } } return 0; @@ -5051,6 +6192,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* * Since this routine can be called in the evict inode path for all @@ -5075,8 +6217,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. */ - gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), hpool); + hugetlb_acct_memory(h, -gbl_reserve, hpool);
return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1b861446c751..deffd247b010 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -182,6 +182,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); +extern int check_new_page(struct page *page); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 63b91a030b02..bdc90e6fc082 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -997,6 +997,41 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) return get_mem_cgroup_from_mm(current->mm); }
+#ifdef CONFIG_DYNAMIC_HUGETLB +void free_page_to_dhugetlb_pool(struct page *page) +{ + struct dhugetlb_pool *hpool; + struct small_page_pool *smpool; + unsigned long flags; + + hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page); + if (unlikely(!hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + smpool = &hpool->smpool[smp_processor_id()]; + spin_lock_irqsave(&smpool->lock, flags); + + ClearPagePool(page); + if (!free_pages_prepare(page, 0, false)) { + SetPagePool(page); + goto out; + } + list_add(&page->lru, &smpool->head_page); + smpool->free_pages++; + smpool->used_pages--; + if (smpool->free_pages > MAX_SMPOOL_PAGE) { + spin_lock(&hpool->lock); + move_pages_from_smpool_to_hpool(hpool, smpool); + spin_unlock(&hpool->lock); + } +out: + spin_unlock_irqrestore(&smpool->lock, flags); + dhugetlb_pool_put(hpool); +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -3118,6 +3153,31 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return 0; } +#ifdef CONFIG_DYNAMIC_HUGETLB +int dhugetlb_pool_force_empty(struct mem_cgroup *memcg) +{ + lru_add_drain_all(); + + drain_all_stock(memcg); + + while (page_counter_read(&memcg->memory)) { + int progress; + + if (signal_pending(current)) + return -EINTR; + + progress = try_to_free_mem_cgroup_pages(memcg, 1, + GFP_HIGHUSER_MOVABLE, + false); + + if (!progress) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + break; + } + } + return 0; +} +#endif
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -4652,6 +4712,305 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, return ret; }
+#ifdef CONFIG_DYNAMIC_HUGETLB +struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg) +{ + struct mem_cgroup_extension *memcg_ext; + + if (!memcg) + return NULL; + + memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg); + if (dhugetlb_pool_get(memcg_ext->hpool)) + return memcg_ext->hpool; + return NULL; +} + +static void set_dhugetlb_pool_to_memcg(struct mem_cgroup *memcg, + struct dhugetlb_pool *hpool) +{ + struct mem_cgroup_extension *memcg_ext; + + memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg); + + memcg_ext->hpool = hpool; +} + +static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask) +{ + gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE; + + if (current->flags & PF_KTHREAD) + return false; + + /* + * The cgroup only charges anonymous and file pages from userspace. + * Some filesystems may have masked out __GFP_IO | __GFP_FS + * to avoid recursive memory requests, e.g. loop device and xfs. + */ + if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE) + return false; + + return true; +} + +static struct page *__alloc_page_from_dhugetlb_pool(void) +{ + bool ret; + struct dhugetlb_pool *hpool; + struct small_page_pool *smpool; + struct page *page = NULL; + unsigned long flags; + + hpool = get_dhugetlb_pool_from_task(current); + if (unlikely(!hpool)) + goto out; + + smpool = &hpool->smpool[smp_processor_id()]; + spin_lock_irqsave(&smpool->lock, flags); + + if (smpool->free_pages == 0) { + spin_lock(&hpool->lock); + ret = move_pages_from_hpool_to_smpool(hpool, smpool); + spin_unlock(&hpool->lock); + if (!ret) + goto unlock; + } + + page = list_entry(smpool->head_page.next, struct page, lru); + list_del(&page->lru); + smpool->free_pages--; + smpool->used_pages++; + check_new_page(page); + SetPagePool(page); +unlock: + spin_unlock_irqrestore(&smpool->lock, flags); +out: + dhugetlb_pool_put(hpool); + return page; +} + +struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask) +{ + struct page *page = NULL; + + if (should_allocate_from_dhugetlb_pool(gfp_mask)) + page = __alloc_page_from_dhugetlb_pool(); + + return page; +} + +static void assign_new_dhugetlb_pool(struct mem_cgroup *memcg, + unsigned long nid) +{ + struct dhugetlb_pool *hpool; + + hpool = hpool_alloc(nid); + if (!hpool) + return; + + hpool->attach_memcg = memcg; + css_get(&memcg->css); + set_dhugetlb_pool_to_memcg(memcg, hpool); +} + +static int update_dhugetlb_pool(struct mem_cgroup *memcg, + unsigned long nid, unsigned long size) +{ + int ret; + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg); + + if (!hpool) { + if (memcg_has_children(memcg)) + return -EINVAL; + assign_new_dhugetlb_pool(memcg, nid); + hpool = get_dhugetlb_pool_from_memcg(memcg); + } + if (!hpool) + return -ENOMEM; + if (hpool->attach_memcg != memcg || hpool->nid != nid) { + dhugetlb_pool_put(hpool); + return -EINVAL; + } + + ret = alloc_hugepage_from_hugetlb(hpool, nid, size); + + dhugetlb_pool_put(hpool); + return ret; +} + +/* + * Allocate the specified amount of memory to the dhugetlb pool. + * + * Input must be in format '<nid> <size>'. + * size is interpreted as the number of 1G hugepages.
+ */ +static ssize_t memcg_write_dhugetlb(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + int ret; + unsigned long nid, size; + char *endp; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + if (!dhugetlb_enabled) + return -EINVAL; + + buf = strstrip(buf); + nid = memparse(buf, &endp); + if (*endp != ' ' || nid >= MAX_NUMNODES) + return -EINVAL; + + buf = endp + 1; + size = memparse(buf, &endp); + if (*endp != '\0' || size == 0) + return -EINVAL; + + ret = update_dhugetlb_pool(memcg, nid, size); + + return ret ?: nbytes; +} + +static int memcg_read_dhugetlb(struct seq_file *m, void *v) +{ + int i; + unsigned long free_pages; + long used_pages = 0; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg); + + if (!dhugetlb_enabled) + return 0; + if (!hpool) { + seq_printf(m, "Current hierarchy has no memory pool.\n"); + return 0; + } + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + free_pages = hpool->free_pages; + for (i = 0; i < NR_SMPOOL; i++) { + free_pages += hpool->smpool[i].free_pages; + used_pages += hpool->smpool[i].used_pages; + } + + seq_printf(m, "dhugetlb_total_pages %ld\n" + "1G_total_reserved_pages %ld\n" + "1G_free_reserved_pages %ld\n" + "1G_mmap_reserved_pages %ld\n" + "1G_used_pages %ld\n" + "1G_free_unreserved_pages %ld\n" + "2M_total_reserved_pages %ld\n" + "2M_free_reserved_pages %ld\n" + "2M_mmap_reserved_pages %ld\n" + "2M_used_pages %ld\n" + "2M_free_unreserved_pages %ld\n" + "4K_free_pages %ld\n" + "4K_used_pages %ld\n", + hpool->total_nr_pages, + hpool->total_reserved_1G, + hpool->free_reserved_1G, + hpool->mmap_reserved_1G, + hpool->used_1G, + hpool->free_unreserved_1G, + hpool->total_reserved_2M, + hpool->free_reserved_2M, + hpool->mmap_reserved_2M, + hpool->used_2M, + hpool->free_unreserved_2M, + free_pages, + used_pages); + + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + dhugetlb_pool_put(hpool); + return 0; +} + +static int update_reserve_pages(struct kernfs_open_file *of, + char *buf, bool gigantic) +{ + unsigned long size; + char *endp; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct dhugetlb_pool *hpool; + + if (!dhugetlb_enabled) + return -EINVAL; + + buf = strstrip(buf); + size = memparse(buf, &endp); + if (*endp != '\0') + return -EINVAL; + + hpool = get_dhugetlb_pool_from_memcg(memcg); + if (!hpool) + return -EINVAL; + spin_lock(&hpool->reserved_lock); + dhugetlb_reserve_hugepages(hpool, size, gigantic); + spin_unlock(&hpool->reserved_lock); + dhugetlb_pool_put(hpool); + return 0; +} + +static ssize_t dhugetlb_1G_reserve_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return update_reserve_pages(of, buf, true) ?: nbytes; +} + +static ssize_t dhugetlb_2M_reserve_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return update_reserve_pages(of, buf, false) ?: nbytes; +} + +static void dhugetlb_pool_inherits(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_memcg(parent); + if (!hpool) + return; + + set_dhugetlb_pool_to_memcg(memcg, hpool); + dhugetlb_pool_put(hpool); +} + +static bool dhugetlb_pool_free(struct mem_cgroup *memcg) +{ + bool ret = true; + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_memcg(memcg); + if (hpool && hpool->attach_memcg == memcg) + ret = free_dhugetlb_pool(hpool); + dhugetlb_pool_put(hpool); + return ret; +} + +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css) +{ + if (dhugetlb_enabled) + return dhugetlb_pool_free(mem_cgroup_from_css(css)); + return true; +} +#else +static void dhugetlb_pool_inherits(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ +} + +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css) +{ + return true; +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4700,6 +5059,27 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, }, +#ifdef CONFIG_DYNAMIC_HUGETLB + { + .name = "dhugetlb.nr_pages", + .write = memcg_write_dhugetlb, + .seq_show = memcg_read_dhugetlb, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.1G.reserved_pages", + .write = dhugetlb_1G_reserve_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.2M.reserved_pages", + .write = dhugetlb_2M_reserve_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, +#endif { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@ -5063,6 +5443,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; }
+ if (dhugetlb_enabled) + dhugetlb_pool_inherits(memcg, parent); + error = memcg_online_kmem(memcg); if (error) goto fail; @@ -5681,6 +6064,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) if (!p) return 0;
+ if (dhugetlb_enabled) { + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_task(p); + + if (hpool) { + dhugetlb_pool_put(hpool); + return -EPERM; + } + } /* * We are now commited to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6a2f254f61f..e722d73a3724 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1052,7 +1052,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; }
-static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { int bad = 0; @@ -2012,7 +2012,7 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static inline int check_new_page(struct page *page) +inline int check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) @@ -2075,8 +2075,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); }
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned int alloc_flags) +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { int i;
@@ -2955,6 +2955,12 @@ void free_unref_page(struct page *page) unsigned long flags; unsigned long pfn = page_to_pfn(page);
+ /* Free dynamic hugetlb page */ + if (dhugetlb_enabled && PagePool(page)) { + free_page_to_dhugetlb_pool(page); + return; + } + if (!free_unref_page_prepare(page, pfn)) return;
@@ -2972,6 +2978,16 @@ void free_unref_page_list(struct list_head *list) unsigned long flags, pfn; int batch_count = 0;
+ /* Free dynamic hugetlb pages */ + if (dhugetlb_enabled) { + list_for_each_entry_safe(page, next, list, lru) { + if (PagePool(page)) { + list_del(&page->lru); + free_page_to_dhugetlb_pool(page); + } + } + } + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); @@ -4785,6 +4801,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /* Dynamic hugetlb allocation attempt */ + if (dhugetlb_enabled && likely(order == 0)) { + page = alloc_page_from_dhugetlb_pool(gfp_mask); + if (page) { + prep_new_page(page, order, gfp_mask, alloc_flags); + goto out; + } + } + /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page))
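With the memcg interface above in place, the feature can be driven from userspace. The following is a minimal usage sketch, not part of the patch: the cgroup mount point, the child cgroup name and the written values are assumptions; only the file names (note CFTYPE_NO_PREFIX, so no "memory." prefix) and the '<nid> <size>' input format come from the cftype definitions and memcg_write_dhugetlb() above.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
}

int main(void)
{
	/* Assumed v1 memcg hierarchy with a child cgroup named "test". */
	const char *base = "/sys/fs/cgroup/memory/test";
	char path[256];

	/* Move four 1G hugepages from node 0 into this memcg's pool. */
	snprintf(path, sizeof(path), "%s/dhugetlb.nr_pages", base);
	write_str(path, "0 4");

	/* Reserve part of the pool as 1G and 2M hugepages. */
	snprintf(path, sizeof(path), "%s/dhugetlb.1G.reserved_pages", base);
	write_str(path, "2");
	snprintf(path, sizeof(path), "%s/dhugetlb.2M.reserved_pages", base);
	write_str(path, "512");
	return 0;
}

Pool pages reserved as neither 1G nor 2M remain usable by tasks in the cgroup as ordinary 4K pages via alloc_page_from_dhugetlb_pool(), as the order-0 fast path in __alloc_pages_nodemask() above shows.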
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME CVE: NA
--------------------------------
Wrap the hpool member of struct mem_cgroup_extension and struct hugetlbfs_inode_info in CONFIG_DYNAMIC_HUGETLB to isolate it.
Guard the PG_pool page flag with CONFIG_DYNAMIC_HUGETLB as well, so it is only defined when the feature is enabled.
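With the flag disabled, the PAGEFLAG_FALSE(Pool) fallback added below keeps call sites unconditional. As a sketch of what that fallback provides (following the generic page-flags convention; not new code in this patch):

static inline int PagePool(struct page *page) { return 0; }
static inline void SetPagePool(struct page *page) { }
static inline void ClearPagePool(struct page *page) { }

so tests such as "dhugetlb_enabled && PagePool(page)" in the allocator compile to constant false when CONFIG_DYNAMIC_HUGETLB is off.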
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- fs/hugetlbfs/inode.c | 4 ++++ include/linux/hugetlb.h | 2 ++ include/linux/memcontrol.h | 2 ++ include/linux/page-flags.h | 10 +++++++++- include/trace/events/mmflags.h | 8 +++++++- mm/hugetlb.c | 22 +++++++++++++++++----- 6 files changed, 41 insertions(+), 7 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 30a29936372c..e411103d2cf4 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1164,8 +1164,10 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) * private inode. This simplifies hugetlbfs_destroy_inode. */ mpol_shared_policy_init(&p->policy, NULL); +#ifdef CONFIG_DYNAMIC_HUGETLB /* Initialize hpool here in case of a quick call to destroy */ p->hpool = get_dhugetlb_pool_from_task(current); +#endif
return &p->vfs_inode; } @@ -1180,8 +1182,10 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); +#ifdef CONFIG_DYNAMIC_HUGETLB dhugetlb_pool_put(HUGETLBFS_I(inode)->hpool); HUGETLBFS_I(inode)->hpool = NULL; +#endif call_rcu(&inode->i_rcu, hugetlbfs_i_callback); }
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a82ea9283ec..92fd6e2cbaf3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -289,7 +289,9 @@ struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; unsigned int seals; +#ifdef CONFIG_DYNAMIC_HUGETLB struct dhugetlb_pool *hpool; +#endif };
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 22f40d5e0e8b..594925ea3076 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -326,7 +326,9 @@ struct mem_cgroup { };
struct mem_cgroup_extension { +#ifdef CONFIG_DYNAMIC_HUGETLB struct dhugetlb_pool *hpool; +#endif #ifdef CONFIG_MEMCG_QOS /* Currently support 0 and -1. * in the future it can expand to other value. diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index fd6cd68e00a2..feca326f0563 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -102,7 +102,9 @@ enum pageflags { PG_idle, #endif PG_percpu_ref, +#ifdef CONFIG_DYNAMIC_HUGETLB PG_pool, +#endif __NR_PAGEFLAGS,
/* Filesystems */ @@ -285,7 +287,13 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ +#ifdef CONFIG_DYNAMIC_HUGETLB PAGEFLAG(Pool, pool, PF_NO_TAIL) +#define __PG_POOL (1UL << PG_pool) +#else +PAGEFLAG_FALSE(Pool) +#define __PG_POOL 0 +#endif
/* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) @@ -772,7 +780,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ - 1UL << PG_pool | \ + __PG_POOL | \ 1UL << PG_unevictable | __PG_MLOCKED)
/* diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 4d06b47129f3..45add2ab8790 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -79,9 +79,14 @@ #define IF_HAVE_PG_IDLE(flag,string) #endif
+#ifdef CONFIG_DYNAMIC_HUGETLB +#define IF_HAVE_PG_POOL(flag,string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_POOL(flag,string) +#endif + #define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ - {1UL << PG_pool, "pool" }, \ {1UL << PG_waiters, "waiters" }, \ {1UL << PG_error, "error" }, \ {1UL << PG_referenced, "referenced" }, \ @@ -102,6 +107,7 @@ {1UL << PG_swapbacked, "swapbacked" }, \ {1UL << PG_unevictable, "unevictable" } \ IF_HAVE_PG_MLOCK(PG_mlocked, "mlocked" ) \ +IF_HAVE_PG_POOL(PG_pool, "pool" ) \ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4c8c91acd6d5..8b88ac4620d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -601,6 +601,18 @@ static long region_del(struct resv_map *resv, long f, long t) return del; }
+#ifdef CONFIG_DYNAMIC_HUGETLB +static struct dhugetlb_pool *get_hpool_from_inode(struct inode *inode) +{ + return HUGETLBFS_I(inode)->hpool; +} +#else +static struct dhugetlb_pool *get_hpool_from_inode(struct inode *inode) +{ + return NULL; +} +#endif + /* * A rare out of memory error was encountered which prevented removal of * the reserve map region for a page. The huge page itself was free'ed @@ -615,7 +627,7 @@ void hugetlb_fix_reserve_counts(struct inode *inode) struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; - struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool; + struct dhugetlb_pool *hpool = get_hpool_from_inode(inode);
rsv_adjust = hugepage_subpool_get_pages(spool, 1, hpool); if (rsv_adjust > 0) { @@ -2380,7 +2392,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, int ret, idx; struct hugetlb_cgroup *h_cg; struct dhugetlb_pool *hpool = - HUGETLBFS_I(file_inode(vma->vm_file))->hpool; + get_hpool_from_inode(file_inode(vma->vm_file));
idx = hstate_index(h); /* @@ -4673,7 +4685,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) unsigned long reserve, start, end; long gbl_reserve; struct dhugetlb_pool *hpool = - HUGETLBFS_I(file_inode(vma->vm_file))->hpool; + get_hpool_from_inode(file_inode(vma->vm_file));
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -6073,7 +6085,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; long gbl_reserve; - struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool; + struct dhugetlb_pool *hpool = get_hpool_from_inode(inode);
/* This should never happen */ if (from > to) { @@ -6192,7 +6204,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; - struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool; + struct dhugetlb_pool *hpool = get_hpool_from_inode(inode);
/* * Since this routine can be called in the evict inode path for all
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME CVE: NA
--------------------------------
For now, only 1G and 2M hugepages are supported on arm64 with 4K base pages, so make the config depend on ARM64_4K_PAGES there and set hpool only for 1G and 2M hugepages.
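The rule amounts to a size check against the two supported hstates; a hypothetical helper expressing it (the patch open-codes the check in hugetlbfs_alloc_inode() below):

/* Hypothetical helper, not in the patch: only PMD (2M) and PUD (1G)
 * hstates may attach a dhugetlb pool; cont-bits hugepage sizes match
 * neither size and therefore never get an hpool. */
static inline bool dhugetlb_hstate_supported(struct hstate *h)
{
	return huge_page_size(h) == PMD_SIZE ||
	       huge_page_size(h) == PUD_SIZE;
}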
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- fs/Kconfig | 1 + fs/hugetlbfs/inode.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/fs/Kconfig b/fs/Kconfig index e8800d8a73b3..2265e95cf17e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -216,6 +216,7 @@ config DYNAMIC_HUGETLB depends on HUGETLB_PAGE depends on MEMCG depends on CGROUP_HUGETLB + depends on X86_64 || (ARM64 && ARM64_4K_PAGES) help Dynamic hugepage are used in memcg and can be splited into small pages automatically. The tasks in the memcg prefer to alloc dynamic hugepage. diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e411103d2cf4..5fcd3586c81f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1166,7 +1166,9 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) mpol_shared_policy_init(&p->policy, NULL); #ifdef CONFIG_DYNAMIC_HUGETLB /* Initialize hpool here in case of a quick call to destroy */ - p->hpool = get_dhugetlb_pool_from_task(current); + if (huge_page_size(sbinfo->hstate) == PMD_SIZE || + huge_page_size(sbinfo->hstate) == PUD_SIZE) + p->hpool = get_dhugetlb_pool_from_task(current); #endif
return &p->vfs_inode;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME CVE: NA
--------------------------------
Memory hotplug and memory failure dissolve freed hugepages into the buddy system, which is not the expected behavior for dynamic hugetlb. Skip the dissolve operation for hugepages belonging to dynamic hugetlb. For memory hotplug, the hotplug operation is not allowed if a dhugetlb pool exists. For memory failure, the hugepage is discarded directly.
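The check is cheap because the pool lookup just indexes a global pagelist by 1G-aligned pfn range. A worked example of the index computed in get_dhugetlb_pool_from_dhugetlb_pagelist() (values assume 4K base pages, where PUD_SHIFT - PAGE_SHIFT == 18):

/* A page at physical address 0x140000000 (5G) has pfn 0x140000, so
 * idx = 0x140000 >> (PUD_SHIFT - PAGE_SHIFT) = 0x140000 >> 18 = 5,
 * i.e. the sixth 1G slot of dhugetlb_pagelist_t->hpool[]. */
unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT);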
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- include/linux/hugetlb.h | 5 +++++ mm/hugetlb.c | 14 +++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 92fd6e2cbaf3..a26cbc4398a4 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -751,6 +751,11 @@ static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task( { return NULL; } +static inline struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( + struct page *page) +{ + return NULL; +} static inline void dhugetlb_pool_put(struct dhugetlb_pool *hpool) { return; } #endif /* CONFIG_DYNAMIC_HUGETLB */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8b88ac4620d5..625882b7cd97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1769,12 +1769,20 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, int dissolve_free_huge_page(struct page *page) { int rc = -EBUSY; + struct dhugetlb_pool *hpool;
retry: /* Not to disrupt normal path by vainly holding hugetlb_lock */ if (!PageHuge(page)) return 0;
+ /* Skip dissolving the hugepage for dynamic hugetlb */ + hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page); + if (hpool) { + dhugetlb_pool_put(hpool); + return -EBUSY; + } + spin_lock(&hugetlb_lock); if (!PageHuge(page)) { rc = 0; @@ -3426,8 +3434,12 @@ struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( struct page *page) { struct dhugetlb_pool *hpool = NULL; - unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + unsigned long idx; + + if (!dhugetlb_enabled) + return NULL;
+ idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); read_lock(&dhugetlb_pagelist_rwlock); if (idx < dhugetlb_pagelist_t->count) hpool = dhugetlb_pagelist_t->hpool[idx];
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME CVE: NA
--------------------------------
If hugepage_init() is called before hugetlb_init(), dhugetlb_enabled has not been initialized yet, so test enable_dhugetlb instead.
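A condensed sketch of the ordering hazard; how enable_dhugetlb gets set is outside these hunks (presumably from an early boot parameter), and the relative initcall order of the two init functions is taken as given:

bool enable_dhugetlb;		/* plain bool, valid whenever it is tested */
DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key);	/* only enabled by hugetlb_init() */

static int __init hugepage_init(void)
{
	/* May run before hugetlb_init(), so test the bool, not the key. */
	if (enable_dhugetlb) {
		transparent_hugepage_flags = 0;
		return -EINVAL;
	}
	/* ... */
	return 0;
}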
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- include/linux/hugetlb.h | 4 +++- mm/huge_memory.c | 4 ++-- mm/hugetlb.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a26cbc4398a4..d44eea25c0d6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -668,6 +668,7 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr /* We don't need to try 5 times, or we can't migrate the pages. */ #define HPOOL_RECLAIM_RETRIES 5
+extern bool enable_dhugetlb; extern struct static_key_false dhugetlb_enabled_key; #define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key))
@@ -744,7 +745,8 @@ void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool, void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool, unsigned long count, bool gigantic); #else -#define dhugetlb_enabled 0 +#define enable_dhugetlb 0 +#define dhugetlb_enabled 0 struct dhugetlb_pool {}; static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task( struct task_struct *tsk) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 484ffdbf5f45..8b7086cfd1ed 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -404,9 +404,9 @@ static int __init hugepage_init(void) * use transparent hugepage in addition. * (May result negative optimization) */ - if (dhugetlb_enabled) { + if (enable_dhugetlb) { transparent_hugepage_flags = 0; - pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + pr_info("transparent hugepage is disabled due to conflict with dynamic hugetlb\n"); return -EINVAL; }
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 625882b7cd97..c61c3d65eafc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3299,7 +3299,7 @@ static void hugetlb_register_all_nodes(void) { } #endif
#ifdef CONFIG_DYNAMIC_HUGETLB -static bool enable_dhugetlb; +bool enable_dhugetlb; DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key); DEFINE_RWLOCK(dhugetlb_pagelist_rwlock); struct dhugetlb_pagelist *dhugetlb_pagelist_t;
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME CVE: NA
--------------------------------
Set DYNAMIC_HUGETLB to y for hulk_defconfig.
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com --- arch/arm64/configs/hulk_defconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index d0c20b1b24a9..c6ea4e1a3946 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -4955,6 +4955,7 @@ CONFIG_TMPFS=y CONFIG_TMPFS_POSIX_ACL=y CONFIG_TMPFS_XATTR=y # CONFIG_TMPFS_INODE64 is not set +CONFIG_DYNAMIC_HUGETLB=y CONFIG_HUGETLBFS=y CONFIG_HUGETLB_PAGE=y CONFIG_MEMFD_CREATE=y