From: Liu Shixin <liushixin2@huawei.com>
hulk inclusion
category: feature
bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I6BDME
CVE: NA
--------------------------------
This feature is already supported on x86_64, and this is the original description:
Dynamic hugetlb, which is based on HugeTLB, supports splitting hugepages dynamically within a specified cgroup. We add a hugetlb_pool to mem_cgroup to manage dynamic hugetlb for the corresponding cgroup. After dynamic hugepages are allocated for a cgroup, these hugepages can be used as 1G/2M/4K pages through split/merge operations.
It is now supported on arm64. The feature is limited to depend on ARM64_4K_PAGES and does not support cont-bit hugepages. We merge the previous patches into a single patch, which is patch[1]. While merging the code, we found some code that can be isolated under CONFIG_DYNAMIC_HUGETLB, so patch[2] re-isolates it. In patch[3], we restrict the feature to the limits mentioned above. Patch[4] adds skipping of hugepage dissolution, which may conflict with memory hotplug and memory failure. Patch[5] sets DYNAMIC_HUGETLB to y in hulk_defconfig to enable the feature by default.
This patch includes all previous patches; the patch list is recorded in the bugzilla.
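
For reference, a minimal user-space sketch of how the interface might be driven, assuming the kernel is booted with dynamic_1G_hugepage=on and built with CONFIG_DYNAMIC_HUGETLB=y. The control file name "dhugetlb.nr_pages" and the cgroup mount path are assumptions for illustration only (the cftype definitions are not visible in this excerpt); the value format "<nid> <size>", where size is the number of 1G hugepages moved from hugetlb into the cgroup's pool, follows the comment above memcg_write_dhugetlb().

  /*
   * Illustration only: the control file name and cgroup path below are
   * assumed, not taken from this patch.
   */
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
  	/* nid 0, one 1G hugepage, per the "<nid> <size>" input format */
  	const char *val = "0 1";
  	int fd = open("/sys/fs/cgroup/memory/mygrp/dhugetlb.nr_pages",
  		      O_WRONLY);

  	if (fd < 0) {
  		perror("open");
  		return 1;
  	}
  	if (write(fd, val, strlen(val)) != (ssize_t)strlen(val))
  		perror("write");
  	close(fd);
  	return 0;
  }

Once the pool is populated, per-size reservations could then be adjusted through the 1G/2M reserve interfaces (handled by dhugetlb_1G_reserve_write()/dhugetlb_2M_reserve_write() below), after which tasks in that memcg prefer to allocate from the pool.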
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/Kconfig                      |    9 +
 fs/hugetlbfs/inode.c            |    4 +
 include/linux/gfp.h             |    4 +-
 include/linux/hugetlb.h         |   97 +++
 include/linux/memcontrol.h      |   15 +
 include/linux/page-flags.h      |    3 +
 include/trace/events/dhugetlb.h |  123 ++++
 include/trace/events/mmflags.h  |    1 +
 kernel/cgroup/cgroup.c          |    6 +
 mm/huge_memory.c                |   16 +-
 mm/hugetlb.c                    | 1188 ++++++++++++++++++++++++++++++-
 mm/internal.h                   |    1 +
 mm/memcontrol.c                 |  391 ++++++++++
 mm/page_alloc.c                 |   33 +-
 14 files changed, 1862 insertions(+), 29 deletions(-)
 create mode 100644 include/trace/events/dhugetlb.h
diff --git a/fs/Kconfig b/fs/Kconfig index 5921bfbebee4..e8800d8a73b3 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -211,6 +211,15 @@ config TMPFS_INODE64
If unsure, say N.
+config DYNAMIC_HUGETLB + bool "Dynamic HugeTLB" + depends on HUGETLB_PAGE + depends on MEMCG + depends on CGROUP_HUGETLB + help + Dynamic hugepage are used in memcg and can be splited into small pages + automatically. The tasks in the memcg prefer to alloc dynamic hugepage. + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 005e05c442c5..30a29936372c 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -1164,6 +1164,8 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) * private inode. This simplifies hugetlbfs_destroy_inode. */ mpol_shared_policy_init(&p->policy, NULL); + /* Initialize hpool here in case of a quick call to destroy */ + p->hpool = get_dhugetlb_pool_from_task(current);
return &p->vfs_inode; } @@ -1178,6 +1180,8 @@ static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); + dhugetlb_pool_put(HUGETLBFS_I(inode)->hpool); + HUGETLBFS_I(inode)->hpool = NULL; call_rcu(&inode->i_rcu, hugetlbfs_i_callback); }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 152cb9bdf436..74b0375d7d2b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,7 +501,9 @@ static inline void arch_alloc_page(struct page *page, int order) { } struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); - +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags); +bool free_pages_prepare(struct page *page, unsigned int order, bool check_free); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2d2b06b36bd0..3a82ea9283ec 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -289,6 +289,7 @@ struct hugetlbfs_inode_info { struct shared_policy policy; struct inode vfs_inode; unsigned int seals; + struct dhugetlb_pool *hpool; };
static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) @@ -655,6 +656,102 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
#endif /* CONFIG_HUGETLB_PAGE */
+#ifdef CONFIG_DYNAMIC_HUGETLB +/* The number of small_page_pool for a dhugetlb_pool */ +#define NR_SMPOOL num_possible_cpus() +/* The max page number in a small_page_pool */ +#define MAX_SMPOOL_PAGE 1024 +/* number to move between list */ +#define BATCH_SMPOOL_PAGE (MAX_SMPOOL_PAGE >> 2) +/* We don't need to try 5 times, or we can't migrate the pages. */ +#define HPOOL_RECLAIM_RETRIES 5 + +extern struct static_key_false dhugetlb_enabled_key; +#define dhugetlb_enabled (static_branch_unlikely(&dhugetlb_enabled_key)) + +#define DEFAULT_PAGESIZE 4096 +extern rwlock_t dhugetlb_pagelist_rwlock; +struct dhugetlb_pagelist { + unsigned long count; + struct dhugetlb_pool *hpool[0]; +}; +extern struct dhugetlb_pagelist *dhugetlb_pagelist_t; + +struct split_pages { + struct list_head list; + unsigned long start_pfn; + unsigned long free_pages; +}; + +struct small_page_pool { + spinlock_t lock; + unsigned long free_pages; + long used_pages; + struct list_head head_page; +}; + +struct dhugetlb_pool { + int nid; + spinlock_t lock; + spinlock_t reserved_lock; + atomic_t refcnt; + + struct mem_cgroup *attach_memcg; + + struct list_head dhugetlb_1G_freelists; + struct list_head dhugetlb_2M_freelists; + struct list_head dhugetlb_4K_freelists; + + struct list_head split_1G_freelists; + struct list_head split_2M_freelists; + + unsigned long total_nr_pages; + + unsigned long total_reserved_1G; + unsigned long free_reserved_1G; + unsigned long mmap_reserved_1G; + unsigned long used_1G; + unsigned long free_unreserved_1G; + unsigned long nr_split_1G; + + unsigned long total_reserved_2M; + unsigned long free_reserved_2M; + unsigned long mmap_reserved_2M; + unsigned long used_2M; + unsigned long free_unreserved_2M; + unsigned long nr_split_2M; + + unsigned long free_pages; + struct small_page_pool smpool[0]; +}; + +bool dhugetlb_pool_get(struct dhugetlb_pool *hpool); +void dhugetlb_pool_put(struct dhugetlb_pool *hpool); +struct dhugetlb_pool *hpool_alloc(unsigned long nid); +int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, + unsigned long nid, unsigned long size); +bool free_dhugetlb_pool(struct dhugetlb_pool *hpool); +int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool); +struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( + struct page *page); +struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk); +bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool); +void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool); +void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool, + unsigned long count, bool gigantic); +#else +#define dhugetlb_enabled 0 +struct dhugetlb_pool {}; +static inline struct dhugetlb_pool *get_dhugetlb_pool_from_task( + struct task_struct *tsk) +{ + return NULL; +} +static inline void dhugetlb_pool_put(struct dhugetlb_pool *hpool) { return; } +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static inline spinlock_t *huge_pte_lock(struct hstate *h, struct mm_struct *mm, pte_t *pte) { diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 4517d132d1e2..22f40d5e0e8b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -326,6 +326,7 @@ struct mem_cgroup { };
struct mem_cgroup_extension { + struct dhugetlb_pool *hpool; #ifdef CONFIG_MEMCG_QOS /* Currently support 0 and -1. * in the future it can expand to other value. @@ -1406,4 +1407,18 @@ static inline void memcg_put_cache_ids(void)
#endif /* CONFIG_MEMCG_KMEM */
+#ifdef CONFIG_DYNAMIC_HUGETLB +struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg); +struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask); +void free_page_to_dhugetlb_pool(struct page *page); +int dhugetlb_pool_force_empty(struct mem_cgroup *memcg); +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css); +#else +static inline struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask) +{ + return NULL; +} +static inline void free_page_to_dhugetlb_pool(struct page *page) {} +#endif + #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 0c5d1c4c71e6..fd6cd68e00a2 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -102,6 +102,7 @@ enum pageflags { PG_idle, #endif PG_percpu_ref, + PG_pool, __NR_PAGEFLAGS,
/* Filesystems */ @@ -284,6 +285,7 @@ PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ +PAGEFLAG(Pool, pool, PF_NO_TAIL)
/* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) @@ -770,6 +772,7 @@ static inline void ClearPageSlabPfmemalloc(struct page *page) 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ + 1UL << PG_pool | \ 1UL << PG_unevictable | __PG_MLOCKED)
/* diff --git a/include/trace/events/dhugetlb.h b/include/trace/events/dhugetlb.h new file mode 100644 index 000000000000..20b3a54589d1 --- /dev/null +++ b/include/trace/events/dhugetlb.h @@ -0,0 +1,123 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM dhugetlb + +#if !defined(_TRACE_DHUGETLB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DHUGETLB_H + +#include <linux/tracepoint.h> +#include <trace/events/mmflags.h> + +#define DHUGETLB_SPLIT_1G 0x01u +#define DHUGETLB_SPLIT_2M 0x02u +#define DHUGETLB_MERGE_4K 0x04u +#define DHUGETLB_MIGRATE_4K 0x08u +#define DHUGETLB_RESV_1G 0x10u +#define DHUGETLB_UNRESV_1G 0x20u +#define DHUGETLB_RESV_2M 0x40u +#define DHUGETLB_UNRESV_2M 0x80u +#define DHUGETLB_ALLOC_1G 0x100u +#define DHUGETLB_FREE_1G 0x200u +#define DHUGETLB_ALLOC_2M 0x400u +#define DHUGETLB_FREE_2M 0x800u + +#define __def_action_names \ + {(unsigned long)DHUGETLB_SPLIT_1G, "split_1G_to_2M"}, \ + {(unsigned long)DHUGETLB_SPLIT_2M, "split_2M_to_4K"}, \ + {(unsigned long)DHUGETLB_MERGE_4K, "merge_4K_to_2M"}, \ + {(unsigned long)DHUGETLB_MIGRATE_4K, "migrate_4K_to_2M"}, \ + {(unsigned long)DHUGETLB_RESV_1G, "resv_1G_page"}, \ + {(unsigned long)DHUGETLB_UNRESV_1G, "unresv_1G_page"}, \ + {(unsigned long)DHUGETLB_RESV_2M, "resv_2M_page"}, \ + {(unsigned long)DHUGETLB_UNRESV_2M, "unresv_2M_page"}, \ + {(unsigned long)DHUGETLB_ALLOC_1G, "alloc_1G_page"}, \ + {(unsigned long)DHUGETLB_FREE_1G, "free_1G_page"}, \ + {(unsigned long)DHUGETLB_ALLOC_2M, "alloc_2M_page"}, \ + {(unsigned long)DHUGETLB_FREE_2M, "free_2M_page"} + +#define show_action(action) \ + (action) ? __print_flags(action, "", \ + __def_action_names \ + ) : "none" + +TRACE_EVENT(dhugetlb_split_merge, + + TP_PROTO(const void *hpool, struct page *page, unsigned long action), + + TP_ARGS(hpool, page, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->action = action; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s", + __entry->hpool, + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn != -1UL ? __entry->pfn : 0, + show_action(__entry->action)) +); + +TRACE_EVENT(dhugetlb_acct_memory, + + TP_PROTO(const void *hpool, unsigned long count, unsigned long action), + + TP_ARGS(hpool, count, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, count ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->count = count; + __entry->action = action; + ), + + TP_printk("hpool=%p action=%s, mmap_count=%lu", + __entry->hpool, + show_action(__entry->action), + __entry->count) +); + +TRACE_EVENT(dhugetlb_alloc_free, + + TP_PROTO(const void *hpool, struct page *page, unsigned long count, + unsigned long action), + + TP_ARGS(hpool, page, count, action), + + TP_STRUCT__entry( + __field( const void *, hpool ) + __field( unsigned long, pfn ) + __field( unsigned long, count ) + __field( unsigned long, action ) + ), + + TP_fast_assign( + __entry->hpool = hpool; + __entry->pfn = page ? page_to_pfn(page) : -1UL; + __entry->count = count; + __entry->action = action; + ), + + TP_printk("hpool=%p page=%p pfn=%lu action=%s free_count=%lu", + __entry->hpool, + __entry->pfn != -1UL ? pfn_to_page(__entry->pfn) : NULL, + __entry->pfn != -1UL ? 
__entry->pfn : 0, + show_action(__entry->action), + __entry->count) +); + +#endif /* _TRACE_DHUGETLB_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index b817bf1885a0..4d06b47129f3 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -81,6 +81,7 @@
#define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ + {1UL << PG_pool, "pool" }, \ {1UL << PG_waiters, "waiters" }, \ {1UL << PG_error, "error" }, \ {1UL << PG_referenced, "referenced" }, \ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7456882e1a0f..b01490b71f32 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -65,6 +65,7 @@ /* let's not notify more than 100 times per second */ #define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
+bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css); /* * cgroup_mutex is the master lock. Any modification to cgroup or its * hierarchy must be performed while holding it. @@ -5280,6 +5281,11 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (css_has_online_children(&cgrp->self)) return -EBUSY;
+#ifdef CONFIG_MEMCG + /* If we use dynamic hugetlb, make sure dhugtlb_pool is free */ + if (!dhugetlb_pool_is_free(cgrp->subsys[memory_cgrp_id])) + return -EBUSY; +#endif /* * Mark @cgrp and the associated csets dead. The former prevents * further task migration and child creation by disabling diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8319265c1cf..484ffdbf5f45 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -396,6 +396,20 @@ static int __init hugepage_init(void) return -EINVAL; }
+ /* + * When we alloc some pages(order = 0), system may help us to alloc + * a page(order > 0) due to transparent hugepage. This result + * dynamic hugetlb to be skipped. Actually, using dynamic hugetlb + * means we have already optimized the program, so we should not + * use transparent hugepage in addition. + * (May result negative optimization) + */ + if (dhugetlb_enabled) { + transparent_hugepage_flags = 0; + pr_info("transparent hugepage is disabled due to confilct with dynamic hugetlb\n"); + return -EINVAL; + } + /* * hugepages can't be allocated by the buddy allocator */ @@ -2946,9 +2960,9 @@ static unsigned long deferred_split_count(struct shrinker *shrink, { struct pglist_data *pgdata = NODE_DATA(sc->nid); unsigned long *split_queue_len = &pgdata->split_queue_len; +#ifdef CONFIG_MEMCG struct mem_cgroup_extension *memcg_ext;
-#ifdef CONFIG_MEMCG if (sc->memcg) { memcg_ext = container_of(sc->memcg, struct mem_cgroup_extension, memcg); split_queue_len = &memcg_ext->split_queue_len; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 495d8b5b38fc..4c8c91acd6d5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,12 @@ #include <linux/jhash.h> #include <linux/mman.h> #include <linux/share_pool.h> +#include <linux/kthread.h> +#include <linux/cpuhotplug.h> +#include <linux/freezer.h> +#include <linux/delay.h> +#include <linux/migrate.h> +#include <linux/mm_inline.h>
#include <asm/page.h> #include <asm/pgtable.h> @@ -39,8 +45,14 @@ #include <linux/userfaultfd_k.h> #include <linux/page_owner.h> #include <linux/share_pool.h> +#include <linux/memblock.h> #include "internal.h"
+#if (defined CONFIG_DYNAMIC_HUGETLB) && (!defined __GENKSYMS__) +#define CREATE_TRACE_POINTS +#include <trace/events/dhugetlb.h> +#endif + int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -89,7 +101,8 @@ static inline void ClearPageHugeFreed(struct page *head) }
/* Forward declaration */ -static int hugetlb_acct_memory(struct hstate *h, long delta); +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool);
static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { @@ -103,7 +116,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) if (free) { if (spool->min_hpages != -1) hugetlb_acct_memory(spool->hstate, - -spool->min_hpages); + -spool->min_hpages, NULL); kfree(spool); } } @@ -123,7 +136,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, spool->hstate = h; spool->min_hpages = min_hpages;
- if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) { + if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages, NULL)) { kfree(spool); return NULL; } @@ -149,13 +162,17 @@ void hugepage_put_subpool(struct hugepage_subpool *spool) * a subpool minimum size must be manitained. */ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct dhugetlb_pool *hpool) { long ret = delta;
if (!spool) return ret;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (dhugetlb_enabled && hpool) + return ret; + spin_lock(&spool->lock);
if (spool->max_hpages != -1) { /* maximum size accounting */ @@ -194,13 +211,17 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool, * in the case where a subpool minimum size must be maintained. */ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool, - long delta) + long delta, struct dhugetlb_pool *hpool) { long ret = delta;
if (!spool) return delta;
+ /* Skip subpool when hugetlb file belongs to a hugetlb_pool */ + if (dhugetlb_enabled && hpool) + return ret; + spin_lock(&spool->lock);
if (spool->max_hpages != -1) /* maximum size accounting */ @@ -594,12 +615,13 @@ void hugetlb_fix_reserve_counts(struct inode *inode) struct hugepage_subpool *spool = subpool_inode(inode); long rsv_adjust; bool reserved = false; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
- rsv_adjust = hugepage_subpool_get_pages(spool, 1); + rsv_adjust = hugepage_subpool_get_pages(spool, 1, hpool); if (rsv_adjust > 0) { struct hstate *h = hstate_inode(inode);
- if (!hugetlb_acct_memory(h, 1)) + if (!hugetlb_acct_memory(h, 1, hpool)) reserved = true; } else if (!rsv_adjust) { reserved = true; @@ -1300,6 +1322,56 @@ static inline void ClearPageHugeTemporary(struct page *page) page[2].mapping = NULL; }
+#ifdef CONFIG_DYNAMIC_HUGETLB +static void free_huge_page_to_dhugetlb_pool(struct page *page, + bool restore_reserve) +{ + struct hstate *h = page_hstate(page); + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page); + if (unlikely(!hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + spin_lock(&hpool->lock); + ClearPagePool(page); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + if (!hstate_is_gigantic(h)) { + list_add(&page->lru, &hpool->dhugetlb_2M_freelists); + hpool->free_reserved_2M++; + hpool->used_2M--; + if (restore_reserve) { + hpool->mmap_reserved_2M++; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_RESV_2M); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M, + DHUGETLB_FREE_2M); + } else { + list_add(&page->lru, &hpool->dhugetlb_1G_freelists); + hpool->free_reserved_1G++; + hpool->used_1G--; + if (restore_reserve) { + hpool->mmap_reserved_1G++; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_RESV_1G); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G, + DHUGETLB_FREE_1G); + } + spin_unlock(&hpool->lock); + dhugetlb_pool_put(hpool); +} +#else +void free_huge_page_to_dhugetlb_pool(struct page *page, bool restore_reserve) +{ +} +#endif + void free_huge_page(struct page *page) { /* @@ -1320,6 +1392,17 @@ void free_huge_page(struct page *page) restore_reserve = PagePrivate(page); ClearPagePrivate(page);
+ if (dhugetlb_enabled && PagePool(page)) { + spin_lock(&hugetlb_lock); + clear_page_huge_active(page); + list_del(&page->lru); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); + spin_unlock(&hugetlb_lock); + free_huge_page_to_dhugetlb_pool(page, restore_reserve); + return; + } + /* * If PagePrivate() was set on page, page allocation consumed a * reservation. If the page was associated with a subpool, there @@ -1335,7 +1418,7 @@ void free_huge_page(struct page *page) * after page is free. Therefore, force restore_reserve * operation. */ - if (hugepage_subpool_put_pages(spool, 1) == 0) + if (hugepage_subpool_put_pages(spool, 1, NULL) == 0) restore_reserve = true; }
@@ -2211,6 +2294,81 @@ static void restore_reserve_on_error(struct hstate *h, } }
+#ifdef CONFIG_DYNAMIC_HUGETLB +static struct page *__alloc_huge_page_from_dhugetlb_pool( + struct dhugetlb_pool *hpool, int idx, bool need_unreserved) +{ + unsigned long flags; + struct page *page = NULL; + + spin_lock_irqsave(&hpool->lock, flags); + if (hstate_is_gigantic(&hstates[idx]) && hpool->free_reserved_1G) { + page = list_entry(hpool->dhugetlb_1G_freelists.next, + struct page, lru); + list_del(&page->lru); + hpool->free_reserved_1G--; + hpool->used_1G++; + if (need_unreserved) { + SetPagePrivate(page); + hpool->mmap_reserved_1G--; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_UNRESV_1G); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_1G, + DHUGETLB_ALLOC_1G); + } else if (!hstate_is_gigantic(&hstates[idx]) && + hpool->free_reserved_2M) { + page = list_entry(hpool->dhugetlb_2M_freelists.next, + struct page, lru); + list_del(&page->lru); + hpool->free_reserved_2M--; + hpool->used_2M++; + if (need_unreserved) { + SetPagePrivate(page); + hpool->mmap_reserved_2M--; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_UNRESV_2M); + } + trace_dhugetlb_alloc_free(hpool, page, hpool->free_reserved_2M, + DHUGETLB_ALLOC_2M); + } + if (page) { + INIT_LIST_HEAD(&page->lru); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_page_refcounted(page); + SetPagePool(page); + } + spin_unlock_irqrestore(&hpool->lock, flags); + + return page; +} + +static struct page *alloc_huge_page_from_dhugetlb_pool( + struct vm_area_struct *vma, int idx, int avoid_reserve, + long gbl_chg, struct dhugetlb_pool *hpool) +{ + struct page *page; + bool need_unreserved = false; + + if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) + need_unreserved = true; + + page = __alloc_huge_page_from_dhugetlb_pool(hpool, idx, + need_unreserved); + + return page; +} +#else +static inline struct page *alloc_huge_page_from_dhugetlb_pool( + struct vm_area_struct *vma, int idx, int avoid_reserve, + long gbl_chg, struct dhugetlb_pool *hpool) +{ + return NULL; +} +#endif + struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve) { @@ -2221,6 +2379,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, long gbl_chg; int ret, idx; struct hugetlb_cgroup *h_cg; + struct dhugetlb_pool *hpool = + HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
idx = hstate_index(h); /* @@ -2240,7 +2400,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, * checked against any subpool limit. */ if (map_chg || avoid_reserve) { - gbl_chg = hugepage_subpool_get_pages(spool, 1); + gbl_chg = hugepage_subpool_get_pages(spool, 1, hpool); if (gbl_chg < 0) { vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); @@ -2262,6 +2422,26 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, if (ret) goto out_subpool_put;
+ if (dhugetlb_enabled && hpool) { + page = alloc_huge_page_from_dhugetlb_pool(vma, idx, + avoid_reserve, + gbl_chg, hpool); + if (page) { + /* + * Use hugetlb_lock to manage the account of + * hugetlb cgroup. + */ + spin_lock(&hugetlb_lock); + list_add(&page->lru, &h->hugepage_activelist); + hugetlb_cgroup_commit_charge(idx, + pages_per_huge_page(hstate_vma(vma)), + h_cg, page); + spin_unlock(&hugetlb_lock); + goto out; + } + goto out_uncharge_cgroup; + } + spin_lock(&hugetlb_lock); /* * glb_chg is passed to indicate whether or not a page must be taken @@ -2284,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); spin_unlock(&hugetlb_lock); - +out: set_page_private(page, (unsigned long)spool);
map_commit = vma_commit_reservation(h, vma, addr); @@ -2300,8 +2480,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, */ long rsv_adjust;
- rsv_adjust = hugepage_subpool_put_pages(spool, 1); - hugetlb_acct_memory(h, -rsv_adjust); + rsv_adjust = hugepage_subpool_put_pages(spool, 1, hpool); + hugetlb_acct_memory(h, -rsv_adjust, hpool); } return page;
@@ -2309,7 +2489,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); out_subpool_put: if (map_chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); + hugepage_subpool_put_pages(spool, 1, hpool); vma_end_reservation(h, vma, addr); return ERR_PTR(-ENOSPC); } @@ -3098,6 +3278,932 @@ static void hugetlb_register_all_nodes(void) { }
#endif
+#ifdef CONFIG_DYNAMIC_HUGETLB +static bool enable_dhugetlb; +DEFINE_STATIC_KEY_FALSE(dhugetlb_enabled_key); +DEFINE_RWLOCK(dhugetlb_pagelist_rwlock); +struct dhugetlb_pagelist *dhugetlb_pagelist_t; + +bool dhugetlb_pool_get(struct dhugetlb_pool *hpool) +{ + if (!hpool) + return false; + + return atomic_inc_not_zero(&hpool->refcnt); +} + +void dhugetlb_pool_put(struct dhugetlb_pool *hpool) +{ + if (!dhugetlb_enabled || !hpool) + return; + + if (atomic_dec_and_test(&hpool->refcnt)) { + css_put(&hpool->attach_memcg->css); + kfree(hpool); + } +} + +struct dhugetlb_pool *hpool_alloc(unsigned long nid) +{ + int i; + struct dhugetlb_pool *hpool; + + hpool = kzalloc(sizeof(struct dhugetlb_pool) + + NR_SMPOOL * sizeof(struct small_page_pool), GFP_KERNEL); + if (!hpool) + return NULL; + + spin_lock_init(&hpool->lock); + spin_lock_init(&hpool->reserved_lock); + hpool->nid = nid; + atomic_set(&hpool->refcnt, 1); + INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists); + INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists); + INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists); + INIT_LIST_HEAD(&hpool->split_1G_freelists); + INIT_LIST_HEAD(&hpool->split_2M_freelists); + + for (i = 0; i < NR_SMPOOL; i++) { + spin_lock_init(&hpool->smpool[i].lock); + INIT_LIST_HEAD(&hpool->smpool[i].head_page); + } + + return hpool; +} + +int alloc_hugepage_from_hugetlb(struct dhugetlb_pool *hpool, + unsigned long nid, unsigned long size) +{ + int ret; + struct page *page, *next; + unsigned long idx; + unsigned long i = 0; + struct hstate *h = size_to_hstate(PUD_SIZE); + + if (!h) + return -ENOMEM; + + spin_lock(&hpool->lock); + spin_lock(&hugetlb_lock); + if (h->free_huge_pages_node[nid] < size) { + ret = -ENOMEM; + goto out_unlock; + } + + list_for_each_entry_safe(page, next, &h->hugepage_freelists[nid], lru) { + idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + ret = update_dhugetlb_pagelist(idx, hpool); + if (ret) + continue; + ClearPageHugeFreed(page); + list_move_tail(&page->lru, &hpool->dhugetlb_1G_freelists); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + hpool->total_nr_pages++; + hpool->free_unreserved_1G++; + if (++i == size) + break; + } + ret = 0; +out_unlock: + spin_unlock(&hugetlb_lock); + spin_unlock(&hpool->lock); + return ret; +} + +/* + * When we assign a hugepage to dhugetlb_pool, we need to record it in + * dhugetlb_pagelist_t. In this situation, we just need read_lock because + * there is not conflit when write to dhugetlb_pagelist_t->hpool. + * + * If page's pfn is greater than dhugetlb_pagelist_t->count (which may + * occurs due to memory hotplug), we need to realloc enough memory so that + * pfn = dhugetlb_pagelist_t->count - 1 and then record it. + * In this situation, we need write_lock because while we are reallocating, + * the read request should wait. 
+ */ +int update_dhugetlb_pagelist(unsigned long idx, struct dhugetlb_pool *hpool) +{ + read_lock(&dhugetlb_pagelist_rwlock); + if (idx >= dhugetlb_pagelist_t->count) { + unsigned long size; + struct dhugetlb_pagelist *tmp; + + read_unlock(&dhugetlb_pagelist_rwlock); + write_lock(&dhugetlb_pagelist_rwlock); + + size = sizeof(struct dhugetlb_pagelist) + + (idx + 1) * sizeof(struct dhugetlb_pool *); + tmp = krealloc(dhugetlb_pagelist_t, size, GFP_ATOMIC); + if (!tmp) { + write_unlock(&dhugetlb_pagelist_rwlock); + return -ENOMEM; + } + tmp->count = idx + 1; + dhugetlb_pagelist_t = tmp; + + write_unlock(&dhugetlb_pagelist_rwlock); + read_lock(&dhugetlb_pagelist_rwlock); + } + dhugetlb_pagelist_t->hpool[idx] = hpool; + read_unlock(&dhugetlb_pagelist_rwlock); + return 0; +} + +struct dhugetlb_pool *get_dhugetlb_pool_from_dhugetlb_pagelist( + struct page *page) +{ + struct dhugetlb_pool *hpool = NULL; + unsigned long idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + + read_lock(&dhugetlb_pagelist_rwlock); + if (idx < dhugetlb_pagelist_t->count) + hpool = dhugetlb_pagelist_t->hpool[idx]; + read_unlock(&dhugetlb_pagelist_rwlock); + if (dhugetlb_pool_get(hpool)) + return hpool; + return NULL; +} + +struct dhugetlb_pool *get_dhugetlb_pool_from_task(struct task_struct *tsk) +{ + struct mem_cgroup *memcg; + struct dhugetlb_pool *hpool; + + if (!dhugetlb_enabled) + return NULL; + + rcu_read_lock(); + memcg = mem_cgroup_from_task(tsk); + rcu_read_unlock(); + + hpool = get_dhugetlb_pool_from_memcg(memcg); + + return hpool; +} + +static void add_new_huge_page_to_pool(struct dhugetlb_pool *hpool, + struct page *page, bool gigantic) +{ + lockdep_assert_held(&hpool->lock); + VM_BUG_ON_PAGE(page_mapcount(page), page); + INIT_LIST_HEAD(&page->lru); + + if (gigantic) { + prep_compound_gigantic_page(page, PUD_SHIFT - PAGE_SHIFT); + list_add_tail(&page->lru, &hpool->dhugetlb_1G_freelists); + hpool->free_unreserved_1G++; + } else { + prep_new_page(page, PMD_SHIFT - PAGE_SHIFT, __GFP_COMP, 0); + set_page_count(page, 0); + list_add_tail(&page->lru, &hpool->dhugetlb_2M_freelists); + hpool->free_unreserved_2M++; + } + set_page_private(page, 0); + page->mapping = NULL; + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + set_hugetlb_cgroup(page, NULL); +} + +static void free_dhugetlb_pcpool(struct dhugetlb_pool *hpool) +{ + int i; + struct small_page_pool *smpool; + + for (i = 0; i < NR_SMPOOL; i++) { + smpool = &hpool->smpool[i]; + list_splice(&smpool->head_page, &hpool->dhugetlb_4K_freelists); + smpool->free_pages = 0; + smpool->used_pages = 0; + INIT_LIST_HEAD(&smpool->head_page); + } +} + +static void __free_dhugetlb_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + struct split_pages *split_huge, *split_next; + + if (list_empty(&hpool->dhugetlb_4K_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_4K_freelists, lru) { + list_del(&page->lru); + add_new_huge_page_to_pool(hpool, page, false); + } + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + } + + hpool->free_pages = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_4K_freelists); +} + +static void free_dhugetlb_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + lockdep_assert_held(&hpool->lock); + if (list_empty(&hpool->dhugetlb_4K_freelists)) + return; + + list_for_each_entry_safe(page, next, + 
&hpool->dhugetlb_4K_freelists, lru) { + if (page_to_pfn(page) % nr_pages != 0) + list_del(&page->lru); + } + + __free_dhugetlb_small_page(hpool); +} + +static void __free_dhugetlb_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + struct split_pages *split_giga, *split_next; + + if (list_empty(&hpool->dhugetlb_2M_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_2M_freelists, lru) { + list_del(&page->lru); + add_new_huge_page_to_pool(hpool, page, true); + } + list_for_each_entry_safe(split_giga, split_next, + &hpool->split_1G_freelists, list) { + list_del(&split_giga->list); + kfree(split_giga); + hpool->nr_split_1G--; + } + + hpool->total_reserved_2M = 0; + hpool->free_reserved_2M = 0; + hpool->free_unreserved_2M = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_2M_freelists); +} + +static void free_dhugetlb_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page, *next; + unsigned long nr_pages = 1 << (PUD_SHIFT - PAGE_SHIFT); + unsigned long block_size = 1 << (PMD_SHIFT - PAGE_SHIFT); + int i; + + lockdep_assert_held(&hpool->lock); + if (list_empty(&hpool->dhugetlb_2M_freelists)) + return; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_2M_freelists, lru) { + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + atomic_set(compound_mapcount_ptr(page), 0); + for (i = 1; i < block_size; i++) + clear_compound_head(&page[i]); + set_compound_order(page, 0); + __ClearPageHead(page); + if (page_to_pfn(page) % nr_pages != 0) + list_del(&page->lru); + } + __free_dhugetlb_huge_page(hpool); +} + +static int try_migrate_page(struct page *page, unsigned long nid) +{ + unsigned long pfn = page_to_pfn(page); + int ret = 0; + + LIST_HEAD(source); + + if (!pfn_valid(pfn)) + return 0; + BUG_ON(PageHuge(page) || PageTransHuge(page)); + /* + * HWPoison pages have elevated reference counts so the migration + * would fail on them. It also doesn't make any sense to migrate them + * in the first place. Still try to unmap such a page in case it is + * still mapped(e.g. current hwpoison implementation doesn't unmap + * KSM pages but keep the unmap as the catch all safety net). + */ + if (PageHWPoison(page)) { + if (WARN_ON(PageLRU(page))) + isolate_lru_page(page); + if (page_mapped(page)) + try_to_unmap(page, + TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS); + return 0; + } + + if (!get_page_unless_zero(page)) + return 0; + /* + * We can skip free pages. And we can deal with pages on + * LRU and non-lru movable pages. 
+ */ + if (PageLRU(page)) + ret = isolate_lru_page(page); + else + ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE); + put_page(page); + if (ret) { + if (page_count(page)) + ret = -EBUSY; + return ret; + } + list_add_tail(&page->lru, &source); + if (!__PageMovable(page)) + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); + + ret = migrate_pages(&source, alloc_new_node_page, NULL, nid, + MIGRATE_SYNC_LIGHT, MR_COMPACTION); + if (ret) + putback_movable_pages(&source); + return ret; +} + +static void try_migrate_pages(struct dhugetlb_pool *hpool) +{ + int i, j; + unsigned long nr_free_pages; + struct split_pages *split_giga, *next; + unsigned int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + struct page *page; + int sleep_interval = 100; /* wait for the migration */ + + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + msleep(sleep_interval); + dhugetlb_pool_force_empty(hpool->attach_memcg); + + spin_lock(&hpool->lock); + nr_free_pages = hpool->free_pages; + spin_unlock(&hpool->lock); + for (i = 0; i < NR_SMPOOL; i++) { + spin_lock(&hpool->smpool[i].lock); + nr_free_pages += hpool->smpool[i].free_pages; + spin_unlock(&hpool->smpool[i].lock); + } + + if (nr_free_pages >> HUGETLB_PAGE_ORDER < hpool->nr_split_2M) { + list_for_each_entry_safe(split_giga, next, + &hpool->split_1G_freelists, list) { + for (i = 0; i < nr_pages; i++) { + if (PageCompound(pfn_to_page( + split_giga->start_pfn + i * nr_pages))) + continue; + page = pfn_to_page(split_giga->start_pfn + + i * nr_pages); + for (j = 0; j < nr_pages; j++) { + if (PagePool(page + j)) + try_migrate_page(page + j, + hpool->nid); + } + } + } + } + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); +} + +/* + * If there are some pages are still in use. We will try to reclaim/migrate it. + * After trying at most HPOOL_RECLAIM_RETRIES times, we may success. + * Or we will print the failed information and return false. 
+ */ +static bool free_dhugetlb_pages(struct dhugetlb_pool *hpool) +{ + int i; + long used_pages; + int try_count = 0; + +retry: + used_pages = 0; + for (i = 0; i < NR_SMPOOL; i++) + used_pages += hpool->smpool[i].used_pages; + + if (try_count < HPOOL_RECLAIM_RETRIES && + (used_pages || hpool->used_2M || hpool->used_1G)) { + try_migrate_pages(hpool); + try_count++; + goto retry; + } + + if (used_pages) + pr_err("dhugetlb: some 4K pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else if (hpool->used_2M) + pr_err("dhugetlb: some 2M pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else if (hpool->used_1G) + pr_err("dhugetlb: some 1G pages not free, memcg: %s delete failed!\n", + hpool->attach_memcg->css.cgroup->kn->name); + else { + free_dhugetlb_pcpool(hpool); + free_dhugetlb_small_page(hpool); + free_dhugetlb_huge_page(hpool); + return true; + } + return false; +} + +static void free_back_hugetlb(struct dhugetlb_pool *hpool) +{ + int nid; + unsigned int nr_pages; + unsigned long pfn, idx; + struct page *page, *page_next, *p; + struct hstate *h = size_to_hstate(PUD_SIZE); + + if (!h) + return; + + spin_lock(&hugetlb_lock); + list_for_each_entry_safe(page, page_next, + &hpool->dhugetlb_1G_freelists, lru) { + nr_pages = 1 << huge_page_order(h); + pfn = page_to_pfn(page); + for (; nr_pages--; pfn++) { + p = pfn_to_page(pfn); + p->mapping = NULL; + } + SetPageHugeFreed(page); + set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); + nid = page_to_nid(page); + BUG_ON(nid >= MAX_NUMNODES); + list_move(&page->lru, &h->hugepage_freelists[nid]); + h->free_huge_pages_node[nid]++; + read_lock(&dhugetlb_pagelist_rwlock); + idx = page_to_pfn(page) >> (PUD_SHIFT - PAGE_SHIFT); + if (idx < dhugetlb_pagelist_t->count) + dhugetlb_pagelist_t->hpool[idx] = NULL; + read_unlock(&dhugetlb_pagelist_rwlock); + } + h->free_huge_pages += hpool->total_nr_pages; + hpool->total_nr_pages = 0; + hpool->free_unreserved_1G = 0; + hpool->free_reserved_1G = 0; + hpool->total_reserved_1G = 0; + INIT_LIST_HEAD(&hpool->dhugetlb_1G_freelists); + spin_unlock(&hugetlb_lock); +} + +bool free_dhugetlb_pool(struct dhugetlb_pool *hpool) +{ + int i; + bool ret = false; + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + ret = free_dhugetlb_pages(hpool); + if (!ret) + goto out_unlock; + + free_back_hugetlb(hpool); + +out_unlock: + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + if (ret) + dhugetlb_pool_put(hpool); + return ret; +} + +static void __split_free_huge_page(struct dhugetlb_pool *hpool, + struct page *page) +{ + int i; + int order_h = PUD_SHIFT - PAGE_SHIFT; + int order_m = PMD_SHIFT - PAGE_SHIFT; + int blocks = 1 << (order_h - order_m); + struct page *p = page + 1; + + lockdep_assert_held(&hpool->lock); + set_compound_page_dtor(page, NULL_COMPOUND_DTOR); + atomic_set(compound_mapcount_ptr(page), 0); + for (i = 1; i < (1 << order_h); i++, p = mem_map_next(p, page, i)) + clear_compound_head(p); + + set_compound_order(page, 0); + __ClearPageHead(page); + + /* make it be 2M huge pages and put it to huge pool */ + for (i = 0; i < blocks; i++, page += (1 << order_m)) + add_new_huge_page_to_pool(hpool, page, false); +} + +static void __split_free_small_page(struct dhugetlb_pool *hpool, + struct page *page) +{ + int i; + int nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + lockdep_assert_held(&hpool->lock); + set_compound_page_dtor(page, 
NULL_COMPOUND_DTOR); + set_compound_order(page, 0); + for (i = 0; i < nr_pages; i++) { + if (i != 0) { + page[i].mapping = NULL; + clear_compound_head(&page[i]); + } else + __ClearPageHead(page); + + /* + * If a hugepage is mapped in private mode, the PG_uptodate bit + * will not be cleared when the hugepage freed. Clear the + * hugepage using free_pages_prepare() here. + */ + free_pages_prepare(&page[i], 0, false); + hpool->free_pages++; + list_add_tail(&page[i].lru, &hpool->dhugetlb_4K_freelists); + } +} + +static bool split_free_huge_page(struct dhugetlb_pool *hpool) +{ + struct page *page; + struct split_pages *split_page; + + lockdep_assert_held(&hpool->lock); + + if (!hpool->free_unreserved_1G) + return false; + + split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC); + if (!split_page) + return false; + + page = list_entry(hpool->dhugetlb_1G_freelists.next, struct page, lru); + list_del(&page->lru); + hpool->free_unreserved_1G--; + + split_page->start_pfn = page_to_pfn(page); + list_add(&split_page->list, &hpool->split_1G_freelists); + hpool->nr_split_1G++; + + trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_1G); + + __split_free_huge_page(hpool, page); + return true; +} + +static bool split_free_small_page(struct dhugetlb_pool *hpool) +{ + struct page *page; + struct split_pages *split_page; + + lockdep_assert_held(&hpool->lock); + + if (!hpool->free_unreserved_2M && !split_free_huge_page(hpool)) + return false; + + split_page = kzalloc(sizeof(struct split_pages), GFP_ATOMIC); + if (!split_page) + return false; + + page = list_entry(hpool->dhugetlb_2M_freelists.next, struct page, lru); + list_del(&page->lru); + hpool->free_unreserved_2M--; + + split_page->start_pfn = page_to_pfn(page); + list_add(&split_page->list, &hpool->split_2M_freelists); + hpool->nr_split_2M++; + + trace_dhugetlb_split_merge(hpool, page, DHUGETLB_SPLIT_2M); + + __split_free_small_page(hpool, page); + return true; +} + +bool move_pages_from_hpool_to_smpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool) +{ + int i = 0; + struct page *page, *next; + + if (!hpool->free_pages && !split_free_small_page(hpool)) + return false; + + list_for_each_entry_safe(page, next, + &hpool->dhugetlb_4K_freelists, lru) { + list_del(&page->lru); + hpool->free_pages--; + list_add_tail(&page->lru, &smpool->head_page); + smpool->free_pages++; + if (++i == BATCH_SMPOOL_PAGE) + break; + } + return true; +} + +void move_pages_from_smpool_to_hpool(struct dhugetlb_pool *hpool, + struct small_page_pool *smpool) +{ + int i = 0; + struct page *page, *next; + + list_for_each_entry_safe(page, next, &smpool->head_page, lru) { + list_del(&page->lru); + smpool->free_pages--; + list_add(&page->lru, &hpool->dhugetlb_4K_freelists); + hpool->free_pages++; + if (++i == BATCH_SMPOOL_PAGE) + break; + } +} + +static unsigned long list_len(struct list_head *head) +{ + unsigned long len = 0; + struct page *page; + + list_for_each_entry(page, head, lru) + len++; + + return len; +} + +static void hugetlb_migrate_pages(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i, try; + struct page *page; + struct split_pages *split_huge, *split_next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + LIST_HEAD(wait_page_list); + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + /* + * Isolate free page first because we dont want them to be + * allocated. 
+ */ + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) + list_move(&page->lru, &wait_page_list); + } + + for (try = 0; try < HPOOL_RECLAIM_RETRIES; try++) { + /* + * Unlock and try migration, after migration we need + * to lock back. + */ + for (i = 0; i < NR_SMPOOL; i++) + hpool->smpool[i].free_pages = + list_len(&hpool->smpool[i].head_page); + hpool->free_pages = + list_len(&hpool->dhugetlb_4K_freelists); + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (PagePool(page)) + try_migrate_page(page, hpool->nid); + } + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + /* + * Isolate free page. If all page in the split_huge + * is free, return it. + */ + split_huge->free_pages = 0; + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) { + list_move(&page->lru, &wait_page_list); + split_huge->free_pages++; + } + } + if (split_huge->free_pages == nr_pages) + break; + } + if (split_huge->free_pages == nr_pages) { + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + list_del(&page->lru); + } + INIT_LIST_HEAD(&wait_page_list); + page = pfn_to_page(split_huge->start_pfn); + add_new_huge_page_to_pool(hpool, page, false); + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + + trace_dhugetlb_split_merge(hpool, page, + DHUGETLB_MIGRATE_4K); + + if (--count == 0) + return; + } else { + /* Failed, put back the isolate pages */ + list_splice(&wait_page_list, + &hpool->dhugetlb_4K_freelists); + INIT_LIST_HEAD(&wait_page_list); + } + } +} + +static unsigned long merge_free_split_huge(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + struct page *page; + struct split_pages *split_huge, *split_next; + unsigned long nr_pages = 1 << (PMD_SHIFT - PAGE_SHIFT); + + list_for_each_entry_safe(split_huge, split_next, + &hpool->split_2M_freelists, list) { + split_huge->free_pages = 0; + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + if (!PagePool(page)) + split_huge->free_pages++; + } + if (split_huge->free_pages == nr_pages) { + for (i = 0; i < nr_pages; i++) { + page = pfn_to_page(split_huge->start_pfn + i); + list_del(&page->lru); + } + page = pfn_to_page(split_huge->start_pfn); + add_new_huge_page_to_pool(hpool, page, false); + list_del(&split_huge->list); + kfree(split_huge); + hpool->nr_split_2M--; + + trace_dhugetlb_split_merge(hpool, page, + DHUGETLB_MERGE_4K); + + if (--count == 0) + return 0; + } + } + return count; +} + +static void merge_free_small_page(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + unsigned long need_migrate; + + if (!hpool->nr_split_2M) + return; + + need_migrate = merge_free_split_huge(hpool, count); + if (need_migrate) + hugetlb_migrate_pages(hpool, need_migrate); + + for (i = 0; i < NR_SMPOOL; i++) + hpool->smpool[i].free_pages = + list_len(&hpool->smpool[i].head_page); + hpool->free_pages = list_len(&hpool->dhugetlb_4K_freelists); +} + +static void dhugetlb_collect_2M_pages(struct dhugetlb_pool *hpool, + unsigned long count) +{ + int i; + + while (hpool->free_unreserved_1G && + count > hpool->free_unreserved_2M) + split_free_huge_page(hpool); + + /* + * If we try to merge 4K pages to 2M, we need to unlock hpool->lock + * first, and then try to lock every 
lock in order to avoid deadlock. + */ + if (count > hpool->free_unreserved_2M) { + spin_unlock(&hpool->lock); + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + merge_free_small_page(hpool, count - hpool->free_unreserved_2M); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + } +} + +/* + * Parameter gigantic: true means reserve 1G pages and false means reserve + * 2M pages. When we want to reserve 2M pages more than + * hpool->free_unreserved_2M, we have to try split/merge. Still, we can't + * guarantee success. + */ +void dhugetlb_reserve_hugepages(struct dhugetlb_pool *hpool, + unsigned long count, bool gigantic) +{ + unsigned long delta; + + spin_lock(&hpool->lock); + if (gigantic) { + if (count > hpool->total_reserved_1G) { + delta = min(count - hpool->total_reserved_1G, + hpool->free_unreserved_1G); + hpool->total_reserved_1G += delta; + hpool->free_reserved_1G += delta; + hpool->free_unreserved_1G -= delta; + } else { + delta = min(hpool->total_reserved_1G - count, + hpool->free_reserved_1G - + hpool->mmap_reserved_1G); + hpool->total_reserved_1G -= delta; + hpool->free_reserved_1G -= delta; + hpool->free_unreserved_1G += delta; + } + } else { + if (count > hpool->total_reserved_2M) { + delta = count - hpool->total_reserved_2M; + if (delta > hpool->free_unreserved_2M) + dhugetlb_collect_2M_pages(hpool, delta); + delta = min(count - hpool->total_reserved_2M, + hpool->free_unreserved_2M); + hpool->total_reserved_2M += delta; + hpool->free_reserved_2M += delta; + hpool->free_unreserved_2M -= delta; + } else { + delta = min(hpool->total_reserved_2M - count, + hpool->free_reserved_2M - + hpool->mmap_reserved_2M); + hpool->total_reserved_2M -= delta; + hpool->free_reserved_2M -= delta; + hpool->free_unreserved_2M += delta; + } + } + spin_unlock(&hpool->lock); +} + +static int dhugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) +{ + int ret = -ENOMEM; + + if (delta == 0) + return 0; + + spin_lock(&hpool->lock); + if (hstate_is_gigantic(h)) { + if (delta > 0 && delta <= hpool->free_reserved_1G - + hpool->mmap_reserved_1G) { + hpool->mmap_reserved_1G += delta; + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_RESV_1G); + } else if (delta < 0) { + hpool->mmap_reserved_1G -= (unsigned long)(-delta); + WARN_ON(hpool->mmap_reserved_1G < 0); + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_1G, + DHUGETLB_UNRESV_1G); + } + } else { + if (delta > 0 && delta <= hpool->free_reserved_2M - + hpool->mmap_reserved_2M) { + hpool->mmap_reserved_2M += delta; + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_RESV_2M); + } else if (delta < 0) { + hpool->mmap_reserved_2M -= (unsigned long)(-delta); + WARN_ON(hpool->mmap_reserved_2M < 0); + ret = 0; + trace_dhugetlb_acct_memory(hpool, + hpool->mmap_reserved_2M, + DHUGETLB_UNRESV_2M); + } + } + spin_unlock(&hpool->lock); + + return ret; +} +#else +static int dhugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static int __init hugetlb_init(void) { int i; @@ -3134,6 +4240,23 @@ static int __init hugetlb_init(void) hugetlb_register_all_nodes(); hugetlb_cgroup_file_init();
+#ifdef CONFIG_DYNAMIC_HUGETLB + if (enable_dhugetlb) { + unsigned long count = max(max_pfn >> (PUD_SHIFT - PAGE_SHIFT), + (unsigned long)DEFAULT_PAGESIZE); + unsigned long size = sizeof(struct dhugetlb_pagelist) + + count * sizeof(struct dhugetlb_pool *); + dhugetlb_pagelist_t = kzalloc(size, GFP_KERNEL); + if (dhugetlb_pagelist_t) { + dhugetlb_pagelist_t->count = count; + static_branch_enable(&dhugetlb_enabled_key); + pr_info("Dynamic 1G hugepage enabled\n"); + } else + pr_info("Dynamic 1G hugepage disabled due to out of memory, need %lu\n", + size); + } +#endif + #ifdef CONFIG_SMP num_fault_mutexes = roundup_pow_of_two(8 * num_possible_cpus()); #else @@ -3270,6 +4393,16 @@ static int __init hugetlb_nrpages_setup(char *s) } __setup("hugepages=", hugetlb_nrpages_setup);
+#ifdef CONFIG_DYNAMIC_HUGETLB +static int __init dhugetlb_setup(char *s) +{ + if (!strcmp(s, "on")) + enable_dhugetlb = true; + return 1; +} +__setup("dynamic_1G_hugepage=", dhugetlb_setup); +#endif + static int __init hugetlb_default_setup(char *s) { default_hstate_size = memparse(s, &s); @@ -3471,10 +4604,14 @@ unsigned long hugetlb_total_pages(void) return nr_total_pages; }
-static int hugetlb_acct_memory(struct hstate *h, long delta) +static int hugetlb_acct_memory(struct hstate *h, long delta, + struct dhugetlb_pool *hpool) { int ret = -ENOMEM;
+ if (dhugetlb_enabled && hpool) + return dhugetlb_acct_memory(h, delta, hpool); + spin_lock(&hugetlb_lock); /* * When cpuset is configured, it breaks the strict hugetlb page @@ -3535,6 +4672,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + struct dhugetlb_pool *hpool = + HUGETLBFS_I(file_inode(vma->vm_file))->hpool;
if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -3551,8 +4690,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) * Decrement reserve counts. The global reserve count may be * adjusted if the subpool has a minimum size. */ - gbl_reserve = hugepage_subpool_put_pages(spool, reserve); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, reserve, hpool); + hugetlb_acct_memory(h, -gbl_reserve, hpool); } }
@@ -4934,6 +6073,7 @@ int hugetlb_reserve_pages(struct inode *inode, struct hugepage_subpool *spool = subpool_inode(inode); struct resv_map *resv_map; long gbl_reserve; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* This should never happen */ if (from > to) { @@ -4986,7 +6126,7 @@ int hugetlb_reserve_pages(struct inode *inode, * the subpool has a minimum size, there may be some global * reservations already in place (gbl_reserve). */ - gbl_reserve = hugepage_subpool_get_pages(spool, chg); + gbl_reserve = hugepage_subpool_get_pages(spool, chg, hpool); if (gbl_reserve < 0) { ret = -ENOSPC; goto out_err; @@ -4996,10 +6136,10 @@ int hugetlb_reserve_pages(struct inode *inode, * Check enough hugepages are available for the reservation. * Hand the pages back to the subpool if there are not */ - ret = hugetlb_acct_memory(h, gbl_reserve); + ret = hugetlb_acct_memory(h, gbl_reserve, hpool); if (ret < 0) { /* put back original number of pages, chg */ - (void)hugepage_subpool_put_pages(spool, chg); + (void)hugepage_subpool_put_pages(spool, chg, hpool); goto out_err; }
@@ -5028,8 +6168,9 @@ int hugetlb_reserve_pages(struct inode *inode, long rsv_adjust;
rsv_adjust = hugepage_subpool_put_pages(spool, - chg - add); - hugetlb_acct_memory(h, -rsv_adjust); + chg - add, + hpool); + hugetlb_acct_memory(h, -rsv_adjust, hpool); } } return 0; @@ -5051,6 +6192,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long chg = 0; struct hugepage_subpool *spool = subpool_inode(inode); long gbl_reserve; + struct dhugetlb_pool *hpool = HUGETLBFS_I(inode)->hpool;
/* * Since this routine can be called in the evict inode path for all @@ -5075,8 +6217,8 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, * If the subpool has a minimum size, the number of global * reservations to be released may be adjusted. */ - gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed)); - hugetlb_acct_memory(h, -gbl_reserve); + gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed), hpool); + hugetlb_acct_memory(h, -gbl_reserve, hpool);
return 0; } diff --git a/mm/internal.h b/mm/internal.h index 1b861446c751..deffd247b010 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -182,6 +182,7 @@ extern void __putback_isolated_page(struct page *page, unsigned int order, int mt); extern void __free_pages_core(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned int order); +extern int check_new_page(struct page *page); extern void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags); extern int user_min_free_kbytes; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 63b91a030b02..bdc90e6fc082 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -997,6 +997,41 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void) return get_mem_cgroup_from_mm(current->mm); }
+#ifdef CONFIG_DYNAMIC_HUGETLB +void free_page_to_dhugetlb_pool(struct page *page) +{ + struct dhugetlb_pool *hpool; + struct small_page_pool *smpool; + unsigned long flags; + + hpool = get_dhugetlb_pool_from_dhugetlb_pagelist(page); + if (unlikely(!hpool)) { + pr_err("dhugetlb: free error: get hpool failed\n"); + return; + } + + smpool = &hpool->smpool[smp_processor_id()]; + spin_lock_irqsave(&smpool->lock, flags); + + ClearPagePool(page); + if (!free_pages_prepare(page, 0, false)) { + SetPagePool(page); + goto out; + } + list_add(&page->lru, &smpool->head_page); + smpool->free_pages++; + smpool->used_pages--; + if (smpool->free_pages > MAX_SMPOOL_PAGE) { + spin_lock(&hpool->lock); + move_pages_from_smpool_to_hpool(hpool, smpool); + spin_unlock(&hpool->lock); + } +out: + spin_unlock_irqrestore(&smpool->lock, flags); + dhugetlb_pool_put(hpool); +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -3118,6 +3153,31 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return 0; } +#ifdef CONFIG_DYNAMIC_HUGETLB +int dhugetlb_pool_force_empty(struct mem_cgroup *memcg) +{ + lru_add_drain_all(); + + drain_all_stock(memcg); + + while (page_counter_read(&memcg->memory)) { + int progress; + + if (signal_pending(current)) + return -EINTR; + + progress = try_to_free_mem_cgroup_pages(memcg, 1, + GFP_HIGHUSER_MOVABLE, + false); + + if (!progress) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + break; + } + } + return 0; +} +#endif
static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, char *buf, size_t nbytes, @@ -4652,6 +4712,305 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, return ret; }
+#ifdef CONFIG_DYNAMIC_HUGETLB +struct dhugetlb_pool *get_dhugetlb_pool_from_memcg(struct mem_cgroup *memcg) +{ + struct mem_cgroup_extension *memcg_ext; + + if (!memcg) + return NULL; + + memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg); + if (dhugetlb_pool_get(memcg_ext->hpool)) + return memcg_ext->hpool; + return NULL; +} + +static void set_dhugetlb_pool_to_memcg(struct mem_cgroup *memcg, + struct dhugetlb_pool *hpool) +{ + struct mem_cgroup_extension *memcg_ext; + + memcg_ext = container_of(memcg, struct mem_cgroup_extension, memcg); + + memcg_ext->hpool = hpool; +} + +static bool should_allocate_from_dhugetlb_pool(gfp_t gfp_mask) +{ + gfp_t gfp = gfp_mask & GFP_HIGHUSER_MOVABLE; + + if (current->flags & PF_KTHREAD) + return false; + + /* + * The cgroup only charges anonymous and file pages from userspace. + * Some filesystems may have masked out __GFP_IO | __GFP_FS + * to avoid recursive memory requests, e.g. loop device and xfs. + */ + if ((gfp | __GFP_IO | __GFP_FS) != GFP_HIGHUSER_MOVABLE) + return false; + + return true; +} + +static struct page *__alloc_page_from_dhugetlb_pool(void) +{ + bool ret; + struct dhugetlb_pool *hpool; + struct small_page_pool *smpool; + struct page *page = NULL; + unsigned long flags; + + hpool = get_dhugetlb_pool_from_task(current); + if (unlikely(!hpool)) + goto out; + + smpool = &hpool->smpool[smp_processor_id()]; + spin_lock_irqsave(&smpool->lock, flags); + + if (smpool->free_pages == 0) { + spin_lock(&hpool->lock); + ret = move_pages_from_hpool_to_smpool(hpool, smpool); + spin_unlock(&hpool->lock); + if (!ret) + goto unlock; + } + + page = list_entry(smpool->head_page.next, struct page, lru); + list_del(&page->lru); + smpool->free_pages--; + smpool->used_pages++; + check_new_page(page); + SetPagePool(page); +unlock: + spin_unlock_irqrestore(&smpool->lock, flags); +out: + dhugetlb_pool_put(hpool); + return page; +} + +struct page *alloc_page_from_dhugetlb_pool(gfp_t gfp_mask) +{ + struct page *page = NULL; + + if (should_allocate_from_dhugetlb_pool(gfp_mask)) + page = __alloc_page_from_dhugetlb_pool(); + + return page; +} + +static void assign_new_dhugetlb_pool(struct mem_cgroup *memcg, + unsigned long nid) +{ + struct dhugetlb_pool *hpool; + + hpool = hpool_alloc(nid); + if (!hpool) + return; + + hpool->attach_memcg = memcg; + css_get(&memcg->css); + set_dhugetlb_pool_to_memcg(memcg, hpool); +} + +static int update_dhugetlb_pool(struct mem_cgroup *memcg, + unsigned long nid, unsigned long size) +{ + int ret; + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg); + + if (!hpool) { + if (memcg_has_children(memcg)) + return -EINVAL; + assign_new_dhugetlb_pool(memcg, nid); + hpool = get_dhugetlb_pool_from_memcg(memcg); + } + if (!hpool) + return -ENOMEM; + if (hpool->attach_memcg != memcg || hpool->nid != nid) { + dhugetlb_pool_put(hpool); + return -EINVAL; + } + + ret = alloc_hugepage_from_hugetlb(hpool, nid, size); + + dhugetlb_pool_put(hpool); + return ret; +} + +/* + * Test whether a process can allocate the specified memory size. + * + * Input must be in the format '<nid> <size>'. + * size is regarded as the number of 1G huge pages.
+ */ +static ssize_t memcg_write_dhugetlb(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + int ret; + unsigned long nid, size; + char *endp; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + if (!dhugetlb_enabled) + return -EINVAL; + + buf = strstrip(buf); + nid = memparse(buf, &endp); + if (*endp != ' ' || nid >= MAX_NUMNODES) + return -EINVAL; + + buf = endp + 1; + size = memparse(buf, &endp); + if (*endp != '\0' || size == 0) + return -EINVAL; + + ret = update_dhugetlb_pool(memcg, nid, size); + + return ret ?: nbytes; +} + +static int memcg_read_dhugetlb(struct seq_file *m, void *v) +{ + int i; + unsigned long free_pages; + long used_pages = 0; + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_memcg(memcg); + + if (!dhugetlb_enabled) + return 0; + if (!hpool) { + seq_printf(m, "Current hierarchy has no memory pool.\n"); + return 0; + } + + for (i = 0; i < NR_SMPOOL; i++) + spin_lock(&hpool->smpool[i].lock); + spin_lock(&hpool->lock); + + free_pages = hpool->free_pages; + for (i = 0; i < NR_SMPOOL; i++) { + free_pages += hpool->smpool[i].free_pages; + used_pages += hpool->smpool[i].used_pages; + } + + seq_printf(m, "dhugetlb_total_pages %ld\n" + "1G_total_reserved_pages %ld\n" + "1G_free_reserved_pages %ld\n" + "1G_mmap_reserved_pages %ld\n" + "1G_used_pages %ld\n" + "1G_free_unreserved_pages %ld\n" + "2M_total_reserved_pages %ld\n" + "2M_free_reserved_pages %ld\n" + "2M_mmap_reserved_pages %ld\n" + "2M_used_pages %ld\n" + "2M_free_unreserved_pages %ld\n" + "4K_free_pages %ld\n" + "4K_used_pages %ld\n", + hpool->total_nr_pages, + hpool->total_reserved_1G, + hpool->free_reserved_1G, + hpool->mmap_reserved_1G, + hpool->used_1G, + hpool->free_unreserved_1G, + hpool->total_reserved_2M, + hpool->free_reserved_2M, + hpool->mmap_reserved_2M, + hpool->used_2M, + hpool->free_unreserved_2M, + free_pages, + used_pages); + + spin_unlock(&hpool->lock); + for (i = NR_SMPOOL - 1; i >= 0; i--) + spin_unlock(&hpool->smpool[i].lock); + dhugetlb_pool_put(hpool); + return 0; +} + +static int update_reserve_pages(struct kernfs_open_file *of, + char *buf, bool gigantic) +{ + unsigned long size; + char *endp; + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct dhugetlb_pool *hpool; + + if (!dhugetlb_enabled) + return -EINVAL; + + buf = strstrip(buf); + size = memparse(buf, &endp); + if (*endp != '\0') + return -EINVAL; + + hpool = get_dhugetlb_pool_from_memcg(memcg); + if (!hpool) + return -EINVAL; + spin_lock(&hpool->reserved_lock); + dhugetlb_reserve_hugepages(hpool, size, gigantic); + spin_unlock(&hpool->reserved_lock); + dhugetlb_pool_put(hpool); + return 0; +} + +static ssize_t dhugetlb_1G_reserve_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return update_reserve_pages(of, buf, true) ?: nbytes; +} + +static ssize_t dhugetlb_2M_reserve_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return update_reserve_pages(of, buf, false) ?: nbytes; +} + +static void dhugetlb_pool_inherits(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_memcg(parent); + if (!hpool) + return; + + set_dhugetlb_pool_to_memcg(memcg, hpool); + dhugetlb_pool_put(hpool); +} + +static bool dhugetlb_pool_free(struct mem_cgroup *memcg) +{ + bool ret = true; + struct dhugetlb_pool *hpool; + + hpool = get_dhugetlb_pool_from_memcg(memcg); + if (hpool && hpool->attach_memcg == 
memcg) + ret = free_dhugetlb_pool(hpool); + dhugetlb_pool_put(hpool); + return ret; +} + +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css) +{ + if (dhugetlb_enabled) + return dhugetlb_pool_free(mem_cgroup_from_css(css)); + return true; +} +#else +static void dhugetlb_pool_inherits(struct mem_cgroup *memcg, + struct mem_cgroup *parent) +{ +} + +bool dhugetlb_pool_is_free(struct cgroup_subsys_state *css) +{ + return true; +} +#endif /* CONFIG_DYNAMIC_HUGETLB */ + static struct cftype mem_cgroup_legacy_files[] = { { .name = "usage_in_bytes", @@ -4700,6 +5059,27 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = memcg_write_event_control, .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE, }, +#ifdef CONFIG_DYNAMIC_HUGETLB + { + .name = "dhugetlb.nr_pages", + .write = memcg_write_dhugetlb, + .seq_show = memcg_read_dhugetlb, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.1G.reserved_pages", + .write = dhugetlb_1G_reserve_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, + { + .name = "dhugetlb.2M.reserved_pages", + .write = dhugetlb_2M_reserve_write, + .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE | + CFTYPE_NOT_ON_ROOT, + }, +#endif { .name = "swappiness", .read_u64 = mem_cgroup_swappiness_read, @@ -5063,6 +5443,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; }
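The cftype entries above expose the pool through three memcg (cgroup v1) control files: dhugetlb.nr_pages accepts '<nid> <size>' and pulls 1G hugepages from node <nid> into the cgroup's pool, while dhugetlb.1G.reserved_pages and dhugetlb.2M.reserved_pages set how many pages are kept reserved at each size. A minimal userspace sketch of driving these files follows; the mount point /sys/fs/cgroup/memory, the cgroup name "demo" and the written values are assumptions for illustration only, not part of this patch.

```c
/*
 * Illustrative only: configure a dynamic hugetlb pool for an existing
 * memcg called "demo". The v1 memory controller mount point, the cgroup
 * name and the written values are assumptions, not part of this patch.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_cg_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0 || write(fd, val, strlen(val)) < 0) {
		perror(path);
		if (fd >= 0)
			close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	const char *cg = "/sys/fs/cgroup/memory/demo";
	char path[256];

	/* '<nid> <size>': take four 1G hugepages from NUMA node 0. */
	snprintf(path, sizeof(path), "%s/dhugetlb.nr_pages", cg);
	write_cg_file(path, "0 4");

	/* Reserve pages at each size; the counts here are only examples. */
	snprintf(path, sizeof(path), "%s/dhugetlb.1G.reserved_pages", cg);
	write_cg_file(path, "1");
	snprintf(path, sizeof(path), "%s/dhugetlb.2M.reserved_pages", cg);
	write_cg_file(path, "512");

	return 0;
}
```

Reading dhugetlb.nr_pages afterwards returns the per-size counters produced by memcg_read_dhugetlb().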
+ if (dhugetlb_enabled) + dhugetlb_pool_inherits(memcg, parent); + error = memcg_online_kmem(memcg); if (error) goto fail; @@ -5681,6 +6064,14 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset) if (!p) return 0;
+ if (dhugetlb_enabled) { + struct dhugetlb_pool *hpool = get_dhugetlb_pool_from_task(p); + + if (hpool) { + dhugetlb_pool_put(hpool); + return -EPERM; + } + } /* * We are now commited to this value whatever it is. Changes in this * tunable will only affect upcoming migrations, not the current one. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6a2f254f61f..e722d73a3724 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1052,7 +1052,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) return ret; }
-static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free) { int bad = 0; @@ -2012,7 +2012,7 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static inline int check_new_page(struct page *page) +inline int check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) @@ -2075,8 +2075,8 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); }
-static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, - unsigned int alloc_flags) +void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, + unsigned int alloc_flags) { int i;
@@ -2955,6 +2955,12 @@ void free_unref_page(struct page *page) unsigned long flags; unsigned long pfn = page_to_pfn(page);
+ /* Free dynamic hugetlb page */ + if (dhugetlb_enabled && PagePool(page)) { + free_page_to_dhugetlb_pool(page); + return; + } + if (!free_unref_page_prepare(page, pfn)) return;
@@ -2972,6 +2978,16 @@ void free_unref_page_list(struct list_head *list) unsigned long flags, pfn; int batch_count = 0;
+ /* Free dynamic hugetlb pages */ + if (dhugetlb_enabled) { + list_for_each_entry_safe(page, next, list, lru) { + if (PagePool(page)) { + list_del(&page->lru); + free_page_to_dhugetlb_pool(page); + } + } + } + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); @@ -4785,6 +4801,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /* Dynamic hugetlb allocation attempt */ + if (dhugetlb_enabled && likely(order == 0)) { + page = alloc_page_from_dhugetlb_pool(gfp_mask); + if (page) { + prep_new_page(page, order, gfp_mask, alloc_flags); + goto out; + } + } + /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page))
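These page_alloc.c hooks give order-0 allocations and frees a fast path in front of the buddy allocator: alloc_page_from_dhugetlb_pool() takes a page from the current CPU's small_page_pool and refills it from the shared hpool when it runs dry, and free_page_to_dhugetlb_pool() (added in memcontrol.c above) returns PagePool pages to the same per-CPU pool, spilling back to the hpool once free_pages exceeds MAX_SMPOOL_PAGE. The standalone sketch below models only that per-CPU cache / shared pool pattern; all names, sizes and thresholds in it are invented for illustration and it is not the kernel implementation.

```c
/*
 * Toy model of the smpool/hpool split used by dynamic hugetlb: each
 * "CPU" keeps a small local count of free pages and only takes the
 * shared pool lock to refill when empty or to spill past a limit.
 * NR_CPUS, SPILL_LIMIT and BATCH are illustrative values.
 */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS     4
#define SPILL_LIMIT 64	/* stand-in for MAX_SMPOOL_PAGE */
#define BATCH       32	/* pages moved per refill/spill */

struct pool {
	pthread_mutex_t lock;
	long free_pages;
};

static struct pool hpool = { .lock = PTHREAD_MUTEX_INITIALIZER, .free_pages = 1024 };
static struct pool smpool[NR_CPUS];

/* Take one page for this CPU, refilling from the shared pool if empty. */
static int alloc_page_fast(int cpu)
{
	struct pool *sm = &smpool[cpu];
	long move;
	int got = 0;

	pthread_mutex_lock(&sm->lock);
	if (sm->free_pages == 0) {
		pthread_mutex_lock(&hpool.lock);
		move = hpool.free_pages < BATCH ? hpool.free_pages : BATCH;
		hpool.free_pages -= move;
		sm->free_pages += move;
		pthread_mutex_unlock(&hpool.lock);
	}
	if (sm->free_pages > 0) {
		sm->free_pages--;
		got = 1;
	}
	pthread_mutex_unlock(&sm->lock);
	return got;
}

/* Give one page back, spilling a batch to the shared pool past the limit. */
static void free_page_fast(int cpu)
{
	struct pool *sm = &smpool[cpu];

	pthread_mutex_lock(&sm->lock);
	sm->free_pages++;
	if (sm->free_pages > SPILL_LIMIT) {
		pthread_mutex_lock(&hpool.lock);
		hpool.free_pages += BATCH;
		sm->free_pages -= BATCH;
		pthread_mutex_unlock(&hpool.lock);
	}
	pthread_mutex_unlock(&sm->lock);
}

int main(void)
{
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		pthread_mutex_init(&smpool[cpu].lock, NULL);

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < 100; i++)
			if (alloc_page_fast(cpu))
				free_page_fast(cpu);

	printf("pages left in shared pool: %ld\n", hpool.free_pages);
	return 0;
}
```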