From: Zhenguo Yao yaozhenguo1@gmail.com
mainline inclusion from mainline-v5.16-rc1 commit b5389086ad7be0453c55e0069a89856d1fbdf605 category: feature bugzilla: 186043 CVE: NA
--------------------------------
We can specify the number of hugepages to allocate at boot. But the hugepages is balanced in all nodes at present. In some scenarios, we only need hugepages in one node. For example: DPDK needs hugepages which are in the same node as NIC.
If DPDK needs four hugepages of 1G size in node1 and system has 16 numa nodes we must reserve 64 hugepages on the kernel cmdline. But only four hugepages are used. The others should be free after boot. If the system memory is low(for example: 64G), it will be an impossible task.
So extend the hugepages parameter to support specifying hugepages on a specific node. For example add following parameter:
hugepagesz=1G hugepages=0:1,1:3
It will allocate 1 hugepage in node0 and 3 hugepages in node1.
Link: https://lkml.kernel.org/r/20211005054729.86457-1-yaozhenguo1@gmail.com Signed-off-by: Zhenguo Yao yaozhenguo1@gmail.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: Zhenguo Yao yaozhenguo1@gmail.com Cc: Dan Carpenter dan.carpenter@oracle.com Cc: Nathan Chancellor nathan@kernel.org Cc: Michael Ellerman mpe@ellerman.id.au Cc: Benjamin Herrenschmidt benh@kernel.crashing.org Cc: Paul Mackerras paulus@samba.org Cc: Jonathan Corbet corbet@lwn.net Cc: Mike Rapoport rppt@kernel.org Cc: Matthew Wilcox (Oracle) willy@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Conflicts: Documentation/admin-guide/kernel-parameters.txt Documentation/admin-guide/mm/hugetlbpage.rst arch/powerpc/mm/hugetlbpage.c include/linux/hugetlb.h mm/hugetlb.c Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- .../admin-guide/kernel-parameters.txt | 5 + arch/powerpc/mm/hugetlbpage.c | 9 +- include/linux/hugetlb.h | 6 +- mm/hugetlb.c | 142 +++++++++++++++--- 4 files changed, 137 insertions(+), 25 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 420f7317aa6af..5623a9d1cf813 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1402,6 +1402,11 @@ registers. Default set by CONFIG_HPET_MMAP_DEFAULT.
hugepages= [HW,X86-32,IA-64] HugeTLB pages to allocate at boot. + If using node format, the number of pages to allocate + per-node can be specified. + Format: <integer> or (node format) + <node>:<integer>[,<node>:<integer>] + hugepagesz= [HW,IA-64,PPC,X86-64] The size of the HugeTLB pages. On x86-64 and powerpc, this option can be specified multiple times interleaved with hugepages= to reserve diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index cef0b7ee10246..163b0ef7d156e 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -241,17 +241,22 @@ int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) m->hstate = hstate; return 1; } + +bool __init hugetlb_node_alloc_supported(void) +{ + return false; +} #endif
-int __init alloc_bootmem_huge_page(struct hstate *h) +int __init alloc_bootmem_huge_page(struct hstate *h, int nid) {
#ifdef CONFIG_PPC_BOOK3S_64 if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) return pseries_alloc_bootmem_huge_page(h); #endif - return __alloc_bootmem_huge_page(h); + return __alloc_bootmem_huge_page(h, nid); }
#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 561e7679fadc4..588f9f2a44fce 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -347,6 +347,7 @@ struct hstate { unsigned long nr_overcommit_huge_pages; struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; + unsigned int max_huge_pages_node[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; @@ -409,8 +410,9 @@ int hugetlb_insert_hugepage(struct vm_area_struct *vma, unsigned long addr, struct page *hpage, pgprot_t prot);
/* arch callback */ -int __init __alloc_bootmem_huge_page(struct hstate *h); -int __init alloc_bootmem_huge_page(struct hstate *h); +int __init __alloc_bootmem_huge_page(struct hstate *h, int nid); +int __init alloc_bootmem_huge_page(struct hstate *h, int nid); +bool __init hugetlb_node_alloc_supported(void);
void __init hugetlb_bad_size(void); void __init hugetlb_add_hstate(unsigned order); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b2ed9174bfd73..8286a6f8cb050 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -56,6 +56,7 @@ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; static bool __initdata parsed_valid_hugepagesz = true; +static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata; int enable_charge_mighp __read_mostly;
/* @@ -2241,33 +2242,39 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); }
-int alloc_bootmem_huge_page(struct hstate *h) +int alloc_bootmem_huge_page(struct hstate *h, int nid) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); -int __alloc_bootmem_huge_page(struct hstate *h) +int __alloc_bootmem_huge_page(struct hstate *h, int nid) { - struct huge_bootmem_page *m; + struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node;
+ if (nid >= nr_online_nodes) + return 0; + /* do node specific alloc */ + if (nid != NUMA_NO_NODE) { + m = memblock_virt_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h), + 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid); + if (!m) + return 0; + goto found; + } + /* allocate from next node when distributing huge pages */ for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) { - void *addr; - - addr = memblock_virt_alloc_try_nid_raw( + m = memblock_virt_alloc_try_nid_raw( huge_page_size(h), huge_page_size(h), 0, BOOTMEM_ALLOC_ACCESSIBLE, node); - if (addr) { - /* - * Use the beginning of the huge page to store the - * huge_bootmem_page struct (until gather_bootmem - * puts them into the mem_map). - */ - m = addr; - goto found; - } + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). + */ + if (!m) + return 0; + goto found; } - return 0;
found: - BUG_ON(!IS_ALIGNED(virt_to_phys(m), huge_page_size(h))); /* Put them into a private list first because mem_map is not up yet */ INIT_LIST_HEAD(&m->list); list_add(&m->list, &huge_boot_pages); @@ -2310,12 +2317,55 @@ static void __init gather_bootmem_prealloc(void) cond_resched(); } } +static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) +{ + unsigned long i; + char buf[32]; + + for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { + if (hstate_is_gigantic(h)) { + if (!alloc_bootmem_huge_page(h, nid)) + break; + } else { + struct page *page; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; + + page = alloc_fresh_huge_page(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); + if (!page) + break; + put_page(page); /* free it into the hugepage allocator */ + } + cond_resched(); + } + if (i == h->max_huge_pages_node[nid]) + return; + + string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", + h->max_huge_pages_node[nid], buf, nid, i); + h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); + h->max_huge_pages_node[nid] = i; +}
static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { unsigned long i; nodemask_t *node_alloc_noretry; + bool node_specific_alloc = false; + + /* do node specific alloc */ + for (i = 0; i < nr_online_nodes; i++) { + if (h->max_huge_pages_node[i] > 0) { + hugetlb_hstate_alloc_pages_onenode(h, i); + node_specific_alloc = true; + } + } + + if (node_specific_alloc) + return;
+ /* below will do all node balanced alloc */ if (!hstate_is_gigantic(h)) { /* * Bit mask controlling how hard we retry per-node allocations. @@ -2336,7 +2386,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) { if (hstate_is_gigantic(h)) { - if (!alloc_bootmem_huge_page(h)) + if (!alloc_bootmem_huge_page(h, NUMA_NO_NODE)) break; } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY], @@ -2991,8 +3041,13 @@ static int __init hugetlb_init(void) } default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); if (default_hstate_max_huge_pages) { - if (!default_hstate.max_huge_pages) + if (!default_hstate.max_huge_pages) { default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + for (i = 0; i < nr_online_nodes; i++) + default_hstate.max_huge_pages_node[i] = + default_hugepages_in_node[i]; + } }
hugetlb_init_hstates(); @@ -3052,10 +3107,19 @@ void __init hugetlb_add_hstate(unsigned int order) parsed_hstate = h; }
+bool __init __weak hugetlb_node_alloc_supported(void) +{ + return true; +} + static int __init hugetlb_nrpages_setup(char *s) { unsigned long *mhp; static unsigned long *last_mhp; + int node = NUMA_NO_NODE; + int count; + unsigned long tmp; + char *p = s;
if (!parsed_valid_hugepagesz) { pr_warn("hugepages = %s preceded by " @@ -3077,8 +3141,40 @@ static int __init hugetlb_nrpages_setup(char *s) return 1; }
- if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; + while (*p) { + count = 0; + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + /* Parameter is node format */ + if (p[count] == ':') { + if (!hugetlb_node_alloc_supported()) { + pr_warn("HugeTLB: architecture can't support node specific alloc, ignoring!\n"); + return 0; + } + node = tmp; + p += count + 1; + if (node < 0 || node >= nr_online_nodes) + goto invalid; + /* Parse hugepages */ + if (sscanf(p, "%lu%n", &tmp, &count) != 1) + goto invalid; + if (!hugetlb_max_hstate) + default_hugepages_in_node[node] = tmp; + else + parsed_hstate->max_huge_pages_node[node] = tmp; + *mhp += tmp; + /* Go to parse next node*/ + if (p[count] == ',') + p += count + 1; + else + break; + } else { + if (p != s) + goto invalid; + *mhp = tmp; + break; + } + }
/* * Global state is always initialized later in hugetlb_init. @@ -3091,6 +3187,10 @@ static int __init hugetlb_nrpages_setup(char *s) last_mhp = mhp;
return 1; + +invalid: + pr_warn("HugeTLB: Invalid hugepages parameter %s\n", p); + return 0; } __setup("hugepages=", hugetlb_nrpages_setup);
From: Zhenguo Yao yaozhenguo1@gmail.com
mainline inclusion from mainline-v5.16-rc5 commit 4178158ef8cadeb0ee86639749ce2b33ad75f770 category: bugfix bugzilla: 186043 CVE: NA
--------------------------------
Preallocation of gigantic pages can't work bacause of commit b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation"). When nid is NUMA_NO_NODE(-1), alloc_bootmem_huge_page will always return without doing allocation. Fix this by adding more check.
Link: https://lkml.kernel.org/r/20211129133803.15653-1-yaozhenguo1@gmail.com Fixes: b5389086ad7b ("hugetlbfs: extend the definition of hugepages parameter to support node allocation") Signed-off-by: Zhenguo Yao yaozhenguo1@gmail.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Tested-by: Maxim Levitsky mlevitsk@redhat.com Reviewed-by: Muchun Song songmuchun@bytedance.com Reviewed-by: Baolin Wang baolin.wang@linux.alibaba.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8286a6f8cb050..f7e41390f3d8b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2249,7 +2249,7 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid) struct huge_bootmem_page *m = NULL; /* initialize for clang */ int nr_nodes, node;
- if (nid >= nr_online_nodes) + if (nid != NUMA_NO_NODE && nid >= nr_online_nodes) return 0; /* do node specific alloc */ if (nid != NUMA_NO_NODE) {