From: Michal Hocko mhocko@suse.com
mainline inclusion from mainline-v5.18-rc1 commit 09f49dca570a917a8c6bccd7e8c61f5141534e3a category: bugfix bugzilla: 189331 CVE: NA
--------------------------------
We have had several reports [1][2][3] that the page allocator blows up when an allocation from a possible node is requested. The underlying reason is that NODE_DATA for the specific node is not allocated.
NUMA specific initialization is arch specific and it can vary a lot. E.g. x86 tries to initialize all nodes that have some cpu affinity (see init_cpu_to_node) but this can be insufficient because the node might be cpuless for example.
One way to address this problem would be to check for !node_online nodes when trying to get a zonelist and silently fall back to another node. Unfortunately, that adds a branch to the allocator hot path and doesn't handle any other potential NODE_DATA users.
This patch takes a different approach (following the lead of [3]) and preallocates pgdat for all possible nodes in arch-independent code - free_area_init. All uninitialized nodes are treated as memoryless nodes. The node_state of the node is not changed because that would lead to other side effects - e.g. a sysfs representation of such a node - and from past discussions [4] it is known that some tools might have problems digesting that.
Newly allocated pgdat only gets a minimal initialization and the rest of the work is expected to be done by the memory hotplug - hotadd_new_pgdat (renamed to hotadd_init_pgdat).
generic_alloc_nodedata is changed to use the memblock allocator because neither page nor slab allocators are available at the stage when all pgdats are allocated. Hotplug doesn't allocate pgdat anymore so we can use the early boot allocator. The only arch specific implementation is ia64 and that is changed to use the early allocator as well.
[1] http://lkml.kernel.org/r/20211101201312.11589-1-amakhalov@vmware.com [2] http://lkml.kernel.org/r/20211207224013.880775-1-npache@redhat.com [3] http://lkml.kernel.org/r/20190114082416.30939-1-mhocko@kernel.org [4] http://lkml.kernel.org/r/20200428093836.27190-1-srikar@linux.vnet.ibm.com
[akpm@linux-foundation.org: replace comment, per Mike]
Link: https://lkml.kernel.org/r/Yfe7RBeLCijnWBON@dhcp22.suse.cz Reported-by: Alexey Makhalov amakhalov@vmware.com Tested-by: Alexey Makhalov amakhalov@vmware.com Reported-by: Nico Pache npache@redhat.com Acked-by: Rafael Aquini raquini@redhat.com Tested-by: Rafael Aquini raquini@redhat.com Acked-by: David Hildenbrand david@redhat.com Reviewed-by: Oscar Salvador osalvador@suse.de Acked-by: Mike Rapoport rppt@linux.ibm.com Signed-off-by: Michal Hocko mhocko@suse.com Cc: Christoph Lameter cl@linux.com Cc: Dennis Zhou dennis@kernel.org Cc: Eric Dumazet eric.dumazet@gmail.com Cc: Tejun Heo tj@kernel.org Cc: Wei Yang richard.weiyang@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflicts: mm/internal.h mm/memory_hotplug.c mm/page_alloc.c Signed-off-by: Ma Wupeng mawupeng1@huawei.com --- arch/ia64/mm/discontig.c | 4 +-- include/linux/memory_hotplug.h | 2 +- mm/internal.h | 2 ++ mm/memory_hotplug.c | 24 ++++++++---------- mm/page_alloc.c | 45 +++++++++++++++++++++++++++++----- 5 files changed, 54 insertions(+), 23 deletions(-)
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 2711e0861d8f..32c16a68815d 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -631,11 +631,11 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); }
-pg_data_t *arch_alloc_nodedata(int nid) +pg_data_t * __init arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid);
- return kzalloc(size, GFP_KERNEL); + return _va(memblock_alloc(size, SMP_CACHE_BYTES)); }
void arch_free_nodedata(pg_data_t *pgdat) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 403ba6dc3c2e..a0e4ad0952f8 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,7 +43,7 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); */ #define generic_alloc_nodedata(nid) \ ({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ + __va(memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES)); \ }) /* * This definition is just for error path in node hotadd. diff --git a/mm/internal.h b/mm/internal.h index deffd247b010..72f77379b58f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -614,4 +614,6 @@ extern int is_pagecache_reading_kernel_recovery_enable(void); extern int is_get_user_kernel_recovery_enable(void); #endif
+DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3df9285b73b1..3b961b0fffcc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -920,18 +920,20 @@ static void reset_node_present_pages(pg_data_t *pgdat) }
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_init_pgdat(int nid, u64 start) { struct pglist_data *pgdat; unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid); - if (!pgdat) { - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL;
- arch_refresh_nodedata(nid, pgdat); + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reinitialization. + */ + if (pgdat->per_cpu_nodestats == &boot_nodestats) { } else { /* * Reset the nr_zones, order and classzone_idx before reuse. @@ -943,10 +945,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) pgdat->kswapd_classzone_idx = 0; }
- /* we can use NODE_DATA(nid) from here */ - - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -1000,7 +999,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0;
- pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_init_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1123,9 +1122,6 @@ int __ref add_memory_resource(int nid, struct resource *res)
return ret; error: - /* rollback pgdat allocation and others */ - if (new_node) - rollback_node_hotadd(nid); memblock_remove(start, size); mem_hotplug_done(); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc8be4b00125..1d946043ee63 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5819,7 +5819,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
static void __build_all_zonelists(void *data) { @@ -5854,7 +5854,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid);
build_zonelists(pgdat); @@ -7441,10 +7445,39 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) mminit_verify_pageflags_layout(); setup_nr_node_ids(); zero_resv_unavail(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + memset(pgdat, 0, sizeof(*pgdat)); + arch_refresh_nodedata(nid, pgdat); + free_area_init_node(nid, NULL, 0, NULL); + + /* + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. + */ + continue; + } + + pgdat = NODE_DATA(nid); + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), + NULL);
/* Any memory on that node */ if (pgdat->node_present_pages) {