From: Shakeel Butt <shakeelb@google.com>

mainline inclusion
from mainline-v5.14-rc4
commit f227f0faf63b46a113c4d1aca633c80195622dd2
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4C12I
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
SLUB uses the page allocator for higher-order allocations and updates the unreclaimable slab stat for such allocations. At the moment, the bulk free for SLUB does not share code with the normal free code path for this type of allocation and has missed the stat update. So, fix the stat update by making both paths use common code. The user-visible impact of the bug is a potentially inconsistent unreclaimable slab stat as seen through meminfo and vmstat.
Link: https://lkml.kernel.org/r/20210728155354.3440560-1-shakeelb@google.com
Fixes: 6a486c0ad4dc ("mm, sl[ou]b: improve memory accounting")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Chen Huang <chenhuang5@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/slub.c | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index f06f002bb098..4a83fa347672 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3164,6 +3164,16 @@ struct detached_freelist {
 	struct kmem_cache *s;
 };

+static inline void free_nonslab_page(struct page *page)
+{
+	unsigned int order = compound_order(page);
+
+	VM_BUG_ON_PAGE(!PageCompound(page), page);
+	kfree_hook(page_address(page));
+	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
+	__free_pages(page, order);
+}
+
 /*
  * This function progressively scans the array with free objects (with
  * a limited look ahead) and extract objects belonging to the same
@@ -3200,9 +3210,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 	if (!s) {
 		/* Handle kalloc'ed objects */
 		if (unlikely(!PageSlab(page))) {
-			BUG_ON(!PageCompound(page));
-			kfree_hook(object);
-			__free_pages(page, compound_order(page));
+			free_nonslab_page(page);
 			p[size] = NULL; /* mark object processed */
 			return size;
 		}
@@ -4102,13 +4110,7 @@ void kfree(const void *x)

 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
-		unsigned int order = compound_order(page);
-
-		BUG_ON(!PageCompound(page));
-		kfree_hook(object);
-		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B,
-				      -(PAGE_SIZE << order));
-		__free_pages(page, order);
+		free_nonslab_page(page);
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
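[Editor's aside] A minimal in-kernel sketch of the scenario this patch fixes; the size is illustrative, chosen only so the allocation exceeds KMALLOC_MAX_CACHE_SIZE and falls through to the page allocator:

#include <linux/slab.h>

/* A kmalloc() larger than KMALLOC_MAX_CACHE_SIZE bypasses the slab
 * caches and is served directly by the page allocator; kmalloc()
 * then accounts it under NR_SLAB_UNRECLAIMABLE_B. */
void stat_mismatch_demo(void)
{
	void *p = kmalloc(KMALLOC_MAX_CACHE_SIZE + 16, GFP_KERNEL);

	if (!p)
		return;
	/* Before this fix, the bulk path freed the backing pages without
	 * decrementing NR_SLAB_UNRECLAIMABLE_B, so SUnreclaim reported in
	 * /proc/meminfo slowly drifted away from reality. */
	kfree_bulk(1, &p);
}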
From: Shakeel Butt <shakeelb@google.com>

mainline inclusion
from mainline-v5.14-rc6
commit 1ed7ce574c136569f55fb5c32e69e382c77ba500
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4C12I
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------------------------------------
The unit test kmalloc_pagealloc_invalid_free makes sure that, for a higher-order SLUB allocation which goes to the page allocator, the free is called with the correct address, i.e. the virtual address of the head page.
Commit f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free") unified the free code paths for page-allocator-based SLUB allocations, but instead of using the address passed by the caller, it extracted the address from the page, making the unit test kmalloc_pagealloc_invalid_free moot. So, fix this by using the address passed by the caller.

Should we fix this? I think yes, because developers expect KASAN to catch these types of programming bugs.
Link: https://lkml.kernel.org/r/20210802180819.1110165-1-shakeelb@google.com
Fixes: f227f0faf63b ("slub: fix unreclaimable slab stat for bulk free")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Reported-by: Nathan Chancellor <nathan@kernel.org>
Tested-by: Nathan Chancellor <nathan@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Chen Huang <chenhuang5@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/slub.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 4a83fa347672..cf9e82282832 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3164,12 +3164,12 @@ struct detached_freelist {
 	struct kmem_cache *s;
 };

-static inline void free_nonslab_page(struct page *page)
+static inline void free_nonslab_page(struct page *page, void *object)
 {
 	unsigned int order = compound_order(page);

 	VM_BUG_ON_PAGE(!PageCompound(page), page);
-	kfree_hook(page_address(page));
+	kfree_hook(object);
 	mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, -(PAGE_SIZE << order));
 	__free_pages(page, order);
 }
@@ -3210,7 +3210,7 @@ int build_detached_freelist(struct kmem_cache *s, size_t size,
 	if (!s) {
 		/* Handle kalloc'ed objects */
 		if (unlikely(!PageSlab(page))) {
-			free_nonslab_page(page);
+			free_nonslab_page(page, object);
 			p[size] = NULL; /* mark object processed */
 			return size;
 		}
@@ -4110,7 +4110,7 @@ void kfree(const void *x)

 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
-		free_nonslab_page(page);
+		free_nonslab_page(page, object);
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
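[Editor's aside] The essence of the kmalloc_pagealloc_invalid_free test, paraphrased from lib/test_kasan.c (the exact size and offset there may differ):

#include <linux/slab.h>

/* Allocate an object big enough to be page-allocator backed, then free
 * it at a bogus offset. KASAN should flag this as an invalid-free; with
 * the regression fixed here, kfree_hook() again sees the caller's
 * pointer rather than page_address(page), so the check can fire. */
void invalid_free_demo(void)
{
	char *ptr = kmalloc(KMALLOC_MAX_CACHE_SIZE + 10, GFP_KERNEL);

	if (!ptr)
		return;
	kfree(ptr + 1);		/* not the address kmalloc() returned */
}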
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v5.14-rc1
commit ac9380f6b8a6a908a9df023bf8a2bcdaf9d2d6cb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

Since commit 3e54097beb22 ("percpu: manage chunks based on contig_bits instead of free_bytes") chunks are sorted based on the size of the biggest continuous free area instead of the total number of free bytes. Update the corresponding comment to reflect this.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
(cherry picked from commit ac9380f6b8a6a908a9df023bf8a2bcdaf9d2d6cb)
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index e12ab708fe15..928c8889eb1f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -98,7 +98,10 @@

 #include "percpu-internal.h"

-/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
+/*
+ * The slots are sorted by the size of the biggest continuous free area.
+ * 1-31 bytes share the same slot.
+ */
 #define PCPU_SLOT_BASE_SHIFT		5
 /* chunks in slots below this are subject to being sidelined on failed alloc */
 #define PCPU_SLOT_FAIL_THRESHOLD	3
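[Editor's aside] For context, slot selection is logarithmic in the allocation size; the sizing helper in mm/percpu.c looks roughly like this (quoted from memory, so treat it as a sketch):

/* Sizes below 1 << PCPU_SLOT_BASE_SHIFT (32 bytes) all map to slot 1;
 * each further power of two gets its own slot. */
static int __pcpu_size_to_slot(int size)
{
	int highbit = fls(size);	/* index of the highest set bit */

	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}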
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v5.14-rc1
commit 67c2669d69fb5ada0f3b5123fb6ebf6fef9faee5
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

__pcpu_balance_workfn() became fairly big and hard to follow, but in fact it consists of two fully independent parts, responsible for the destruction of excessive free chunks and the population of the necessary amount of free pages.

In order to simplify the code and prepare for adding new functionality, split it into two functions:

1) pcpu_balance_free,
2) pcpu_balance_populated.
Move the taking/releasing of the pcpu_alloc_mutex to an upper level to keep the current synchronization in place.
Signed-off-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Dennis Zhou <dennis@kernel.org>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
(cherry picked from commit 67c2669d69fb5ada0f3b5123fb6ebf6fef9faee5)
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 928c8889eb1f..da0567ab3459 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1932,31 +1932,22 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 }

 /**
- * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * pcpu_balance_free - manage the amount of free chunks
  * @type: chunk type
  *
- * Reclaim all fully free chunks except for the first one.  This is also
- * responsible for maintaining the pool of empty populated pages.  However,
- * it is possible that this is called when physical memory is scarce causing
- * OOM killer to be triggered.  We should avoid doing so until an actual
- * allocation causes the failure as it is possible that requests can be
- * serviced from already backed regions.
+ * Reclaim all fully free chunks except for the first one.
  */
-static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
+static void pcpu_balance_free(enum pcpu_chunk_type type)
 {
-	/* gfp flags passed to underlying allocators */
-	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
 	LIST_HEAD(to_free);
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
 	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
-	int slot, nr_to_pop, ret;

 	/*
 	 * There's no reason to keep around multiple unused chunks and VM
 	 * areas can be scarce.  Destroy all free chunks except for one.
 	 */
-	mutex_lock(&pcpu_alloc_mutex);
 	spin_lock_irq(&pcpu_lock);

 	list_for_each_entry_safe(chunk, next, free_head, list) {
@@ -1984,6 +1975,25 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
 		pcpu_destroy_chunk(chunk);
 		cond_resched();
 	}
+}
+
+/**
+ * pcpu_balance_populated - manage the amount of populated pages
+ * @type: chunk type
+ *
+ * Maintain a certain amount of populated pages to satisfy atomic allocations.
+ * It is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered.  We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
+ */
+static void pcpu_balance_populated(enum pcpu_chunk_type type)
+{
+	/* gfp flags passed to underlying allocators */
+	const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+	struct list_head *pcpu_slot = pcpu_chunk_list(type);
+	struct pcpu_chunk *chunk;
+	int slot, nr_to_pop, ret;

 	/*
 	 * Ensure there are certain number of free populated pages for
@@ -2053,22 +2063,24 @@ static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
 			goto retry_pop;
 		}
 	}
-
-	mutex_unlock(&pcpu_alloc_mutex);
 }

 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Call __pcpu_balance_workfn() for each chunk type.
+ * Call pcpu_balance_free() and pcpu_balance_populated() for each chunk type.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
 	enum pcpu_chunk_type type;

-	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
-		__pcpu_balance_workfn(type);
+	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
+		mutex_lock(&pcpu_alloc_mutex);
+		pcpu_balance_free(type);
+		pcpu_balance_populated(type);
+		mutex_unlock(&pcpu_alloc_mutex);
+	}
 }

 /**
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v5.14-rc1
commit 8ea2e1e35d1eb4c76290ff5d565a1bfd6c24f117
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

Factor out the pcpu_check_block_hint() helper, which will be useful in the future. The new function checks if the allocation can likely fit within the contig hint.
Signed-off-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
(cherry picked from commit 8ea2e1e35d1eb4c76290ff5d565a1bfd6c24f117)
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu.c | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index da0567ab3459..dd4e31934f3e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -305,6 +305,25 @@ static unsigned long pcpu_block_off_to_off(int index, int off)
 	return index * PCPU_BITMAP_BLOCK_BITS + off;
 }

+/**
+ * pcpu_check_block_hint - check against the contig hint
+ * @block: block of interest
+ * @bits: size of allocation
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Check to see if the allocation can fit in the block's contig hint.
+ * Note, a chunk uses the same hints as a block so this can also check against
+ * the chunk's contig hint.
+ */
+static bool pcpu_check_block_hint(struct pcpu_block_md *block, int bits,
+				  size_t align)
+{
+	int bit_off = ALIGN(block->contig_hint_start, align) -
+		block->contig_hint_start;
+
+	return bit_off + bits <= block->contig_hint;
+}
+
 /*
  * pcpu_next_hint - determine which hint to use
  * @block: block of interest
@@ -1065,14 +1084,11 @@ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
 	int bit_off, bits, next_off;

 	/*
-	 * Check to see if the allocation can fit in the chunk's contig hint.
-	 * This is an optimization to prevent scanning by assuming if it
-	 * cannot fit in the global hint, there is memory pressure and creating
-	 * a new chunk would happen soon.
+	 * This is an optimization to prevent scanning by assuming if the
+	 * allocation cannot fit in the global hint, there is memory pressure
+	 * and creating a new chunk would happen soon.
 	 */
-	bit_off = ALIGN(chunk_md->contig_hint_start, align) -
-		  chunk_md->contig_hint_start;
-	if (bit_off + alloc_bits > chunk_md->contig_hint)
+	if (!pcpu_check_block_hint(chunk_md, alloc_bits, align))
 		return -1;

 	bit_off = pcpu_next_hint(chunk_md, alloc_bits);
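[Editor's aside] A small userspace illustration of the arithmetic pcpu_check_block_hint() performs; the hint values and sizes are made up purely to show the calculation:

#include <stdio.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* hypothetical hint: largest free run is 10 bits starting at bit 3 */
	int contig_hint_start = 3, contig_hint = 10;
	int bits = 8;			/* allocation size in bits */
	int align = 4;

	/* aligning the start of the run wastes ALIGN(3, 4) - 3 = 1 bit */
	int bit_off = ALIGN(contig_hint_start, align) - contig_hint_start;

	/* 1 + 8 <= 10, so the allocation can likely fit */
	printf("fits: %s\n", bit_off + bits <= contig_hint ? "yes" : "no");
	return 0;
}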
From: Dennis Zhou <dennis@kernel.org>

mainline inclusion
from mainline-v5.14-rc1
commit 1c29a3ceaf5f02919e0a89119a70382581453dbb
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

This prepares for adding a to_depopulate list and a sidelined list after the free slot in the set of lists in pcpu_slot.
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Acked-by: Roman Gushchin <guro@fb.com>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
(cherry picked from commit 1c29a3ceaf5f02919e0a89119a70382581453dbb)
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index dd4e31934f3e..e43e1e418603 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -134,6 +134,7 @@ static int pcpu_unit_size __ro_after_init;
 static int pcpu_nr_units __ro_after_init;
 static int pcpu_atom_size __ro_after_init;
 int pcpu_nr_slots __ro_after_init;
+int pcpu_free_slot __ro_after_init;
 static size_t pcpu_chunk_struct_size __ro_after_init;

 /* cpus with the lowest and highest unit addresses */
@@ -236,7 +237,7 @@ static int __pcpu_size_to_slot(int size)
 static int pcpu_size_to_slot(int size)
 {
 	if (size == pcpu_unit_size)
-		return pcpu_nr_slots - 1;
+		return pcpu_free_slot;
 	return __pcpu_size_to_slot(size);
 }

@@ -1805,7 +1806,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 		goto fail;
 	}

-	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+	if (list_empty(&pcpu_slot[pcpu_free_slot])) {
 		chunk = pcpu_create_chunk(type, pcpu_gfp);
 		if (!chunk) {
 			err = "failed to allocate new chunk";
@@ -1957,7 +1958,7 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 {
 	LIST_HEAD(to_free);
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
-	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
+	struct list_head *free_head = &pcpu_slot[pcpu_free_slot];
 	struct pcpu_chunk *chunk, *next;

 	/*
@@ -2032,7 +2033,7 @@ static void pcpu_balance_populated(enum pcpu_chunk_type type)
 					    0, PCPU_EMPTY_POP_PAGES_HIGH);
 	}

-	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+	for (slot = pcpu_size_to_slot(PAGE_SIZE); slot <= pcpu_free_slot; slot++) {
 		unsigned int nr_unpop = 0, rs, re;

 		if (!nr_to_pop)
@@ -2139,7 +2140,7 @@ void free_percpu(void __percpu *ptr)
 	if (chunk->free_bytes == pcpu_unit_size) {
 		struct pcpu_chunk *pos;

-		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
+		list_for_each_entry(pos, &pcpu_slot[pcpu_free_slot], list)
 			if (pos != chunk) {
 				need_balance = true;
 				break;
@@ -2561,7 +2562,8 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
 	 */
-	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
+	pcpu_free_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+	pcpu_nr_slots = pcpu_free_slot + 1;
 	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
 					  sizeof(pcpu_chunk_lists[0]) *
 					  PCPU_NR_CHUNK_TYPES,
From: Roman Gushchin <guro@fb.com>

mainline inclusion
from mainline-v5.14-rc1
commit f183324133ea535db4127f9fad3e19725ca88bf3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

From Roman ("percpu: partial chunk depopulation"):

In our [Facebook] production experience the percpu memory allocator sometimes struggles with returning memory to the system. A typical example is the creation of several thousand memory cgroups (each has several chunks of percpu data used for vmstats, vmevents, ref counters etc.). Deletion and complete release of these cgroups doesn't always lead to a shrinkage of the percpu memory, so sometimes several GBs of memory are wasted.
The underlying problem is fragmentation: to release an underlying chunk, all percpu allocations in it must be released first. The percpu allocator tends to top up chunks to improve utilization. This means new small-ish allocations (e.g. percpu ref counters) are placed onto almost-filled old-ish chunks, effectively pinning them in memory.

This patchset solves this problem by implementing a partial depopulation of percpu chunks: chunks with many empty pages are asynchronously depopulated and the pages are returned to the system.
To illustrate the problem, the following script can be used:
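[Editor's aside] The original shell script did not survive in this backported message; as a stand-in, a rough C equivalent of the experiment follows (the cgroup v1 mount path and the cgroup count are illustrative assumptions, not taken from the original):

#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

/* Print the Percpu line(s) from /proc/meminfo. */
static void print_percpu(void)
{
	char line[256];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "Percpu", 6))
			fputs(line, stdout);
	fclose(f);
}

int main(void)
{
	char path[128];
	int i;

	print_percpu();
	for (i = 0; i < 1000; i++) {
		snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/cg_%d", i);
		mkdir(path, 0755);	/* each cgroup allocates percpu counters */
	}
	print_percpu();
	for (i = 0; i < 1000; i++) {
		snprintf(path, sizeof(path), "/sys/fs/cgroup/memory/cg_%d", i);
		rmdir(path);		/* removing cgroups may not shrink Percpu */
	}
	sleep(5);			/* give async reclaim a moment */
	print_percpu();
	return 0;
}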
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu-internal.h |   4 +
 mm/percpu-km.c       |   5 ++
 mm/percpu-stats.c    |  12 +--
 mm/percpu-vm.c       |  30 ++++++++
 mm/percpu.c          | 180 +++++++++++++++++++++++++++++++++++++++----
 5 files changed, 211 insertions(+), 20 deletions(-)
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 095d7eaa0db4..10604dce806f 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -67,6 +67,8 @@ struct pcpu_chunk {

 	void			*data;		/* chunk data */
 	bool			immutable;	/* no [de]population allowed */
+	bool			isolated;	/* isolated from active chunk
+						   slots */
 	int			start_offset;	/* the overlap with the previous
 						   region to have a page aligned
 						   base_addr */
@@ -87,6 +89,8 @@ extern spinlock_t pcpu_lock;

 extern struct list_head *pcpu_chunk_lists;
 extern int pcpu_nr_slots;
+extern int pcpu_sidelined_slot;
+extern int pcpu_to_depopulate_slot;
 extern int pcpu_nr_empty_pop_pages[];

 extern struct pcpu_chunk *pcpu_first_chunk;
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 35c9941077ee..c84a9f781a6c 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -118,3 +118,8 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)

 	return 0;
 }
+
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+	return false;
+}
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index f6026dbcdf6b..2125981acfb9 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -219,13 +219,15 @@ static int percpu_stats_show(struct seq_file *m, void *v)
 	for (slot = 0; slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot], list) {
-			if (chunk == pcpu_first_chunk) {
+			if (chunk == pcpu_first_chunk)
 				seq_puts(m, "Chunk: <- First Chunk\n");
-				chunk_map_stats(m, chunk, buffer);
-			} else {
+			else if (slot == pcpu_to_depopulate_slot)
+				seq_puts(m, "Chunk (to_depopulate)\n");
+			else if (slot == pcpu_sidelined_slot)
+				seq_puts(m, "Chunk (sidelined):\n");
+			else
 				seq_puts(m, "Chunk:\n");
-				chunk_map_stats(m, chunk, buffer);
-			}
+			chunk_map_stats(m, chunk, buffer);
 		}
 	}
 }
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index e46f7a6917f9..c75f6f24f2d5 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -377,3 +377,33 @@ static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai)
 	/* no extra restriction */
 	return 0;
 }
+
+/**
+ * pcpu_should_reclaim_chunk - determine if a chunk should go into reclaim
+ * @chunk: chunk of interest
+ *
+ * This is the entry point for percpu reclaim.  If a chunk qualifies, it is then
+ * isolated and managed in separate lists at the back of pcpu_slot: sidelined
+ * and to_depopulate respectively.  The to_depopulate list holds chunks slated
+ * for depopulation.  They no longer contribute to pcpu_nr_empty_pop_pages once
+ * they are on this list.  Once depopulated, they are moved onto the sidelined
+ * list which enables them to be pulled back in for allocation if no other chunk
+ * can suffice the allocation.
+ */
+static bool pcpu_should_reclaim_chunk(struct pcpu_chunk *chunk)
+{
+	/* do not reclaim either the first chunk or reserved chunk */
+	if (chunk == pcpu_first_chunk || chunk == pcpu_reserved_chunk)
+		return false;
+
+	/*
+	 * If it is isolated, it may be on the sidelined list so move it back to
+	 * the to_depopulate list.  If we hit at least 1/4 pages empty pages AND
+	 * there is no system-wide shortage of empty pages aside from this
+	 * chunk, move it to the to_depopulate list.
+	 */
+	return ((chunk->isolated && chunk->nr_empty_pop_pages) ||
+		(pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] >
+		 PCPU_EMPTY_POP_PAGES_HIGH + chunk->nr_empty_pop_pages &&
+		 chunk->nr_empty_pop_pages >= chunk->nr_pages / 4));
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index e43e1e418603..ceae1ed8b5cf 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -135,6 +135,8 @@ static int pcpu_nr_units __ro_after_init;
 static int pcpu_atom_size __ro_after_init;
 int pcpu_nr_slots __ro_after_init;
 int pcpu_free_slot __ro_after_init;
+int pcpu_sidelined_slot __ro_after_init;
+int pcpu_to_depopulate_slot __ro_after_init;
 static size_t pcpu_chunk_struct_size __ro_after_init;

 /* cpus with the lowest and highest unit addresses */
@@ -561,10 +563,41 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);

+	/* leave isolated chunks in-place */
+	if (chunk->isolated)
+		return;
+
 	if (oslot != nslot)
 		__pcpu_chunk_move(chunk, nslot, oslot < nslot);
 }

+static void pcpu_isolate_chunk(struct pcpu_chunk *chunk)
+{
+	enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+	struct list_head *pcpu_slot = pcpu_chunk_list(type);
+
+	lockdep_assert_held(&pcpu_lock);
+
+	if (!chunk->isolated) {
+		chunk->isolated = true;
+		pcpu_nr_empty_pop_pages[type] -= chunk->nr_empty_pop_pages;
+	}
+	list_move(&chunk->list, &pcpu_slot[pcpu_to_depopulate_slot]);
+}
+
+static void pcpu_reintegrate_chunk(struct pcpu_chunk *chunk)
+{
+	enum pcpu_chunk_type type = pcpu_chunk_type(chunk);
+
+	lockdep_assert_held(&pcpu_lock);
+
+	if (chunk->isolated) {
+		chunk->isolated = false;
+		pcpu_nr_empty_pop_pages[type] += chunk->nr_empty_pop_pages;
+		pcpu_chunk_relocate(chunk, -1);
+	}
+}
+
 /*
  * pcpu_update_empty_pages - update empty page counters
  * @chunk: chunk of interest
@@ -577,7 +610,7 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
 {
 	chunk->nr_empty_pop_pages += nr;
-	if (chunk != pcpu_reserved_chunk)
+	if (chunk != pcpu_reserved_chunk && !chunk->isolated)
 		pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr;
 }

@@ -1777,7 +1810,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,

 restart:
 	/* search through normal chunks */
-	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+	for (slot = pcpu_size_to_slot(size); slot <= pcpu_free_slot; slot++) {
 		list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
 			off = pcpu_find_block_fit(chunk, bits, bit_align,
 						  is_atomic);
@@ -1788,9 +1821,10 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 			}

 			off = pcpu_alloc_area(chunk, bits, bit_align, off);
-			if (off >= 0)
+			if (off >= 0) {
+				pcpu_reintegrate_chunk(chunk);
 				goto area_found;
-
+			}
 		}
 	}

@@ -1951,10 +1985,13 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 /**
  * pcpu_balance_free - manage the amount of free chunks
  * @type: chunk type
+ * @empty_only: free chunks only if there are no populated pages
  *
- * Reclaim all fully free chunks except for the first one.
+ * If empty_only is %false, reclaim all fully free chunks regardless of the
+ * number of populated pages.  Otherwise, only reclaim chunks that have no
+ * populated pages.
  */
-static void pcpu_balance_free(enum pcpu_chunk_type type)
+static void pcpu_balance_free(enum pcpu_chunk_type type, bool empty_only)
 {
 	LIST_HEAD(to_free);
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
@@ -1974,7 +2011,8 @@ static void pcpu_balance_free(enum pcpu_chunk_type type)
 		if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
 			continue;

-		list_move(&chunk->list, &to_free);
+		if (!empty_only || chunk->nr_empty_pop_pages == 0)
+			list_move(&chunk->list, &to_free);
 	}

 	spin_unlock_irq(&pcpu_lock);
@@ -2082,20 +2120,121 @@ static void pcpu_balance_populated(enum pcpu_chunk_type type)
 	}
 }

+/**
+ * pcpu_reclaim_populated - scan over to_depopulate chunks and free empty pages
+ * @type: chunk type
+ *
+ * Scan over chunks in the depopulate list and try to release unused populated
+ * pages back to the system.  Depopulated chunks are sidelined to prevent
+ * repopulating these pages unless required.  Fully free chunks are reintegrated
+ * and freed accordingly (1 is kept around).  If we drop below the empty
+ * populated pages threshold, reintegrate the chunk if it has empty free pages.
+ * Each chunk is scanned in the reverse order to keep populated pages close to
+ * the beginning of the chunk.
+ */
+static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
+{
+	struct list_head *pcpu_slot = pcpu_chunk_list(type);
+	struct pcpu_chunk *chunk;
+	struct pcpu_block_md *block;
+	int i, end;
+
+	spin_lock_irq(&pcpu_lock);
+
+restart:
+	/*
+	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
+	 * longer discoverable to allocations whom may populate pages.  The only
+	 * other accessor is the free path which only returns area back to the
+	 * allocator not touching the populated bitmap.
+	 */
+	while (!list_empty(&pcpu_slot[pcpu_to_depopulate_slot])) {
+		chunk = list_first_entry(&pcpu_slot[pcpu_to_depopulate_slot],
+					 struct pcpu_chunk, list);
+		WARN_ON(chunk->immutable);
+
+		/*
+		 * Scan chunk's pages in the reverse order to keep populated
+		 * pages close to the beginning of the chunk.
+		 */
+		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
+			/* no more work to do */
+			if (chunk->nr_empty_pop_pages == 0)
+				break;
+
+			/* reintegrate chunk to prevent atomic alloc failures */
+			if (pcpu_nr_empty_pop_pages[type] <
+			    PCPU_EMPTY_POP_PAGES_HIGH) {
+				pcpu_reintegrate_chunk(chunk);
+				goto restart;
+			}
+
+			/*
+			 * If the page is empty and populated, start or
+			 * extend the (i, end) range.  If i == 0, decrease
+			 * i and perform the depopulation to cover the last
+			 * (first) page in the chunk.
+			 */
+			block = chunk->md_blocks + i;
+			if (block->contig_hint == PCPU_BITMAP_BLOCK_BITS &&
+			    test_bit(i, chunk->populated)) {
+				if (end == -1)
+					end = i;
+				if (i > 0)
+					continue;
+				i--;
+			}
+
+			/* depopulate if there is an active range */
+			if (end == -1)
+				continue;
+
+			spin_unlock_irq(&pcpu_lock);
+			pcpu_depopulate_chunk(chunk, i + 1, end + 1);
+			cond_resched();
+			spin_lock_irq(&pcpu_lock);
+
+			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+
+			/* reset the range and continue */
+			end = -1;
+		}
+
+		if (chunk->free_bytes == pcpu_unit_size)
+			pcpu_reintegrate_chunk(chunk);
+		else
+			list_move(&chunk->list,
+				  &pcpu_slot[pcpu_sidelined_slot]);
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+}
+
 /**
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Call pcpu_balance_free() and pcpu_balance_populated() for each chunk type.
+ * For each chunk type, manage the number of fully free chunks and the number of
+ * populated pages.  An important thing to consider is when pages are freed and
+ * how they contribute to the global counts.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
 	enum pcpu_chunk_type type;

+	/*
+	 * pcpu_balance_free() is called twice because the first time we may
+	 * trim pages in the active pcpu_nr_empty_pop_pages which may cause us
+	 * to grow other chunks.  This then gives pcpu_reclaim_populated() time
+	 * to move fully free chunks to the active list to be freed if
+	 * appropriate.
+	 */
 	for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
 		mutex_lock(&pcpu_alloc_mutex);
-		pcpu_balance_free(type);
+		pcpu_balance_free(type, false);
+		pcpu_reclaim_populated(type);
 		pcpu_balance_populated(type);
+		pcpu_balance_free(type, true);
 		mutex_unlock(&pcpu_alloc_mutex);
 	}
 }
@@ -2136,8 +2275,12 @@ void free_percpu(void __percpu *ptr)

 	pcpu_memcg_free_hook(chunk, off, size);

-	/* if there are more than one fully free chunks, wake up grim reaper */
-	if (chunk->free_bytes == pcpu_unit_size) {
+	/*
+	 * If there are more than one fully free chunks, wake up grim reaper.
+	 * If the chunk is isolated, it may be in the process of being
+	 * reclaimed.  Let reclaim manage cleaning up of that chunk.
+	 */
+	if (!chunk->isolated && chunk->free_bytes == pcpu_unit_size) {
 		struct pcpu_chunk *pos;

 		list_for_each_entry(pos, &pcpu_slot[pcpu_free_slot], list)
@@ -2145,6 +2288,9 @@ void free_percpu(void __percpu *ptr)
 			if (pos != chunk) {
 				need_balance = true;
 				break;
 			}
+	} else if (pcpu_should_reclaim_chunk(chunk)) {
+		pcpu_isolate_chunk(chunk);
+		need_balance = true;
 	}

 	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
@@ -2559,11 +2705,15 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_stats_save_ai(ai);

 	/*
-	 * Allocate chunk slots.  The additional last slot is for
-	 * empty chunks.
+	 * Allocate chunk slots.  The slots after the active slots are:
+	 *   sidelined_slot - isolated, depopulated chunks
+	 *   free_slot - fully free chunks
+	 *   to_depopulate_slot - isolated, chunks to depopulate
 	 */
-	pcpu_free_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
-	pcpu_nr_slots = pcpu_free_slot + 1;
+	pcpu_sidelined_slot = __pcpu_size_to_slot(pcpu_unit_size) + 1;
+	pcpu_free_slot = pcpu_sidelined_slot + 1;
+	pcpu_to_depopulate_slot = pcpu_free_slot + 1;
+	pcpu_nr_slots = pcpu_to_depopulate_slot + 1;
 	pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
 					  sizeof(pcpu_chunk_lists[0]) *
 					  PCPU_NR_CHUNK_TYPES,
From: Dennis Zhou <dennis@kernel.org>

mainline inclusion
from mainline-v5.14-rc1
commit 93274f1dd6b0a615b299beddf99871fe81f91275
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4BE79
CVE: NA
-------------------------------------------------

Prior to "percpu: implement partial chunk depopulation", pcpu_depopulate_chunk() was called only on the destruction path. This meant the virtual address range was on its way back to vmalloc, which would handle flushing the TLBs for us.

However, with pcpu_reclaim_populated(), we are now calling pcpu_depopulate_chunk() during the active lifecycle of a chunk. Therefore, we need to flush the TLB as well, otherwise we can end up accessing the wrong page through an invalid TLB mapping, as reported in [1].
[1] https://lore.kernel.org/lkml/20210702191140.GA3166599@roeck-us.net/
Fixes: f183324133ea ("percpu: implement partial chunk depopulation")
Reported-and-tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Dennis Zhou <dennis@kernel.org>
(cherry picked from commit 93274f1dd6b0a615b299beddf99871fe81f91275)
Conflicts:
	mm/percpu.c
Small content conflicts because mainline-v5.14-rc1 commit faf65dde844affa9e360ccaa4bd231c2a04b87ea reworked memcg accounting.
Signed-off-by: Yuanzheng Song <songyuanzheng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/percpu-km.c |  6 ++++++
 mm/percpu-vm.c |  5 +++--
 mm/percpu.c    | 32 ++++++++++++++++++++++++++------
 3 files changed, 35 insertions(+), 8 deletions(-)
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index c84a9f781a6c..01e31bd55860 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -32,6 +32,12 @@

 #include <linux/log2.h>

+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end)
+{
+	/* nothing */
+}
+
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
 			       int page_start, int page_end, gfp_t gfp)
 {
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index c75f6f24f2d5..ba07eb5e014b 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -302,6 +302,9 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.
  *
+ * Caller is required to call pcpu_post_unmap_tlb_flush() if not returning the
+ * region back to vmalloc() which will lazily flush the tlb.
+ *
  * CONTEXT:
  * pcpu_alloc_mutex.
  */
@@ -323,8 +326,6 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,

 	pcpu_unmap_pages(chunk, pages, page_start, page_end);

-	/* no need to flush tlb, vmalloc will handle it lazily */
-
 	pcpu_free_pages(chunk, pages, page_start, page_end);
 }

diff --git a/mm/percpu.c b/mm/percpu.c
index ceae1ed8b5cf..42d95c0f5c4f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1579,6 +1579,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
 *
 * pcpu_populate_chunk		- populate the specified range of a chunk
 * pcpu_depopulate_chunk	- depopulate the specified range of a chunk
+ * pcpu_post_unmap_tlb_flush	- flush tlb for the specified range of a chunk
 * pcpu_create_chunk		- create a new chunk
 * pcpu_destroy_chunk		- destroy a chunk, always preceded by full depop
 * pcpu_addr_to_page		- translate address to physical address
@@ -1590,6 +1591,8 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
 				  int page_start, int page_end);
 static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
 					    gfp_t gfp);
+static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
+				      int page_start, int page_end);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -2137,11 +2140,12 @@ static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
 	struct list_head *pcpu_slot = pcpu_chunk_list(type);
 	struct pcpu_chunk *chunk;
 	struct pcpu_block_md *block;
+	int freed_page_start, freed_page_end;
 	int i, end;
+	bool reintegrate;

 	spin_lock_irq(&pcpu_lock);

-restart:
 	/*
 	 * Once a chunk is isolated to the to_depopulate list, the chunk is no
 	 * longer discoverable to allocations whom may populate pages.  The only
 	 * other accessor is the free path which only returns area back to the
 	 * allocator not touching the populated bitmap.
 	 */
 	while (!list_empty(&pcpu_slot[pcpu_to_depopulate_slot])) {
 		chunk = list_first_entry(&pcpu_slot[pcpu_to_depopulate_slot],
 					 struct pcpu_chunk, list);
 		WARN_ON(chunk->immutable);
@@ -2157,6 +2161,9 @@ static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
 		 * Scan chunk's pages in the reverse order to keep populated
 		 * pages close to the beginning of the chunk.
 		 */
+		freed_page_start = chunk->nr_pages;
+		freed_page_end = 0;
+		reintegrate = false;
 		for (i = chunk->nr_pages - 1, end = -1; i >= 0; i--) {
 			/* no more work to do */
 			if (chunk->nr_empty_pop_pages == 0)
 				break;
@@ -2165,8 +2172,8 @@ static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
 			/* reintegrate chunk to prevent atomic alloc failures */
 			if (pcpu_nr_empty_pop_pages[type] <
 			    PCPU_EMPTY_POP_PAGES_HIGH) {
-				pcpu_reintegrate_chunk(chunk);
-				goto restart;
+				reintegrate = true;
+				goto end_chunk;
 			}

 			/*
@@ -2195,16 +2202,29 @@ static void pcpu_reclaim_populated(enum pcpu_chunk_type type)
 			spin_lock_irq(&pcpu_lock);

 			pcpu_chunk_depopulated(chunk, i + 1, end + 1);
+			freed_page_start = min(freed_page_start, i + 1);
+			freed_page_end = max(freed_page_end, end + 1);

 			/* reset the range and continue */
 			end = -1;
 		}

-		if (chunk->free_bytes == pcpu_unit_size)
+end_chunk:
+		/* batch tlb flush per chunk to amortize cost */
+		if (freed_page_start < freed_page_end) {
+			spin_unlock_irq(&pcpu_lock);
+			pcpu_post_unmap_tlb_flush(chunk,
+						  freed_page_start,
+						  freed_page_end);
+			cond_resched();
+			spin_lock_irq(&pcpu_lock);
+		}
+
+		if (reintegrate || chunk->free_bytes == pcpu_unit_size)
 			pcpu_reintegrate_chunk(chunk);
 		else
-			list_move(&chunk->list,
-				  &pcpu_slot[pcpu_sidelined_slot]);
+			list_move_tail(&chunk->list,
+				       &pcpu_slot[pcpu_sidelined_slot]);
 	}

 	spin_unlock_irq(&pcpu_lock);
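[Editor's aside] The mm/percpu-vm.c implementation of the new hook is not visible in the hunks above; based on the mainline commit it flushes the kernel mapping over the freed range, along these lines (a sketch, not the verbatim backport):

/* Flush the TLB for the vmalloc-space addresses that backed the
 * depopulated pages, spanning from the lowest to the highest unit
 * in the chunk. */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end)
{
	flush_tlb_kernel_range(
		pcpu_chunk_addr(chunk, pcpu_low_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_high_unit_cpu, page_end));
}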
From: Mike Kravetz <mike.kravetz@oracle.com>

mainline inclusion
from mainline-v5.12-rc1
commit ff5461176213d5fd5cfb7e981f9add4d856e415a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZCW9
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------------------------
Gerald Schaefer reported a panic on s390 in hugepage_subpool_put_pages() with linux-next 5.12.0-20210222.

Call trace:
  hugepage_subpool_put_pages.part.0+0x2c/0x138
  __free_huge_page+0xce/0x310
  alloc_pool_huge_page+0x102/0x120
  set_max_huge_pages+0x13e/0x350
  hugetlb_sysctl_handler_common+0xd8/0x110
  hugetlb_sysctl_handler+0x48/0x58
  proc_sys_call_handler+0x138/0x238
  new_sync_write+0x10e/0x198
  vfs_write.part.0+0x12c/0x238
  ksys_write+0x68/0xf8
  do_syscall+0x82/0xd0
  __do_syscall+0xb4/0xc8
  system_call+0x72/0x98
This is a result of the change which moved the hugetlb page subpool pointer from page->private to page[1]->private. When new pages are allocated from the buddy allocator, the private field of the head page will be cleared, but the private field of subpages is not modified. Therefore, old values may remain.
Fix by initializing hugetlb page subpool pointer in prep_new_huge_page().
Link: https://lkml.kernel.org/r/20210223215544.313871-1-mike.kravetz@oracle.com
Fixes: f1280272ae4d ("hugetlb: use page.private for hugetlb specific page flags")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reported-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/hugetlb.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ac16161d5297..a7606831e66a 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1591,6 +1591,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 	free_huge_page_vmemmap(h, page);
 	INIT_LIST_HEAD(&page->lru);
 	set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+	hugetlb_set_page_subpool(page, NULL);
 	set_hugetlb_cgroup(page, NULL);
 	set_hugetlb_cgroup_rsvd(page, NULL);
 	spin_lock_irq(&hugetlb_lock);
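[Editor's aside] For context, the subpool pointer now lives in the first tail page rather than the head page; the accessor, as it appeared around v5.12 in include/linux/hugetlb.h (quoted from memory, so treat as a sketch), is roughly:

/* page[1].private holds the subpool because the head page's private
 * field was repurposed for hugetlb-specific flags; this is why stale
 * buddy-allocator values in tail pages could leak through. */
static inline void hugetlb_set_page_subpool(struct page *hpage,
					    struct hugepage_subpool *subpool)
{
	set_page_private(hpage + 1, (unsigned long)subpool);
}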
From: Mike Kravetz <mike.kravetz@oracle.com>

mainline inclusion
from mainline-v5.13-rc3
commit e32905e57358fdfb82f9de024534f205b3af7dac
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZCW9
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-----------------------------------------------
In commit d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags") the use of PagePrivate to indicate a reservation count should be restored at free time was changed to the hugetlb specific flag HPageRestoreReserve. Changes to a userfaultfd error path as well as a VM_BUG_ON() in remove_inode_hugepages() were overlooked.
Users could see incorrect hugetlb reserve counts if they experience an error with a UFFDIO_COPY operation. Specifically, this would be the result of an unlikely copy_huge_page_from_user error. There is not an increased chance of hitting the VM_BUG_ON.
Link: https://lkml.kernel.org/r/20210521233952.236434-1-mike.kravetz@oracle.com
Fixes: d6995da31122 ("hugetlb: use page.private for hugetlb specific page flags")
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Mina Almasry <almasry.mina@google.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Mina Almasry <almasrymina@google.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/hugetlbfs/inode.c |  2 +-
 mm/userfaultfd.c     | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index dd8831a78cac..f7c17a7591b4 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -532,7 +532,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 				 * the subpool and global reserve usage count can need
 				 * to be adjusted.
 				 */
-				VM_BUG_ON(PagePrivate(page));
+				VM_BUG_ON(HPageRestoreReserve(page));
 				remove_huge_page(page);
 				freed++;
 				if (!truncate_op) {
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index a8676b57abb8..c1f984879fda 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -388,38 +388,38 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		 * If a reservation for the page existed in the reservation
 		 * map of a private mapping, the map was modified to indicate
 		 * the reservation was consumed when the page was allocated.
-		 * We clear the PagePrivate flag now so that the global
+		 * We clear the HPageRestoreReserve flag now so that the global
 		 * reserve count will not be incremented in free_huge_page.
 		 * The reservation map will still indicate the reservation
 		 * was consumed and possibly prevent later page allocation.
 		 * This is better than leaking a global reservation.  If no
-		 * reservation existed, it is still safe to clear PagePrivate
-		 * as no adjustments to reservation counts were made during
-		 * allocation.
+		 * reservation existed, it is still safe to clear
+		 * HPageRestoreReserve as no adjustments to reservation counts
+		 * were made during allocation.
 		 *
 		 * The reservation map for shared mappings indicates which
 		 * pages have reservations.  When a huge page is allocated
 		 * for an address with a reservation, no change is made to
-		 * the reserve map.  In this case PagePrivate will be set
-		 * to indicate that the global reservation count should be
+		 * the reserve map.  In this case HPageRestoreReserve will be
+		 * set to indicate that the global reservation count should be
 		 * incremented when the page is freed.  This is the desired
 		 * behavior.  However, when a huge page is allocated for an
 		 * address without a reservation a reservation entry is added
-		 * to the reservation map, and PagePrivate will not be set.
-		 * When the page is freed, the global reserve count will NOT
-		 * be incremented and it will appear as though we have leaked
-		 * reserved page.  In this case, set PagePrivate so that the
-		 * global reserve count will be incremented to match the
-		 * reservation map entry which was created.
+		 * to the reservation map, and HPageRestoreReserve will not be
+		 * set.  When the page is freed, the global reserve count will
+		 * NOT be incremented and it will appear as though we have
+		 * leaked reserved page.  In this case, set HPageRestoreReserve
+		 * so that the global reserve count will be incremented to
+		 * match the reservation map entry which was created.
 		 *
 		 * Note that vm_alloc_shared is based on the flags of the vma
 		 * for which the page was originally allocated.  dst_vma could
 		 * be different or NULL on error.
 		 */
 		if (vm_alloc_shared)
-			SetPagePrivate(page);
+			SetHPageRestoreReserve(page);
 		else
-			ClearPagePrivate(page);
+			ClearHPageRestoreReserve(page);
 		put_page(page);
 	}
 	BUG_ON(copied < 0);
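[Editor's aside] For reference, the hugetlb flag accessors that replace the PagePrivate calls are generated by a macro in include/linux/hugetlb.h; a simplified sketch of the d6995da31122 scheme (field and macro names abridged from memory):

/* Hugetlb-private flags are bits in the head page's page->private, so
 * generic PagePrivate() tests no longer observe the reservation state. */
enum hugetlb_page_flags {
	HPG_restore_reserve = 0,
	/* ... */
};

#define HPAGEFLAG(uname, flname)					\
static inline bool HPage##uname(struct page *page)			\
{ return test_bit(HPG_##flname, &page->private); }			\
static inline void SetHPage##uname(struct page *page)			\
{ set_bit(HPG_##flname, &page->private); }				\
static inline void ClearHPage##uname(struct page *page)		\
{ clear_bit(HPG_##flname, &page->private); }

HPAGEFLAG(RestoreReserve, restore_reserve)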
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

mainline inclusion
from mainline-v5.13-rc5
commit 0c5da35723a961d8c02ea516da2bcfeb007d7d2c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I3ZCW9
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------------------------
When memory_failure() or soft_offline_page() is called on a tail page of some hugetlb page, a "BUG: unable to handle page fault" error can be triggered.

remove_hugetlb_page() dereferences page->lru, so it's assumed that the page points to a head page, but one of the callers, dissolve_free_huge_page(), provides remove_hugetlb_page() with 'page' which could be a tail page. So pass 'head' to it instead.
Link: https://lkml.kernel.org/r/20210526235257.2769473-1-nao.horiguchi@gmail.com
Fixes: 6eb4e88a6d27 ("hugetlb: create remove_hugetlb_page() to separate functionality")
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Muchun Song <songmuchun@bytedance.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Reviewed-by: Oscar Salvador <osalvador@suse.de>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Nanyong Sun <sunnanyong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/hugetlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a7606831e66a..7e9ab6cf8729 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1882,7 +1882,7 @@ int dissolve_free_huge_page(struct page *page)
 			goto retry;
 		}

-		remove_hugetlb_page(h, page, false);
+		remove_hugetlb_page(h, head, false);
 		h->max_huge_pages--;
 		spin_unlock_irq(&hugetlb_lock);
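[Editor's aside] The head/tail distinction behind this fix comes from how compound pages encode their head; compound_head() (from include/linux/page-flags.h, quoted from memory) is roughly:

/* Tail pages store (head | 1) in page->compound_head; anything that
 * touches fields like page->lru, as remove_hugetlb_page() does, must
 * therefore resolve to the head page first. */
static inline struct page *compound_head(struct page *page)
{
	unsigned long head = READ_ONCE(page->compound_head);

	if (unlikely(head & 1))
		return (struct page *)(head - 1);
	return page;
}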
From: Hanjun Guo <guohanjun@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CGER
CVE: NA
---------------------------
Add support for PHYTIUM topology detection. It would be better to use the PPTT ACPI table to report the topology, but we can live with this for now.
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/cputype.h | 1 +
 arch/arm64/kernel/topology.c     | 5 +++++
 2 files changed, 6 insertions(+)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index ef5b040dee44..7e2811d726e0 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -59,6 +59,7 @@
 #define ARM_CPU_IMP_NVIDIA		0x4E
 #define ARM_CPU_IMP_FUJITSU		0x46
 #define ARM_CPU_IMP_HISI		0x48
+#define ARM_CPU_IMP_PHYTIUM		0x70

 #define ARM_CPU_PART_AEM_V8		0xD0F
 #define ARM_CPU_PART_FOUNDATION		0xD00
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 543c67cae02f..e5c9ac4840c6 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -53,6 +53,11 @@ void store_cpu_topology(unsigned int cpuid)
 	cpuid_topo->thread_id  = -1;
 	cpuid_topo->core_id    = cpuid;
 	cpuid_topo->package_id = cpu_to_node(cpuid);
+	if (read_cpuid_implementor() == ARM_CPU_IMP_PHYTIUM) {
+		cpuid_topo->thread_id = 0;
+		cpuid_topo->core_id = cpuid;
+		cpuid_topo->package_id = 0;
+	}

 	pr_debug("CPU%u: cluster %d core %d thread %d mpidr %#016llx\n",
 		 cpuid, cpuid_topo->package_id, cpuid_topo->core_id,
From: Hanjun Guo <guohanjun@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4CGER
CVE: NA
---------------------------
Add a workaround for PHYTIUM, as the firmware doesn't report the DMA size info.
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/usb/host/xhci-pci.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 44dd77343cc1..3fe73d2f39e9 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -371,6 +371,18 @@ static int xhci_pci_setup(struct usb_hcd *hcd)
 	return xhci_pci_reinit(xhci, pdev);
 }

+#ifdef CONFIG_ARM64
+#include <asm/cputype.h>
+static void phytium_xhci_pci_workaround(struct pci_dev *dev)
+{
+	/* Firmware bug, DMA mask is not reported by the firmware */
+	if (read_cpuid_implementor() == ARM_CPU_IMP_PHYTIUM)
+		dma_set_mask(&dev->dev, DMA_BIT_MASK(64));
+}
+#else
+static inline void phytium_xhci_pci_workaround(struct pci_dev *dev) { }
+#endif
+
 /*
  * We need to register our own PCI probe function (instead of the USB core's
  * function) in order to create a second roothub under xHCI.
@@ -395,6 +407,8 @@ static int xhci_pci_probe(struct pci_dev *dev, const struct pci_device_id *id)
 		return PTR_ERR(reset);
 	reset_control_reset(reset);

+	phytium_xhci_pci_workaround(dev);
+
 	/* Prevent runtime suspending between USB-2 and USB-3 initialization */
 	pm_runtime_get_noresume(&dev->dev);
From: Weilong Chen <chenweilong@huawei.com>

ascend inclusion
category: feature
bugzilla: 46922, https://gitee.com/openeuler/kernel/issues/I41AUQ
CVE: NA
-------------------------------------
Add the MIDR encodings for HiSilicon Taishan v200 CPUs, which are used in Kunpeng ARM64 server SoCs. TSV200 is the abbreviation of Taishan v200. There are two variants of TSV200, variant 0 and variant 1.
Signed-off-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/cputype.h | 2 ++
 1 file changed, 2 insertions(+)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 7e2811d726e0..1f6a9d26aa39 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -99,6 +99,7 @@
 #define FUJITSU_CPU_PART_A64FX		0x001

 #define HISI_CPU_PART_TSV110		0xD01
+#define HISI_CPU_PART_TSV200		0xD02

 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
@@ -128,6 +129,7 @@
 #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL)
 #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
 #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
+#define MIDR_HISI_TSV200 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV200)

 /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
 #define MIDR_FUJITSU_ERRATUM_010001	MIDR_FUJITSU_A64FX
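[Editor's aside] For context, MIDR model values such as MIDR_HISI_TSV200 are composed from the implementor and part-number fields; the composing macro in cputype.h is roughly this (quoted from memory, so treat as a sketch):

/* Pack implementor, architecture and part-number fields into the layout
 * of the MIDR_EL1 register, for comparison against read_cpuid_id(). */
#define MIDR_CPU_MODEL(imp, partnum)			\
	(((imp)		<< MIDR_IMPLEMENTOR_SHIFT) |	\
	 (0xf		<< MIDR_ARCHITECTURE_SHIFT) |	\
	 ((partnum)	<< MIDR_PARTNUM_SHIFT))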
From: Guo Hui <guohui@uniontech.com>

uniontech inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I41AUQ
CVE: NA
-------------------------------------
Add the MIDR encodings for PHYTIUM 2000+ and 2500 CPUs.
Signed-off-by: Guo Hui <guohui@uniontech.com>
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Cc: Guo Hui <guohui@uniontech.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Xiuqi Xie <xiexiuqi@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/include/asm/cputype.h | 5 +++++
 1 file changed, 5 insertions(+)
diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h
index 1f6a9d26aa39..4e5b34c8ff54 100644
--- a/arch/arm64/include/asm/cputype.h
+++ b/arch/arm64/include/asm/cputype.h
@@ -101,6 +101,9 @@
 #define HISI_CPU_PART_TSV110		0xD01
 #define HISI_CPU_PART_TSV200		0xD02

+#define PHYTIUM_CPU_PART_FTC662		0x662
+#define PHYTIUM_CPU_PART_FTC663		0x663
+
 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
 #define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72)
@@ -130,6 +133,8 @@
 #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX)
 #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110)
 #define MIDR_HISI_TSV200 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV200)
+#define MIDR_PHYTIUM_FT2000PLUS MIDR_CPU_MODEL(ARM_CPU_IMP_PHYTIUM, PHYTIUM_CPU_PART_FTC662)
+#define MIDR_PHYTIUM_FT2500 MIDR_CPU_MODEL(ARM_CPU_IMP_PHYTIUM, PHYTIUM_CPU_PART_FTC663)

 /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */
 #define MIDR_FUJITSU_ERRATUM_010001	MIDR_FUJITSU_A64FX
From: Hanjun Guo <guohanjun@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I41AUQ
CVE: NA
-------------------------------------
Update the code to use MIDR_PHYTIUM_FT2000PLUS instead of ARM_CPU_IMP_PHYTIUM, which distinguishes FTC662 from FTC663.
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Cc: Guo Hui <guohui@uniontech.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Xiuqi Xie <xiexiuqi@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/kernel/topology.c | 6 ++++--
 drivers/usb/host/xhci-pci.c  | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index e5c9ac4840c6..7a487976ce8b 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -53,9 +53,11 @@ void store_cpu_topology(unsigned int cpuid)
 	cpuid_topo->thread_id  = -1;
 	cpuid_topo->core_id    = cpuid;
 	cpuid_topo->package_id = cpu_to_node(cpuid);
-	if (read_cpuid_implementor() == ARM_CPU_IMP_PHYTIUM) {
+
+	/* Some PHYTIUM FT2000PLUS platform firmware has no PPTT table */
+	if ((read_cpuid_id() & MIDR_CPU_MODEL_MASK) == MIDR_PHYTIUM_FT2000PLUS
+		&& cpu_to_node(cpuid) == NUMA_NO_NODE) {
 		cpuid_topo->thread_id = 0;
-		cpuid_topo->core_id = cpuid;
 		cpuid_topo->package_id = 0;
 	}

diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index 3fe73d2f39e9..8b15c2644de6 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -375,8 +375,10 @@ static int xhci_pci_setup(struct usb_hcd *hcd)
 #include <asm/cputype.h>
 static void phytium_xhci_pci_workaround(struct pci_dev *dev)
 {
+	u32 midr = read_cpuid_id();
+
 	/* Firmware bug, DMA mask is not reported by the firmware */
-	if (read_cpuid_implementor() == ARM_CPU_IMP_PHYTIUM)
+	if ((midr & MIDR_CPU_MODEL_MASK) == MIDR_PHYTIUM_FT2000PLUS)
 		dma_set_mask(&dev->dev, DMA_BIT_MASK(64));
 }
 #else
From: Hanjun Guo <guohanjun@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I41AUQ
CVE: NA
-------------------------------------
Support SMMU default bypass for some CPU SoCs on which the SMMU does not work well in address translation mode.

We already have the .def_domain_type hook for iommu_ops in the iommu driver, so add the CPU SoC SMMU bypass code to the .def_domain_type hook in the SMMUv2 driver and return IOMMU_DOMAIN_IDENTITY for such SoCs.

After we add the hook, all devices on such SoCs are put in pass-through mode, no matter whether iommu.passthrough=off/on is added to the boot cmdline or not.

While we are at it, update the SMMU_BYPASS_DEV config to clarify its usage.
Signed-off-by: Hanjun Guo <guohanjun@huawei.com>
Cc: Guo Hui <guohui@uniontech.com>
Cc: Cheng Jian <cj.chengjian@huawei.com>
Cc: Zhen Lei <thunder.leizhen@huawei.com>
Cc: Xiuqi Xie <xiexiuqi@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/iommu/Kconfig                 | 13 ++++++++-----
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 22 ++++++++++++++++++++++
 2 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index a65f1f835e7d..d6c04563f60d 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -409,13 +409,16 @@ config VIRTIO_IOMMU

 	  Say Y here if you intend to run this kernel as a guest.

-config SMMU_BYPASS_DEV
+config SMMU_BYPASS_DEV
 	bool "SMMU bypass streams for some specific devices"
 	depends on ARM_SMMU_V3=y
 	help
-	  according smmu.bypassdev cmdline, SMMU performs attribute
-	  transformation only,with no address translation.
-	  E.g:SMMU allow iMR3408/3416 Raid bypass at DMA default domain
-	  to support other devices Virtualization through.
+	  Using the smmu.bypassdev cmdline, to collect the devices that SMMU
+	  performs attribute transformation only, with no address translation.
+	  E.g:SMMU allow iMR3408/3416 Raid bypass at DMA default domain to
+	  support other devices to use virtualization such as VFIO.
+
+	  This feature will be replaced by ACPI IORT RMR node, which will be
+	  upstreamed in mainline.

 endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index e96983dac939..1f951070f519 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -1581,11 +1581,33 @@ static void arm_smmu_get_resv_regions(struct device *dev,
 	iommu_dma_get_resv_regions(dev, head);
 }

+#ifdef CONFIG_ARM64
+#include <asm/cputype.h>
+static bool cpu_using_identity_iommu_domain(struct device *dev)
+{
+	u32 midr = read_cpuid_id();
+
+	if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_PHYTIUM_FT2000PLUS)
+		|| ((midr & MIDR_CPU_MODEL_MASK) == MIDR_PHYTIUM_FT2500))
+		return true;
+
+	return false;
+}
+#else
+static bool cpu_using_identity_iommu_domain(struct device *dev)
+{
+	return false;
+}
+#endif
+
 static int arm_smmu_def_domain_type(struct device *dev)
 {
 	struct arm_smmu_master_cfg *cfg = dev_iommu_priv_get(dev);
 	const struct arm_smmu_impl *impl = cfg->smmu->impl;

+	if (cpu_using_identity_iommu_domain(dev))
+		return IOMMU_DOMAIN_IDENTITY;
+
 	if (impl && impl->def_domain_type)
 		return impl->def_domain_type(dev);