From: Jaewon Kim <jaewon31.kim@samsung.com>
mainline inclusion
from mainline-v5.8-rc1
commit 1f318a9b0dc3990490e98eef48f21e6f15185781
category: bugfix
bugzilla: 36239
CVE: NA
-------------------------------------------------
Fix an nr_isolate_* mismatch problem between cma and dirty lazyfree pages.
When try_to_unmap_one is used for reclaim and it detects a dirty lazyfree page, the lazyfree page is changed back into a normal anon page with SwapBacked set, as introduced by commit 802a3a92ad7a ("mm: reclaim MADV_FREE pages"). Even with that change, the reclaim context still counts isolated pages correctly, because it uses is_file_lru to distinguish file pages. The change to anon does not happen when try_to_unmap_one is used for migration, so a migration context such as compaction also counts isolated files correctly, even though it uses page_is_file_lru instead of is_file_lru. (page_is_file_cache was recently renamed to page_is_file_lru by commit 9de4f22a60f7 ("mm: code cleanup for MADV_FREE").)
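For reference, the two helpers differ in what they inspect; roughly, in this kernel generation they look like the following (paraphrased, not a verbatim copy of the tree):

/* Keyed on the LRU list the page was isolated from. */
static inline int is_file_lru(enum lru_list lru)
{
	return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE);
}

/*
 * Keyed on the page itself: once a failed-to-discard lazyfree page gets
 * PageSwapBacked set again, it no longer counts as "file" here, even
 * though it was isolated from a file LRU.
 */
static inline int page_is_file_cache(struct page *page)	/* later renamed page_is_file_lru */
{
	return !PageSwapBacked(page);
}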
But the nr_isolate_* mismatch problem does happen on cma alloc. reclaim_clean_pages_from_list, which is used only by cma, was introduced by commit 02c6de8d757c ("mm: cma: discard clean pages during contiguous allocation instead of migration") to reclaim clean file pages without migration. cma alloc uses both reclaim_clean_pages_from_list and migrate_pages, and it uses page_is_file_lru to count isolated files. If there are dirty lazyfree pages allocated from the cma memory region, those pages are counted as isolated file at the beginning, but are counted as isolated anon once reclaim has finished.
Mem-Info: Node 0 active_anon:3045904kB inactive_anon:611448kB active_file:14892kB inactive_file:205636kB unevictable:10416kB isolated(anon):0kB isolated(file):37664kB mapped:630216kB dirty:384kB writeback:0kB shmem:42576kB writeback_tmp:0kB unstable:0kB all_unreclaimable? no
As the log above shows, there were far too many isolated file pages, 37664kB, which triggers too_many_isolated in reclaim even though there is actually no isolated file page system-wide. It can be reproduced by running two programs: one writing to MADV_FREE pages and one doing cma alloc. Although isolated(anon) shows 0, I found that the internal value of isolated anon was actually the negative of the isolated file count.
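A minimal sketch of the MADV_FREE side of such a reproducer (the cma side needs something that repeatedly performs cma alloc, e.g. a test driver, and is not shown; the mapping size and loop here are illustrative only):

#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return 1;

	for (;;) {
		/* Mark the range lazyfree, then dirty it again so reclaim
		 * and cma alloc keep seeing dirty lazyfree pages. */
		madvise(buf, len, MADV_FREE);
		memset(buf, 1, len);
	}
}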
Fix this by compensating the isolated count for both LRU lists. Count non-discarded lazyfree pages in shrink_page_list, then compensate the counted number in reclaim_clean_pages_from_list.
Reported-by: Yong-Taek Lee <ytk.lee@samsung.com>
Suggested-by: Minchan Kim <minchan@kernel.org>
Signed-off-by: Jaewon Kim <jaewon31.kim@samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Minchan Kim <minchan@kernel.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Marek Szyprowski <m.szyprowski@samsung.com>
Cc: Michal Nazarewicz <mina86@mina86.com>
Cc: Shaohua Li <shli@fb.com>
Link: http://lkml.kernel.org/r/20200426011718.30246-1-jaewon31.kim@samsung.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 include/linux/vmstat.h |  1 +
 mm/vmscan.c            | 27 ++++++++++++++++++++++-----
 2 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index f25cef84b41db..0beaea0e5b832 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -29,6 +29,7 @@ struct reclaim_stat {
 	unsigned nr_activate;
 	unsigned nr_ref_keep;
 	unsigned nr_unmap_fail;
+	unsigned nr_lazyfree_fail;
 };
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6f3c655fc8879..bedea8b4024a8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1129,6 +1129,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 	unsigned nr_immediate = 0;
 	unsigned nr_ref_keep = 0;
 	unsigned nr_unmap_fail = 0;
+	unsigned nr_lazyfree_fail = 0;
 
 	cond_resched();
 
@@ -1336,11 +1337,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 */
 		if (page_mapped(page)) {
 			enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+			bool was_swapbacked = PageSwapBacked(page);
 
 			if (unlikely(PageTransHuge(page)))
 				flags |= TTU_SPLIT_HUGE_PMD;
+
 			if (!try_to_unmap(page, flags)) {
 				nr_unmap_fail++;
+				if (!was_swapbacked && PageSwapBacked(page))
+					nr_lazyfree_fail++;
 				goto activate_locked;
 			}
 		}
@@ -1519,6 +1524,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		stat->nr_activate = pgactivate;
 		stat->nr_ref_keep = nr_ref_keep;
 		stat->nr_unmap_fail = nr_unmap_fail;
+		stat->nr_lazyfree_fail = nr_lazyfree_fail;
 	}
 	return nr_reclaimed;
 }
@@ -1531,7 +1537,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		.priority = DEF_PRIORITY,
 		.may_unmap = 1,
 	};
-	unsigned long ret;
+	struct reclaim_stat stat;
+	unsigned long nr_reclaimed;
 	struct page *page, *next;
 	LIST_HEAD(clean_pages);
 
@@ -1543,11 +1550,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 		}
 	}
 
-	ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-			TTU_IGNORE_ACCESS, NULL, true);
+	nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+			TTU_IGNORE_ACCESS, &stat, true);
 	list_splice(&clean_pages, page_list);
-	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
-	return ret;
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+	/*
+	 * Since lazyfree pages are isolated from file LRU from the beginning,
+	 * they will rotate back to anonymous LRU in the end if it failed to
+	 * discard so isolated count will be mismatched.
+	 * Compensate the isolated count for both LRU lists.
+	 */
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+			    stat.nr_lazyfree_fail);
+	mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+			    -stat.nr_lazyfree_fail);
+	return nr_reclaimed;
 }
 
 /*
From: Muchun Song <songmuchun@bytedance.com>
mainline inclusion
from mainline-v5.10-rc5
commit 8faeb1ffd79593c9cd8a2a80ecdda371e3b826cb
category: bugfix
bugzilla: 46731
CVE: NA
-------------------------------------------------
If we reparent slab objects to the root memcg, then when we free a slab object we need to update the per-memcg vmstats to keep them correct for the root memcg. At the moment this at least affects the NR_KERNEL_STACK_KB vmstat for !CONFIG_VMAP_STACK when the thread stack size is smaller than PAGE_SIZE.
David said: "I assume that without this fix that the root memcg's vmstat would always be inflated if we reparented"
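For context, memcg_from_slab_page() (used in the hunk below) resolves the memcg through the slab cache, so after a child memcg's kmem caches are reparented it can legitimately return root_mem_cgroup; a rough sketch of the idea (not the exact mainline body):

static struct mem_cgroup *memcg_from_slab_page(struct page *page)
{
	struct kmem_cache *s = READ_ONCE(page->slab_cache);

	/* After reparenting, memcg_params.memcg points at the parent,
	 * possibly root_mem_cgroup, so root must not be skipped below. */
	if (s && !is_root_cache(s))
		return s->memcg_params.memcg;

	return NULL;
}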
Fixes: ec9f02384f60 ("mm: workingset: fix vmstat counters for shadow nodes")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Christopher Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yafang Shao <laoar.shao@gmail.com>
Cc: Chris Down <chris@chrisdown.name>
Cc: <stable@vger.kernel.org> [5.3+]
Link: https://lkml.kernel.org/r/20201110031015.15715-1-songmuchun@bytedance.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/memcontrol.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 88ab44a5696fd..6b4b487f6ce75 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -779,8 +779,13 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 	rcu_read_lock();
 	memcg = memcg_from_slab_page(page);
 
-	/* Untracked pages have no memcg, no lruvec. Update only the node */
-	if (!memcg || memcg == root_mem_cgroup) {
+	/*
+	 * Untracked pages have no memcg, no lruvec. Update only the
+	 * node. If we reparent the slab objects to the root memcg,
+	 * when we free the slab object, we need to update the per-memcg
+	 * vmstats to keep it correct for the root memcg.
+	 */
+	if (!memcg) {
 		__mod_node_page_state(pgdat, idx, val);
 	} else {
 		lruvec = mem_cgroup_lruvec(pgdat, memcg);
From: Shakeel Butt <shakeelb@google.com>
mainline inclusion
from mainline-v5.11-rc5
commit 5c447d274f3746fbed6e695e7b9a2d7bd8b31b71
category: bugfix
bugzilla: 47675
CVE: NA
-------------------------------------------------
Currently the kernel is not correctly updating the numa stats for NR_FILE_PAGES and NR_SHMEM on THP migration. Fix that.
For NR_FILE_DIRTY and NR_ZONE_WRITE_PENDING there is currently no need to handle THP migration, since the kernel does not yet have write support for file THP, but to be more future-proof this patch adds THP support for those stats as well.
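The per-page delta used in the hunks below comes from hpage_nr_pages(), which in this kernel generation is roughly (paraphrased):

static inline int hpage_nr_pages(struct page *page)
{
	if (unlikely(PageTransHuge(page)))
		return HPAGE_PMD_NR;	/* e.g. 512 with 2MB THP and 4kB base pages */
	return 1;
}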
Link: https://lkml.kernel.org/r/20210108155813.2914586-2-shakeelb@google.com
Fixes: e71769ae52609 ("mm: enable thp migration for shmem thp")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/migrate.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)
diff --git a/mm/migrate.c b/mm/migrate.c
index 22b08aea06975..4a810183277ac 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -443,6 +443,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	int dirty;
 	int expected_count = 1 + extra_count;
 	void **pslot;
+	int nr = hpage_nr_pages(page);
 
 	/*
 	 * Device public or private pages have an extra refcount as they are
@@ -506,7 +507,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 */
 	newpage->index = page->index;
 	newpage->mapping = page->mapping;
-	page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
+	page_ref_add(newpage, nr); /* add cache reference */
 	if (PageSwapBacked(page)) {
 		__SetPageSwapBacked(newpage);
 		if (PageSwapCache(page)) {
@@ -529,7 +530,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 		int i;
 		int index = page_index(page);
 
-		for (i = 1; i < HPAGE_PMD_NR; i++) {
+		for (i = 1; i < nr; i++) {
 			pslot = radix_tree_lookup_slot(&mapping->i_pages,
 						       index + i);
 			radix_tree_replace_slot(&mapping->i_pages, pslot,
@@ -542,7 +543,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 * to one less reference.
 	 * We know this isn't the last reference.
 	 */
-	page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
+	page_ref_unfreeze(page, expected_count - nr);
 
 	xa_unlock(&mapping->i_pages);
 	/* Leave irq disabled to prevent preemption while updating stats */
@@ -558,17 +559,17 @@ int migrate_page_move_mapping(struct address_space *mapping,
 	 * are mapped to swap space.
 	 */
 	if (newzone != oldzone) {
-		__dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
-		__inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+		__mod_node_page_state(oldzone->zone_pgdat, NR_FILE_PAGES, -nr);
+		__mod_node_page_state(newzone->zone_pgdat, NR_FILE_PAGES, nr);
 		if (PageSwapBacked(page) && !PageSwapCache(page)) {
-			__dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
-			__inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+			__mod_node_page_state(oldzone->zone_pgdat, NR_SHMEM, -nr);
+			__mod_node_page_state(newzone->zone_pgdat, NR_SHMEM, nr);
 		}
 		if (dirty && mapping_cap_account_dirty(mapping)) {
-			__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
-			__dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
-			__inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
-			__inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+			__mod_node_page_state(oldzone->zone_pgdat, NR_FILE_DIRTY, -nr);
+			__mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+			__mod_node_page_state(newzone->zone_pgdat, NR_FILE_DIRTY, nr);
+			__mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
 		}
 	}
 	local_irq_enable();
From: Vlastimil Babka <vbabka@suse.cz>
mainline inclusion
from mainline-5.4-rc3
commit 6a486c0ad4dcdee3946842c64884d2978bfe2602
category: feature
bugzilla: 51349
CVE: NA
-------------------------------------------------
Patch series "guarantee natural alignment for kmalloc()", v2.
This patch (of 2):
SLOB currently doesn't account its pages at all, so in /proc/meminfo the Slab field shows zero. Modifying a counter on page allocation and freeing should be acceptable even for the small system scenarios SLOB is intended for. Since reclaimable caches are not separated in SLOB, account everything as unreclaimable.
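The effect is easy to observe from userspace; for example, a small reader of the Slab fields in /proc/meminfo (illustrative only, error handling trimmed):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* Before this patch, a SLOB kernel reports "Slab: 0 kB" here. */
		if (!strncmp(line, "Slab:", 5) || !strncmp(line, "SUnreclaim:", 11))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}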
SLUB currently doesn't account kmalloc() and kmalloc_node() allocations larger than an order-1 page, which are passed directly to the page allocator. As they also don't appear in /proc/slabinfo, this can look like a memory leak. For consistency, account them as well. (SLAB doesn't use the page allocator directly, so no change is needed there.)
Ideally SLOB and SLUB would be handled in separate patches, but due to the shared kmalloc_order() function and different kfree() implementations, it's easier to patch both at once to prevent inconsistencies.
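For context, the shared pass-through path both patches end up touching looks roughly like this (paraphrased from this kernel generation, not a verbatim copy):

/* include/linux/slab.h (roughly): sizes above KMALLOC_MAX_CACHE_SIZE never
 * reach a kmem_cache; kmalloc() hands them to kmalloc_large(), which ends
 * up in the shared kmalloc_order() below the slab layer. */
static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
{
	unsigned int order = get_order(size);

	return kmalloc_order_trace(size, flags, order);
}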
Link: http://lkml.kernel.org/r/20190826111627.7505-2-vbabka@suse.cz
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Ming Lei <ming.lei@redhat.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 6a486c0ad4dcdee3946842c64884d2978bfe2602)
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/slab_common.c |  8 ++++++--
 mm/slob.c        | 20 ++++++++++++++++----
 mm/slub.c        | 14 +++++++++++---
 3 files changed, 33 insertions(+), 9 deletions(-)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6b1cbf89a6861..321a9abed5d9d 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1252,12 +1252,16 @@ void __init create_kmalloc_caches(slab_flags_t flags)
  */
 void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 {
-	void *ret;
+	void *ret = NULL;
 	struct page *page;
 
 	flags |= __GFP_COMP;
 	page = alloc_pages(flags, order);
-	ret = page ? page_address(page) : NULL;
+	if (likely(page)) {
+		ret = page_address(page);
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+				    1 << order);
+	}
 	kmemleak_alloc(ret, size, 1, flags);
 	kasan_kmalloc_large(ret, size, flags);
 	return ret;
diff --git a/mm/slob.c b/mm/slob.c
index fdf284009be92..8165d90db1adc 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -190,7 +190,7 @@ static int slob_last(slob_t *s)
 
 static void *slob_new_pages(gfp_t gfp, int order, int node)
 {
-	void *page;
+	struct page *page;
 
 #ifdef CONFIG_NUMA
 	if (node != NUMA_NO_NODE)
@@ -202,14 +202,21 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
 	if (!page)
 		return NULL;
 
+	mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+			    1 << order);
 	return page_address(page);
 }
 
 static void slob_free_pages(void *b, int order)
 {
+	struct page *sp = virt_to_page(b);
+
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += 1 << order;
-	free_pages((unsigned long)b, order);
+
+	mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+			    -(1 << order));
+	__free_pages(sp, order);
 }
 
 /*
@@ -517,8 +524,13 @@ void kfree(const void *block)
 		int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
 		unsigned int *m = (unsigned int *)(block - align);
 		slob_free(m, *m + align);
-	} else
-		__free_pages(sp, compound_order(sp));
+	} else {
+		unsigned int order = compound_order(sp);
+		mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+				    -(1 << order));
+		__free_pages(sp, order);
+
+	}
 }
 EXPORT_SYMBOL(kfree);
 
diff --git a/mm/slub.c b/mm/slub.c
index af7343a744091..0d69d5b3ceefe 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3793,11 +3793,15 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 {
 	struct page *page;
 	void *ptr = NULL;
+	unsigned int order = get_order(size);
 
 	flags |= __GFP_COMP;
-	page = alloc_pages_node(node, flags, get_order(size));
-	if (page)
+	page = alloc_pages_node(node, flags, order);
+	if (page) {
 		ptr = page_address(page);
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+				    1 << order);
+	}
 
 	kmalloc_large_node_hook(ptr, size, flags);
 	return ptr;
@@ -3932,9 +3936,13 @@ void kfree(const void *x)
 
 	page = virt_to_head_page(x);
 	if (unlikely(!PageSlab(page))) {
+		unsigned int order = compound_order(page);
+
 		BUG_ON(!PageCompound(page));
 		kfree_hook(object);
-		__free_pages(page, compound_order(page));
+		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+				    -(1 << order));
+		__free_pages(page, order);
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
From: Muchun Song <songmuchun@bytedance.com>
mainline inclusion
from mainline-5.12-rc1
commit 96403bfe50c344b587ea53894954a9d152af1c9d
category: bugfix
bugzilla: 51349
CVE: NA
-------------------------------------------------
SLUB currently accounts kmalloc() and kmalloc_node() allocations larger than an order-1 page per node, but it forgets to update the per-memcg vmstats. This can lead to an inaccurate "slab_unreclaimable" statistic in memory.stat. Fix it by using mod_lruvec_page_state instead of mod_node_page_state.
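For context, the difference between the two helpers is roughly the following (paraphrased from this kernel generation, not a verbatim copy):

static inline void __mod_lruvec_page_state(struct page *page,
					   enum node_stat_item idx, int val)
{
	pg_data_t *pgdat = page_pgdat(page);
	struct lruvec *lruvec;

	/* Uncharged pages only bump the per-node counter, exactly what
	 * mod_node_page_state() did before... */
	if (!page->mem_cgroup) {
		__mod_node_page_state(pgdat, idx, val);
		return;
	}

	/* ...charged pages additionally update the per-memcg (lruvec)
	 * counters that memory.stat's slab_unreclaimable is built from. */
	lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup);
	__mod_lruvec_state(lruvec, idx, val);
}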
Link: https://lkml.kernel.org/r/20210223092423.42420-1-songmuchun@bytedance.com
Fixes: 6a486c0ad4dc ("mm, sl[ou]b: improve memory accounting")
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
(cherry picked from commit 96403bfe50c344b587ea53894954a9d152af1c9d)
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/slab_common.c | 4 ++--
 mm/slub.c        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 321a9abed5d9d..b8b0df81bece3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1259,8 +1259,8 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
 	page = alloc_pages(flags, order);
 	if (likely(page)) {
 		ret = page_address(page);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    1 << order);
+		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE,
+				      PAGE_SIZE << order);
 	}
 	kmemleak_alloc(ret, size, 1, flags);
 	kasan_kmalloc_large(ret, size, flags);
diff --git a/mm/slub.c b/mm/slub.c
index 0d69d5b3ceefe..12f23ceab1177 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3799,8 +3799,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	page = alloc_pages_node(node, flags, order);
 	if (page) {
 		ptr = page_address(page);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    1 << order);
+		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE,
+				      PAGE_SIZE << order);
 	}
 
 	kmalloc_large_node_hook(ptr, size, flags);
@@ -3940,8 +3940,8 @@ void kfree(const void *x)
 
 		BUG_ON(!PageCompound(page));
 		kfree_hook(object);
-		mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
-				    -(1 << order));
+		mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE,
+				      -(PAGE_SIZE << order));
 		__free_pages(page, order);
 		return;
 	}