From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 46357
CVE: NA
---------------------------
test procedure:
a. create 20000 cgroups, and echo "8:0 10000" to
   blkio.throttle.write_bps_device
b. echo 1 > /sys/block/sda/device/delete
test result:
rcu: INFO: rcu_sched detected stalls on CPUs/tasks: [5/1143]
rcu:    0-...0: (0 ticks this GP) idle=0f2/1/0x4000000000000000 softirq=15507/15507 fq
rcu:    (detected by 6, t=60012 jiffies, g=119977, q=27153)
NMI backtrace for cpu 0
CPU: 0 PID: 443 Comm: bash Not tainted 4.19.95-00061-g0bcc83b30eec #63
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-p4
RIP: 0010:blk_throtl_update_limit_valid.isra.0+0x116/0x2a0
Code: 01 00 00 e8 7c dd 74 ff 48 83 bb 78 01 00 00 00 0f 85 54 01 00 00 48 8d bb 88 01 1
RSP: 0018:ffff8881030bf9f0 EFLAGS: 00000046
RAX: 0000000000000000 RBX: ffff8880b4f37080 RCX: ffffffff95da0afe
RDX: dffffc0000000000 RSI: ffff888100373980 RDI: ffff8880b4f37208
RBP: ffff888100deca00 R08: ffffffff9528f951 R09: 0000000000000001
R10: ffffed10159dbf56 R11: ffff8880acedfab3 R12: ffff8880b9fda498
R13: ffff8880b9fda4f4 R14: 0000000000000050 R15: ffffffff98b622c0
FS:  00007feb51c51700(0000) GS:ffff888106200000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000561619547080 CR3: 0000000102bc9000 CR4: 00000000000006f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 throtl_pd_offline+0x98/0x100
 blkg_destroy+0x133/0x630
 ? blkcg_deactivate_policy+0x2c0/0x2c0
 ? lock_timer_base+0x65/0x110
 blkg_destroy_all+0x7f/0x100
 blkcg_exit_queue+0x3f/0xa5
 blk_exit_queue+0x69/0xa0
 blk_cleanup_queue+0x226/0x360
 __scsi_remove_device+0xb4/0x3c0
 scsi_remove_device+0x38/0x60
 sdev_store_delete+0x74/0x100
 ? dev_driver_string+0xb0/0xb0
 dev_attr_store+0x41/0x70
 sysfs_kf_write+0x89/0xc0
 kernfs_fop_write+0x1b6/0x2e0
 ? sysfs_kf_bin_read+0x130/0x130
 __vfs_write+0xca/0x420
 ? kernel_read+0xc0/0xc0
 ? __alloc_fd+0x16f/0x2d0
 ? __fd_install+0x95/0x1a0
 ? handle_mm_fault+0x3e0/0x560
 vfs_write+0x11a/0x2f0
 ksys_write+0xb9/0x1e0
 ? __x64_sys_read+0x60/0x60
 ? kasan_check_write+0x20/0x30
 ? filp_close+0xb5/0xf0
 __x64_sys_write+0x46/0x60
 do_syscall_64+0xd9/0x1f0
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Using this many blkgs is very rare; however, the problem does exist in theory. In order to avoid such stalls, release 'q->queue_lock' for a while after each batch of blkgs is destroyed.
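The shape of the fix, as a minimal sketch with placeholder names (the real change is in the diff below): destroy entries in batches, and between batches drop the lock and reschedule; since the list can change while the lock is dropped, the scan restarts from the head.

    spin_lock_irq(lock);
again:
    count = BATCH;    /* e.g. 4096 */
    list_for_each_entry_safe(item, next, head, node) {
        destroy(item);
        if (!--count) {
            /* let other CPUs (and RCU) make progress */
            spin_unlock_irq(lock);
            cond_resched();
            spin_lock_irq(lock);
            /* the list may have changed: rescan from the start */
            goto again;
        }
    }
    spin_unlock_irq(lock);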
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Tao Hou <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 block/blk-cgroup.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index e592167449aa..c64f0afa27dc 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -364,16 +364,31 @@ static void blkg_destroy(struct blkcg_gq *blkg)
  */
 static void blkg_destroy_all(struct request_queue *q)
 {
+#define BLKG_DESTROY_BATCH 4096
 	struct blkcg_gq *blkg, *n;
+	int count;
 
 	lockdep_assert_held(q->queue_lock);
 
+again:
+	count = BLKG_DESTROY_BATCH;
 	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
 		struct blkcg *blkcg = blkg->blkcg;
 
 		spin_lock(&blkcg->lock);
 		blkg_destroy(blkg);
 		spin_unlock(&blkcg->lock);
+		/*
+		 * If the list is too long, the loop can take a long time,
+		 * thus release the lock for a while when a batch of blkgs
+		 * have been destroyed.
+		 */
+		if (!--count) {
+			spin_unlock_irq(q->queue_lock);
+			cond_resched();
+			spin_lock_irq(q->queue_lock);
+			goto again;
+		}
 	}
 
 	q->root_blkg = NULL;
From: Yu Kuai <yukuai3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: 46357
CVE: NA
---------------------------
blk_throtl_update_limit_valid() searches all descendants to check whether 'LIMIT_LOW' of bps/iops for READ/WRITE is nonzero. However, these limits are always zero when CONFIG_BLK_DEV_THROTTLING_LOW is not set, so iterating the descendants only wastes time.

Thus make blk_throtl_update_limit_valid() do nothing in that situation.
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Tao Hou <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 block/blk-throttle.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index caee658609d7..c7b4f905feb5 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -568,6 +568,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
 	tg_update_has_rules(tg);
 }
 
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
 static void blk_throtl_update_limit_valid(struct throtl_data *td)
 {
 	struct cgroup_subsys_state *pos_css;
@@ -588,6 +589,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
 	td->limit_valid[LIMIT_LOW] = low_valid;
 }
+#else
+static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+}
+#endif
 
 static void throtl_upgrade_state(struct throtl_data *td);
 
 static void throtl_pd_offline(struct blkg_policy_data *pd)
From: Kirill Tkhai <ktkhai@virtuozzo.com>

mainline inclusion
from mainline-v5.2-rc1
commit 9851ac13592df77958ae7bac6ba39e71420c38ec
category: bugfix
bugzilla: 36232
CVE: NA
-------------------------------------------------
We know which LRU is not active.
[chris@chrisdown.name: fix build on !CONFIG_MEMCG]
  Link: http://lkml.kernel.org/r/20190322150513.GA22021@chrisdown.name
Link: http://lkml.kernel.org/r/155290128498.31489.18250485448913338607.stgit@local...
Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
Signed-off-by: Chris Down <chris@chrisdown.name>
Reviewed-by: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 include/linux/memcontrol.h |  6 ++++++
 mm/vmscan.c                | 10 ++++------
 2 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 24068566ada2..0069cd618b1e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -1100,6 +1100,12 @@ static inline void count_memcg_events(struct mem_cgroup *memcg,
 {
 }
 
+static inline void __count_memcg_events(struct mem_cgroup *memcg,
+					enum vm_event_item idx,
+					unsigned long count)
+{
+}
+
 static inline void count_memcg_page_event(struct page *page,
 					  int idx)
 {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ac6f4cbef5ea..01abd3f43a02 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2076,12 +2076,6 @@ static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
 		}
 	}
 
-	if (!is_active_lru(lru)) {
-		__count_vm_events(PGDEACTIVATE, nr_moved);
-		count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
-				   nr_moved);
-	}
-
 	return nr_moved;
 }
 
@@ -2176,6 +2170,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
 	nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+
+	__count_vm_events(PGDEACTIVATE, nr_deactivate);
+	__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&pgdat->lru_lock);
 
 	mem_cgroup_uncharge_list(&l_hold);
From: Shakeel Butt <shakeelb@google.com>

mainline inclusion
from mainline-v5.8-rc1
commit 5d91f31faf8ebed2acfc3a1d6ac344f95c488d66
category: bugfix
bugzilla: 36232
CVE: NA
-------------------------------------------------
Many of the callbacks called by pagevec_lru_move_fn() do not correctly update the vmstats for huge pages. Fix that. Also make __pagevec_lru_add_fn() use the irq-unsafe alternative to update the stat, as irqs are already disabled there.
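For context, the per-subpage counting relies on hpage_nr_pages(); in v4.19 that helper is essentially the following (shown as a sketch for the reader, not part of this patch):

    static inline int hpage_nr_pages(struct page *page)
    {
        if (unlikely(PageTransHuge(page)))
            return HPAGE_PMD_NR;    /* 512 with 4K base pages and 2M THPs */
        return 1;
    }

so counting a single event per THP under-counts by a factor of HPAGE_PMD_NR.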
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: http://lkml.kernel.org/r/20200527182916.249910-1-shakeelb@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/swap.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
diff --git a/mm/swap.c b/mm/swap.c
index 320ac35bde26..a819f8746ee2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -224,7 +224,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
 		del_page_from_lru_list(page, lruvec, page_lru(page));
 		ClearPageActive(page);
 		add_page_to_lru_list_tail(page, lruvec, page_lru(page));
-		(*pgmoved)++;
+		(*pgmoved) += hpage_nr_pages(page);
 	}
 }
 
@@ -284,7 +284,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
 		add_page_to_lru_list(page, lruvec, lru);
 		trace_mm_lru_activate(page);
 
-		__count_vm_event(PGACTIVATE);
+		__count_vm_events(PGACTIVATE, hpage_nr_pages(page));
 		update_page_reclaim_stat(lruvec, file, 1);
 	}
 }
@@ -499,6 +499,7 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 {
 	int lru, file;
 	bool active;
+	int nr_pages = hpage_nr_pages(page);
 
 	if (!PageLRU(page))
 		return;
@@ -532,11 +533,11 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 		 * We moves tha page into tail of inactive.
 		 */
 		list_move_tail(&page->lru, &lruvec->lists[lru]);
-		__count_vm_event(PGROTATED);
+		__count_vm_events(PGROTATED, nr_pages);
 	}
 
 	if (active)
-		__count_vm_event(PGDEACTIVATE);
+		__count_vm_events(PGDEACTIVATE, nr_pages);
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
@@ -870,6 +871,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 {
 	enum lru_list lru;
 	int was_unevictable = TestClearPageUnevictable(page);
+	int nr_pages = hpage_nr_pages(page);
 
 	VM_BUG_ON_PAGE(PageLRU(page), page);
 
@@ -907,13 +909,13 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 		update_page_reclaim_stat(lruvec, page_is_file_cache(page),
 					 PageActive(page));
 		if (was_unevictable)
-			count_vm_event(UNEVICTABLE_PGRESCUED);
+			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
 	} else {
 		lru = LRU_UNEVICTABLE;
 		ClearPageActive(page);
 		SetPageUnevictable(page);
 		if (!was_unevictable)
-			count_vm_event(UNEVICTABLE_PGCULLED);
+			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
 	}
 
 	add_page_to_lru_list(page, lruvec, lru);
From: Shakeel Butt <shakeelb@google.com>

mainline inclusion
from mainline-v5.8-rc1
commit 21e330fc632d6a288f73de48045b782cc51d501a
category: bugfix
bugzilla: 36232
CVE: NA
-------------------------------------------------
Commit 2262185c5b28 ("mm: per-cgroup memory reclaim stats") added PGLAZYFREE, PGACTIVATE and PGDEACTIVATE stats for cgroups, but missed a couple of places, and PGLAZYFREE missed huge page handling. Fix that. Also, for PGLAZYFREE, use the irq-unsafe function to update the stat, as irqs are already disabled.
Fixes: 2262185c5b28 ("mm: per-cgroup memory reclaim stats")
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: http://lkml.kernel.org/r/20200527182947.251343-1-shakeelb@google.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/swap.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)
diff --git a/mm/swap.c b/mm/swap.c
index a819f8746ee2..0156aa4d772c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -277,6 +277,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
 	if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 		int file = page_is_file_cache(page);
 		int lru = page_lru_base_type(page);
+		int nr_pages = hpage_nr_pages(page);
 
 		del_page_from_lru_list(page, lruvec, lru);
 		SetPageActive(page);
@@ -284,7 +285,9 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
 		add_page_to_lru_list(page, lruvec, lru);
 		trace_mm_lru_activate(page);
 
-		__count_vm_events(PGACTIVATE, hpage_nr_pages(page));
+		__count_vm_events(PGACTIVATE, nr_pages);
+		__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
+				     nr_pages);
 		update_page_reclaim_stat(lruvec, file, 1);
 	}
 }
@@ -536,18 +539,21 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 		__count_vm_events(PGROTATED, nr_pages);
 	}
 
-	if (active)
+	if (active) {
 		__count_vm_events(PGDEACTIVATE, nr_pages);
+		__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+				     nr_pages);
+	}
 	update_page_reclaim_stat(lruvec, file, 0);
 }
 
-
 static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 			    void *arg)
 {
 	if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
 	    !PageSwapCache(page) && !PageUnevictable(page)) {
 		bool active = PageActive(page);
+		int nr_pages = hpage_nr_pages(page);
 
 		del_page_from_lru_list(page, lruvec,
 				       LRU_INACTIVE_ANON + active);
@@ -561,8 +567,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 		ClearPageSwapBacked(page);
 		add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
 
-		__count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
-		count_memcg_page_event(page, PGLAZYFREE);
+		__count_vm_events(PGLAZYFREE, nr_pages);
+		__count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
+				     nr_pages);
 		update_page_reclaim_stat(lruvec, 1, 0);
 	}
 }
From: Hugh Dickins <hughd@google.com>

mainline inclusion
from mainline-v5.9-rc6
commit 0964730bf46b4e271c5ecad5badbbd95737c087b
category: bugfix
bugzilla: 36232
CVE: NA
-------------------------------------------------
5.8 commit 5d91f31faf8e ("mm: swap: fix vmstats for huge page") has established that vm_events should count every subpage of a THP, including unevictable_pgs_culled and unevictable_pgs_rescued; but lru_cache_add_inactive_or_unevictable() was not doing so for unevictable_pgs_mlocked, and mm/mlock.c was not doing so for unevictable_pgs mlocked, munlocked, cleared and stranded.
Fix them; but THPs don't go the pagevec way in mlock.c, so no fixes needed on that path.
Fixes: 5d91f31faf8e ("mm: swap: fix vmstats for huge page")
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Cc: Alex Shi <alex.shi@linux.alibaba.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Qian Cai <cai@lca.pw>
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301408230.5954@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/mlock.c | 25 +++++++++++++++----------
 mm/swap.c  |  6 +++---
 2 files changed, 18 insertions(+), 13 deletions(-)
diff --git a/mm/mlock.c b/mm/mlock.c
index 8112a2200d3e..4077b1e8e199 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -58,12 +58,14 @@ EXPORT_SYMBOL(can_do_mlock);
  */
 void clear_page_mlock(struct page *page)
 {
+	int nr_pages;
+
 	if (!TestClearPageMlocked(page))
 		return;
 
-	mod_zone_page_state(page_zone(page), NR_MLOCK,
-			    -hpage_nr_pages(page));
-	count_vm_event(UNEVICTABLE_PGCLEARED);
+	nr_pages = hpage_nr_pages(page);
+	mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+	count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
 	/*
 	 * The previous TestClearPageMlocked() corresponds to the smp_mb()
 	 * in __pagevec_lru_add_fn().
@@ -77,7 +79,7 @@ void clear_page_mlock(struct page *page)
 		 * We lost the race. the page already moved to evictable list.
 		 */
 		if (PageUnevictable(page))
-			count_vm_event(UNEVICTABLE_PGSTRANDED);
+			count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
 	}
 }
 
@@ -94,9 +96,10 @@ void mlock_vma_page(struct page *page)
 	VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
 
 	if (!TestSetPageMlocked(page)) {
-		mod_zone_page_state(page_zone(page), NR_MLOCK,
-				    hpage_nr_pages(page));
-		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		int nr_pages = hpage_nr_pages(page);
+
+		mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 		if (!isolate_lru_page(page))
 			putback_lru_page(page);
 	}
@@ -139,7 +142,7 @@ static void __munlock_isolated_page(struct page *page)
 
 	/* Did try_to_unlock() succeed or punt? */
 	if (!PageMlocked(page))
-		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		count_vm_events(UNEVICTABLE_PGMUNLOCKED, hpage_nr_pages(page));
 
 	putback_lru_page(page);
 }
@@ -155,10 +158,12 @@ static void __munlock_isolated_page(struct page *page)
  */
 static void __munlock_isolation_failed(struct page *page)
 {
+	int nr_pages = hpage_nr_pages(page);
+
 	if (PageUnevictable(page))
-		__count_vm_event(UNEVICTABLE_PGSTRANDED);
+		__count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
 	else
-		__count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		__count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
 }
 
 /**
diff --git a/mm/swap.c b/mm/swap.c
index 0156aa4d772c..bdb9b294afbf 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -464,14 +464,14 @@ void lru_cache_add_active_or_unevictable(struct page *page,
 	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
 		SetPageActive(page);
 	else if (!TestSetPageMlocked(page)) {
+		int nr_pages = hpage_nr_pages(page);
 		/*
 		 * We use the irq-unsafe __mod_zone_page_stat because this
 		 * counter is not modified from interrupt context, and the pte
 		 * lock is held(spinlock), which implies preemption disabled.
 		 */
-		__mod_zone_page_state(page_zone(page), NR_MLOCK,
-				    hpage_nr_pages(page));
-		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		__mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+		count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 	}
 	lru_cache_add(page);
 }
From: Hugh Dickins <hughd@google.com>

mainline inclusion
from mainline-v5.9-rc6
commit 8d8869ca5d2d9d86db96271ab063fdcfa9baf5b4
category: bugfix
bugzilla: 36232
CVE: NA
-------------------------------------------------
check_move_unevictable_pages() is used in making unevictable shmem pages evictable: by shmem_unlock_mapping(), drm_gem_check_release_pagevec() and i915/gem check_release_pagevec(). Those may pass down subpages of a huge page, when /sys/kernel/mm/transparent_hugepage/shmem_enabled is "force".
That does not crash or warn at present, but the accounting of vmstats unevictable_pgs_scanned and unevictable_pgs_rescued is inconsistent: scanned being incremented on each subpage, rescued only on the head (since tails already appear evictable once the head has been updated).
5.8 commit 5d91f31faf8e ("mm: swap: fix vmstats for huge page") has established that vm_events in general (and unevictable_pgs_rescued in particular) should count every subpage: so follow that precedent here.
Do this in such a way that if mem_cgroup_page_lruvec() is made stricter (to check page->mem_cgroup is always set), no problem: skip the tails before calling it, and add thp_nr_pages() to vmstats on the head.
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Yang Shi <shy828301@gmail.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Qian Cai <cai@lca.pw>
Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2008301405000.5954@eggly.anvils
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/vmscan.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 01abd3f43a02..f49c11a6cff3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4364,8 +4364,14 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page = pages[i];
 		struct pglist_data *pagepgdat = page_pgdat(page);
+		int _nr_pages;
+
+		if (PageTransTail(page))
+			continue;
+
+		_nr_pages = hpage_nr_pages(page);
+		pgscanned += _nr_pages;
 
-		pgscanned++;
 		if (pagepgdat != pgdat) {
 			if (pgdat)
 				spin_unlock_irq(&pgdat->lru_lock);
@@ -4384,7 +4390,7 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 			ClearPageUnevictable(page);
 			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
 			add_page_to_lru_list(page, lruvec, lru);
-			pgrescued++;
+			pgrescued += _nr_pages;
 		}
 	}
From: Huang Ying <ying.huang@intel.com>

mainline inclusion
from mainline-v5.10-rc1
commit c4f9c701f9b44299e6adbc58d1a4bb2c40383494
category: bugfix
bugzilla: 44801
CVE: NA
-------------------------------------------------
It is reported that the following bug is triggered if an HDD is used as the swap device:
[ 5758.157556] BUG: kernel NULL pointer dereference, address: 0000000000000007
[ 5758.165331] #PF: supervisor write access in kernel mode
[ 5758.171161] #PF: error_code(0x0002) - not-present page
[ 5758.176894] PGD 0 P4D 0
[ 5758.179721] Oops: 0002 [#1] SMP PTI
[ 5758.183614] CPU: 10 PID: 316 Comm: kswapd1 Kdump: loaded Tainted: G S --------- --- 5.9.0-0.rc3.1.tst.el8.x86_64 #1
[ 5758.196717] Hardware name: Intel Corporation S2600CP/S2600CP, BIOS SE5C600.86B.02.01.0002.082220131453 08/22/2013
[ 5758.208176] RIP: 0010:split_swap_cluster+0x47/0x60
[ 5758.213522] Code: c1 e3 06 48 c1 eb 0f 48 8d 1c d8 48 89 df e8 d0 20 6a 00 80 63 07 fb 48 85 db 74 16 48 89 df c6 07 00 66 66 66 90 31 c0 5b c3 <80> 24 25 07 00 00 00 fb 31 c0 5b c3 b8 f0 ff ff ff 5b c3 66 0f 1f
[ 5758.234478] RSP: 0018:ffffb147442d7af0 EFLAGS: 00010246
[ 5758.240309] RAX: 0000000000000000 RBX: 000000000014b217 RCX: ffffb14779fd9000
[ 5758.248281] RDX: 000000000014b217 RSI: ffff9c52f2ab1400 RDI: 000000000014b217
[ 5758.256246] RBP: ffffe00c51168080 R08: ffffe00c5116fe08 R09: ffff9c52fffd3000
[ 5758.264208] R10: ffffe00c511537c8 R11: ffff9c52fffd3c90 R12: 0000000000000000
[ 5758.272172] R13: ffffe00c51170000 R14: ffffe00c51170000 R15: ffffe00c51168040
[ 5758.280134] FS:  0000000000000000(0000) GS:ffff9c52f2a80000(0000) knlGS:0000000000000000
[ 5758.289163] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 5758.295575] CR2: 0000000000000007 CR3: 0000000022a0e003 CR4: 00000000000606e0
[ 5758.303538] Call Trace:
[ 5758.306273]  split_huge_page_to_list+0x88b/0x950
[ 5758.311433]  deferred_split_scan+0x1ca/0x310
[ 5758.316202]  do_shrink_slab+0x12c/0x2a0
[ 5758.320491]  shrink_slab+0x20f/0x2c0
[ 5758.324482]  shrink_node+0x240/0x6c0
[ 5758.328469]  balance_pgdat+0x2d1/0x550
[ 5758.332652]  kswapd+0x201/0x3c0
[ 5758.336157]  ? finish_wait+0x80/0x80
[ 5758.340147]  ? balance_pgdat+0x550/0x550
[ 5758.344525]  kthread+0x114/0x130
[ 5758.348126]  ? kthread_park+0x80/0x80
[ 5758.352214]  ret_from_fork+0x22/0x30
[ 5758.356203] Modules linked in: fuse zram rfkill sunrpc intel_rapl_msr intel_rapl_common sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp mgag200 iTCO_wdt crct10dif_pclmul iTCO_vendor_support drm_kms_helper crc32_pclmul ghash_clmulni_intel syscopyarea sysfillrect sysimgblt fb_sys_fops cec rapl joydev intel_cstate ipmi_si ipmi_devintf drm intel_uncore i2c_i801 ipmi_msghandler pcspkr lpc_ich mei_me i2c_smbus mei ioatdma ip_tables xfs libcrc32c sr_mod sd_mod cdrom t10_pi sg igb ahci libahci i2c_algo_bit crc32c_intel libata dca wmi dm_mirror dm_region_hash dm_log dm_mod
[ 5758.412673] CR2: 0000000000000007
[    0.000000] Linux version 5.9.0-0.rc3.1.tst.el8.x86_64 (mockbuild@x86-vm-15.build.eng.bos.redhat.com) (gcc (GCC) 8.3.1 20191121 (Red Hat 8.3.1-5), GNU ld version 2.30-79.el8) #1 SMP Wed Sep 9 16:03:34 EDT 2020
After further digging it's found that the following race condition exists in the original implementation,
CPU1                                         CPU2
----                                         ----
deferred_split_scan()
  split_huge_page(page) /* page isn't compound head */
    split_huge_page_to_list(page, NULL)
      __split_huge_page(page, )
        ClearPageCompound(head)
        /* unlock all subpages except page (not head) */
                                             add_to_swap(head) /* not THP */
                                               get_swap_page(head)
                                               add_to_swap_cache(head, )
                                                 SetPageSwapCache(head)
        if PageSwapCache(head)
          split_swap_cluster(/* swap entry of head */)
          /* Deref sis->cluster_info: NULL accessing! */
So, in split_huge_page_to_list(), PageSwapCache() is called on the already split and unlocked "head", which may have been added to the swap cache on another CPU, and split_swap_cluster() may be called wrongly.

To fix the race, the call to split_swap_cluster() is moved into __split_huge_page(), before any subpage is unlocked, so that the result of PageSwapCache() is stable.
Fixes: 59807685a7e77 ("mm, THP, swap: support splitting THP for THP swap out")
Reported-by: Rafael Aquini <aquini@redhat.com>
Signed-off-by: "Huang, Ying" <ying.huang@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Tested-by: Rafael Aquini <aquini@redhat.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Link: https://lkml.kernel.org/r/20201009073647.1531083-1-ying.huang@intel.com
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/huge_memory.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6c4bda24c9b2..ffc4470e65f0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2549,6 +2549,12 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 
 	remap_page(head);
 
+	if (PageSwapCache(head)) {
+		swp_entry_t entry = { .val = page_private(head) };
+
+		split_swap_cluster(entry);
+	}
+
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
 		struct page *subpage = head + i;
 		if (subpage == page)
@@ -2786,12 +2792,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 			__dec_node_page_state(page, NR_SHMEM_THPS);
 		spin_unlock(&pgdata->split_queue_lock);
 		__split_huge_page(page, list, end, flags);
-		if (PageSwapCache(head)) {
-			swp_entry_t entry = { .val = page_private(head) };
-
-			ret = split_swap_cluster(entry);
-		} else
-			ret = 0;
+		ret = 0;
 	} else {
 		if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
 			pr_alert("total_mapcount: %u, page_count(): %u\n",
From: Naoya Horiguchi <naoya.horiguchi@nec.com>

mainline inclusion
from mainline-v5.10-rc1
commit 1f2481ddbe444de5bed72f167d7180d1b2708e56
category: bugfix
bugzilla: 44803
CVE: NA
-------------------------------------------------
Soft offlining could fail with EIO due to a race with hugepage migration. This issue became visible due to the change in the previous patch, which makes the soft offline handler take a page refcount of its own. We have no way to directly pin a zero-refcount page, and a page that looks like it has zero refcount could be allocated just after the first check.

This patch adds a second check to catch the race and gives us a chance to handle it more reliably.
Reported-by: Qian Cai <cai@lca.pw>
Signed-off-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Cc: Aristeu Rozanski <aris@ruivo.org>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Dmitry Yakunin <zeil@yandex-team.ru>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: https://lkml.kernel.org/r/20200922135650.1634-14-osalvador@suse.de
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 mm/memory-failure.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 148fdd929a19..fa091bfb5943 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1646,6 +1646,9 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
 		} else if (is_free_buddy_page(p)) {
 			pr_info("%s: %#lx free buddy page\n", __func__, pfn);
 			ret = 0;
+		} else if (page_count(p)) {
+			/* raced with allocation */
+			ret = -EBUSY;
 		} else {
 			pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
 				__func__, pfn, p->flags);
@@ -1662,6 +1665,9 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
 {
 	int ret = __get_any_page(page, pfn, flags);
 
+	if (ret == -EBUSY)
+		ret = __get_any_page(page, pfn, flags);
+
 	if (ret == 1 && !PageHuge(page) &&
 	    !PageLRU(page) && !__PageMovable(page)) {
 		/*
From: "Matthew Wilcox (Oracle)" willy@infradead.org
mainline inclusion from mainline-v5.10-rc1 commit e320d3012d25b1fb5f3df4edb7bd44a1c362ec10 category: bugfix bugzilla: 43588 CVE: NA
-------------------------------------------------
Here is a very rare race which leaks memory:
Page P0 is allocated to the page cache. Page P1 is free.
Thread A                        Thread B                        Thread C
find_get_entry():
  xas_load() returns P0
                                Removes P0 from page cache
                                P0 finds its buddy P1
                                                                alloc_pages(GFP_KERNEL, 1)
                                                                  returns P0
                                                                P0 has refcount 1
page_cache_get_speculative(P0)
  P0 has refcount 2
                                                                __free_pages(P0)
                                                                  P0 has refcount 1
put_page(P0)
  P1 is not freed
Fix this by freeing all the pages in __free_pages() that won't be freed by the call to put_page(). It's usually not a good idea to split a page, but this is a very unlikely scenario.
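Concretely, for an order-3 block whose head is still pinned by a speculative reference, the added loop in __free_pages() (see the diff below) frees every buddy half except the head page:

    /* put_page_testzero() failed: someone else holds a reference */
    /* order == 3 initially */
    while (order-- > 0)
        free_the_page(page + (1 << order), order);
    /*
     * order == 2: frees pages 4..7 as one order-2 block
     * order == 1: frees pages 2..3 as one order-1 block
     * order == 0: frees page 1 as a single page
     * page 0 (the referenced one) is freed by the holder's put_page()
     */

Compound (__GFP_COMP) allocations are skipped via the !PageHead() check, since put_page() on a compound page already frees the whole block.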
Fixes: e286781d5f2e ("mm: speculative page references")
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Cc: Nick Piggin <npiggin@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20200926213919.26642-1-willy@infradead.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 lib/Kconfig.debug     |  9 +++++++++
 lib/Makefile          |  1 +
 lib/test_free_pages.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 mm/page_alloc.c       |  3 +++
 4 files changed, 55 insertions(+)
 create mode 100644 lib/test_free_pages.c
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 47dca144782b..0ee305de7d0e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1984,6 +1984,15 @@ config TEST_DEBUG_VIRTUAL
 
 	  If unsure, say N.
 
+config TEST_FREE_PAGES
+	tristate "Test freeing pages"
+	help
+	  Test that a memory leak does not occur due to a race between
+	  freeing a block of pages and a speculative page reference.
+	  Loading this module is safe if your kernel has the bug fixed.
+	  If the bug is not fixed, it will leak gigabytes of memory and
+	  probably OOM your system.
+
 endif # RUNTIME_TESTING_MENU
 
 config MEMTEST
diff --git a/lib/Makefile b/lib/Makefile
index 0ab808318202..73abfe59558f 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_TEST_UUID) += test_uuid.o
 obj-$(CONFIG_TEST_PARMAN) += test_parman.o
 obj-$(CONFIG_TEST_KMOD) += test_kmod.o
 obj-$(CONFIG_TEST_DEBUG_VIRTUAL) += test_debug_virtual.o
+obj-$(CONFIG_TEST_FREE_PAGES) += test_free_pages.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
diff --git a/lib/test_free_pages.c b/lib/test_free_pages.c
new file mode 100644
index 000000000000..074e76bd76b2
--- /dev/null
+++ b/lib/test_free_pages.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * test_free_pages.c: Check that free_pages() doesn't leak memory
+ * Copyright (c) 2020 Oracle
+ * Author: Matthew Wilcox <willy@infradead.org>
+ */
+
+#include <linux/gfp.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+
+static void test_free_pages(gfp_t gfp)
+{
+	unsigned int i;
+
+	for (i = 0; i < 1000 * 1000; i++) {
+		unsigned long addr = __get_free_pages(gfp, 3);
+		struct page *page = virt_to_page(addr);
+
+		/* Simulate page cache getting a speculative reference */
+		get_page(page);
+		free_pages(addr, 3);
+		put_page(page);
+	}
+}
+
+static int m_in(void)
+{
+	test_free_pages(GFP_KERNEL);
+	test_free_pages(GFP_KERNEL | __GFP_COMP);
+
+	return 0;
+}
+
+static void m_ex(void)
+{
+}
+
+module_init(m_in);
+module_exit(m_ex);
+MODULE_AUTHOR("Matthew Wilcox <willy@infradead.org>");
+MODULE_LICENSE("GPL");
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 869074294cce..504c9699b1af 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4561,6 +4561,9 @@ void __free_pages(struct page *page, unsigned int order)
 {
 	if (put_page_testzero(page))
 		free_the_page(page, order);
+	else if (!PageHead(page))
+		while (order-- > 0)
+			free_the_page(page + (1 << order), order);
 }
 EXPORT_SYMBOL(__free_pages);
From: Liu Shixin <liushixin2@huawei.com>

hulk inclusion
category: feature
bugzilla: 43588
CVE: NA
-------------------------------------------------
Set a default value for CONFIG_TEST_FREE_PAGES in the defconfigs.
Signed-off-by: Liu Shixin <liushixin2@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 arch/arm64/configs/euleros_defconfig   | 1 +
 arch/arm64/configs/hulk_defconfig      | 1 +
 arch/arm64/configs/openeuler_defconfig | 1 +
 arch/x86/configs/hulk_defconfig        | 1 +
 arch/x86/configs/openeuler_defconfig   | 1 +
 5 files changed, 5 insertions(+)
diff --git a/arch/arm64/configs/euleros_defconfig b/arch/arm64/configs/euleros_defconfig
index b26910444f52..122be650462a 100644
--- a/arch/arm64/configs/euleros_defconfig
+++ b/arch/arm64/configs/euleros_defconfig
@@ -5629,6 +5629,7 @@ CONFIG_TEST_KSTRTOX=y
 # CONFIG_TEST_UDELAY is not set
 # CONFIG_TEST_STATIC_KEYS is not set
 # CONFIG_TEST_KMOD is not set
+# CONFIG_TEST_FREE_PAGES is not set
 # CONFIG_MEMTEST is not set
 # CONFIG_BUG_ON_DATA_CORRUPTION is not set
 # CONFIG_SAMPLES is not set
diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig
index 090ff6c93730..b85d790d630b 100644
--- a/arch/arm64/configs/hulk_defconfig
+++ b/arch/arm64/configs/hulk_defconfig
@@ -5634,6 +5634,7 @@ CONFIG_TEST_KSTRTOX=y
 # CONFIG_TEST_UDELAY is not set
 # CONFIG_TEST_STATIC_KEYS is not set
 # CONFIG_TEST_KMOD is not set
+# CONFIG_TEST_FREE_PAGES is not set
 # CONFIG_MEMTEST is not set
 # CONFIG_BUG_ON_DATA_CORRUPTION is not set
 # CONFIG_SAMPLES is not set
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 5561751ad6eb..e1f2135f2b5b 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -5941,6 +5941,7 @@ CONFIG_TEST_KSTRTOX=y
 # CONFIG_TEST_UDELAY is not set
 # CONFIG_TEST_STATIC_KEYS is not set
 # CONFIG_TEST_KMOD is not set
+# CONFIG_TEST_FREE_PAGES is not set
 # CONFIG_MEMTEST is not set
 # CONFIG_BUG_ON_DATA_CORRUPTION is not set
 # CONFIG_SAMPLES is not set
diff --git a/arch/x86/configs/hulk_defconfig b/arch/x86/configs/hulk_defconfig
index 4079cef76f2c..c7c7d8776c2a 100644
--- a/arch/x86/configs/hulk_defconfig
+++ b/arch/x86/configs/hulk_defconfig
@@ -7485,6 +7485,7 @@ CONFIG_TEST_KSTRTOX=y
 # CONFIG_TEST_UDELAY is not set
 # CONFIG_TEST_STATIC_KEYS is not set
 # CONFIG_TEST_KMOD is not set
+# CONFIG_TEST_FREE_PAGES is not set
 # CONFIG_MEMTEST is not set
 # CONFIG_BUG_ON_DATA_CORRUPTION is not set
 # CONFIG_SAMPLES is not set
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index bb1bcb8de1c5..a52e5932dcc6 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -7486,6 +7486,7 @@ CONFIG_TEST_KSTRTOX=y
 # CONFIG_TEST_UDELAY is not set
 # CONFIG_TEST_STATIC_KEYS is not set
 # CONFIG_TEST_KMOD is not set
+# CONFIG_TEST_FREE_PAGES is not set
 # CONFIG_MEMTEST is not set
 # CONFIG_BUG_ON_DATA_CORRUPTION is not set
 # CONFIG_SAMPLES is not set
From: Marc Zyngier <maz@kernel.org>

stable inclusion
from linux-4.19.155
commit d953cd3fb149a999534ff48af323eb091fd5aa3b
--------------------------------
commit 18fce56134c987e5b4eceddafdbe4b00c07e2ae1 upstream.
Commit 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof") changed the way we deal with ARCH_WORKAROUND_1, by moving most of the enabling code to the .matches() callback.
This has the unfortunate effect that the workaround gets only enabled on the first affected CPU, and no other.
In order to address this, forcefully call the .matches() callback from a .cpu_enable() callback, which brings us back to the original behaviour.
Fixes: 73f381660959 ("arm64: Advertise mitigation of Spectre-v2, or lack thereof")
Cc: <stable@vger.kernel.org>
Reviewed-by: Suzuki K Poulose <suzuki.poulose@arm.com>
Signed-off-by: Marc Zyngier <maz@kernel.org>
Signed-off-by: Will Deacon <will@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 arch/arm64/kernel/cpu_errata.c | 8 ++++++++
 1 file changed, 8 insertions(+)
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 5f2c30ae82ac..c0b2ef5e7ea3 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -647,6 +647,12 @@ check_branch_predictor(const struct arm64_cpu_capabilities *entry, int scope)
 	return (need_wa > 0);
 }
 
+static void
+cpu_enable_branch_predictor_hardening(const struct arm64_cpu_capabilities *cap)
+{
+	cap->matches(cap, SCOPE_LOCAL_CPU);
+}
+
 static const __maybe_unused struct midr_range tx2_family_cpus[] = {
 	MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN),
 	MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2),
@@ -829,9 +835,11 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 	},
 #endif
 	{
+		.desc = "Branch predictor hardening",
 		.capability = ARM64_HARDEN_BRANCH_PREDICTOR,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = check_branch_predictor,
+		.cpu_enable = cpu_enable_branch_predictor_hardening,
 	},
 #ifdef CONFIG_HARDEN_EL2_VECTORS
 	{
From: Michael Schaller <misch@google.com>

stable inclusion
from linux-4.19.155
commit 02bb497cd6de22e9ce17396957e76fa5aa11102f
--------------------------------
commit 336af6a4686d885a067ecea8c3c3dd129ba4fc75 upstream.
Without this patch efivarfs_alloc_dentry creates dentries with slashes in their name if the respective EFI variable has slashes in its name. This in turn causes EIO on getdents64, which prevents a complete directory listing of /sys/firmware/efi/efivars/.
This patch replaces the invalid slashes with exclamation marks, like kobject_set_name_vargs() does for /sys/firmware/efi/vars/, to have consistently named dentries under /sys/firmware/efi/vars/ and /sys/firmware/efi/efivars/.
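For illustration (hypothetical variable name, not from the patch), the effect of the added strreplace() call:

    char name[] = "Boot/Fallback-8be4df61-...";    /* GUID elided */

    strreplace(name, '/', '!');
    /* name is now "Boot!Fallback-8be4df61-...", a valid dentry name */

strreplace() is the existing lib/string.c helper, char *strreplace(char *s, char old, char new); it rewrites the string in place, replacing every occurrence of the old character.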
Signed-off-by: Michael Schaller <misch@google.com>
Link: https://lore.kernel.org/r/20200925074502.150448-1-misch@google.com
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: dann frazier <dann.frazier@canonical.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 fs/efivarfs/super.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 5b68e4294faa..834615f13f3e 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -145,6 +145,9 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 
 	name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
 
+	/* replace invalid slashes like kobject_set_name_vargs does for /sys/firmware/efi/vars. */
+	strreplace(name, '/', '!');
+
 	inode = efivarfs_get_inode(sb, d_inode(root), S_IFREG | 0644,
 				   0, is_removable);
 	if (!inode)
From: Aleksandr Nogikh <nogikh@google.com>

stable inclusion
from linux-4.19.155
commit 95ba2236b8e69de3cb9b12e1cd6c4252a1574a19
--------------------------------
[ Upstream commit eadd1befdd778a1eca57fad058782bd22b4db804 ]
Currently it is possible to craft a special netlink RTM_NEWQDISC command that can result in jitter being equal to 0x80000000. It is enough to set the 32 bit jitter to 0x02000000 (it will later be multiplied by 2^6) or just set the 64 bit jitter via TCA_NETEM_JITTER64. This causes an overflow during the generation of uniformly distributed numbers in tabledist(), which in turn leads to division by zero (sigma != 0, but sigma * 2 is 0).
The related fragment of code needs 32-bit division - see commit 9b0ed89 ("netem: remove unnecessary 64 bit modulus"), so switching to 64 bit is not an option.
Fix the issue by keeping the value of jitter within the range that can be adequately handled by tabledist() - [0;INT_MAX]. As negative std deviation makes no sense, take the absolute value of the passed value and cap it at INT_MAX. Inside tabledist(), switch to unsigned 32 bit arithmetic in order to prevent overflows.
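A hypothetical userspace sketch of the overflow (demo code, not from the patch): with the 32 bit jitter set to 0x02000000 and scaled by 2^6, sigma becomes 0x80000000, i.e. INT_MIN as an s32, and 2 * sigma wraps to 0 in 32-bit arithmetic, so the modulus in tabledist() divides by zero:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        int32_t sigma = INT32_MIN;    /* 0x02000000 scaled by 2^6 */
        /* compute 2 * sigma in 64 bits, then truncate to 32 bits,
         * mirroring how the kernel's s32 expression wraps */
        uint32_t twice = (uint32_t)(2 * (int64_t)sigma);

        printf("2 * sigma as 32 bits = %u\n", twice);    /* prints 0 */
        /* rnd % twice in tabledist() would then be a division by zero */
        return 0;
    }

Capping the jitter to [0, INT_MAX] and doing the modulus in u32 keeps 2 * sigma representable and nonzero for any nonzero sigma.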
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Signed-off-by: Aleksandr Nogikh <nogikh@google.com>
Reported-by: syzbot+ec762a6342ad0d3c0d8f@syzkaller.appspotmail.com
Acked-by: Stephen Hemminger <stephen@networkplumber.org>
Link: https://lore.kernel.org/r/20201028170731.1383332-1-aleksandrnogikh@gmail.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Cheng Jian <cj.chengjian@huawei.com>
---
 net/sched/sch_netem.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 014a28d8dd4f..02d8d3fd84a5 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -330,7 +330,7 @@ static s64 tabledist(s64 mu, s32 sigma,
 
 	/* default uniform distribution */
 	if (dist == NULL)
-		return ((rnd % (2 * sigma)) + mu) - sigma;
+		return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
 
 	t = dist->table[rnd % dist->size];
 	x = (sigma % NETEM_DIST_SCALE) * t;
@@ -787,6 +787,10 @@ static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 		q->slot_config.max_packets = INT_MAX;
 	if (q->slot_config.max_bytes == 0)
 		q->slot_config.max_bytes = INT_MAX;
+
+	/* capping dist_jitter to the range acceptable by tabledist() */
+	q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
+
 	q->slot.packets_left = q->slot_config.max_packets;
 	q->slot.bytes_left = q->slot_config.max_bytes;
 	if (q->slot_config.min_delay | q->slot_config.max_delay |
@@ -1011,6 +1015,9 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	if (tb[TCA_NETEM_SLOT])
 		get_slot(q, tb[TCA_NETEM_SLOT]);
 
+	/* capping jitter to the range acceptable by tabledist() */
+	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
+
 	return ret;
 
 get_table_failure: