Ard Biesheuvel (1): crypto: tcrypt - avoid signed overflow in byte count
Guenter Roeck (1): lib/list_debug.c: Detect uninitialized lists
Guoqing Jiang (1): md: protect md_unregister_thread from reentrancy
Konstantin Khlebnikov (1): sched/rt: Optimize checking group RT scheduler constraints
Sean Christopherson (1): hugetlbfs: fix off-by-one error in hugetlb_vmdelete_list()
 crypto/tcrypt.c      | 20 ++++++++++----------
 drivers/md/md.c      | 15 ++++++++++-----
 fs/hugetlbfs/inode.c |  7 ++++---
 kernel/sched/rt.c    | 24 +++++++++++-------------
 lib/list_debug.c     | 12 ++++++++++--
 5 files changed, 45 insertions(+), 33 deletions(-)
From: Ard Biesheuvel <ardb@kernel.org>
stable inclusion
from stable-v4.19.179
commit 2fd58d8bb0853c3f424a089f71805e26b9b5e899
category: bugfix
bugzilla: 188240
CVE: NA
--------------------------------
[ Upstream commit 303fd3e1c771077e32e96e5788817f025f0067e2 ]
The signed long type used for printing the number of bytes processed in tcrypt benchmarks limits the range to -/+ 2 GiB, which is not sufficient to cover the performance of common accelerated ciphers such as AES-NI when benchmarked with sec=1. So switch to u64 instead.
While at it, fix up a missing printk->pr_cont conversion in the AEAD benchmark.
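As an illustration only (not part of the patch), here is a minimal userspace sketch of why the cast matters on platforms where long is 32 bits; the operation counts are made up:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical result of a one-second run: 200000 ops of 16384 bytes. */
	int bcount = 200000;
	int blen = 16384;

	/* The full product is ~3.05 GiB and needs more than 31 bits. */
	uint64_t bytes = (uint64_t)bcount * blen;
	/* What a 32-bit signed long would end up holding (wraps negative). */
	int32_t wrapped = (int32_t)bytes;

	printf("u64:         %" PRIu64 " bytes\n", bytes);
	printf("32-bit long: %" PRId32 " bytes\n", wrapped);
	return 0;
}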
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: GUO Zihua <guozihua@huawei.com>
Reviewed-by: Wang Weiyang <wangweiyang2@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 crypto/tcrypt.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/crypto/tcrypt.c b/crypto/tcrypt.c
index 43e3f24885ae..366f4510acbe 100644
--- a/crypto/tcrypt.c
+++ b/crypto/tcrypt.c
@@ -202,8 +202,8 @@ static int test_mb_aead_jiffies(struct test_mb_aead_data *data, int enc,
 			goto out;
 	}
 
-	pr_cont("%d operations in %d seconds (%ld bytes)\n",
-		bcount * num_mb, secs, (long)bcount * blen * num_mb);
+	pr_cont("%d operations in %d seconds (%llu bytes)\n",
+		bcount * num_mb, secs, (u64)bcount * blen * num_mb);
 
 out:
 	kfree(rc);
@@ -472,8 +472,8 @@ static int test_aead_jiffies(struct aead_request *req, int enc,
 			return ret;
 	}
 
-	printk("%d operations in %d seconds (%ld bytes)\n",
-	       bcount, secs, (long)bcount * blen);
+	pr_cont("%d operations in %d seconds (%llu bytes)\n",
+		bcount, secs, (u64)bcount * blen);
 	return 0;
 }
 
@@ -763,8 +763,8 @@ static int test_mb_ahash_jiffies(struct test_mb_ahash_data *data, int blen,
 			goto out;
 	}
 
-	pr_cont("%d operations in %d seconds (%ld bytes)\n",
-		bcount * num_mb, secs, (long)bcount * blen * num_mb);
+	pr_cont("%d operations in %d seconds (%llu bytes)\n",
+		bcount * num_mb, secs, (u64)bcount * blen * num_mb);
 
 out:
 	kfree(rc);
@@ -1200,8 +1200,8 @@ static int test_mb_acipher_jiffies(struct test_mb_skcipher_data *data, int enc,
 			goto out;
 	}
 
-	pr_cont("%d operations in %d seconds (%ld bytes)\n",
-		bcount * num_mb, secs, (long)bcount * blen * num_mb);
+	pr_cont("%d operations in %d seconds (%llu bytes)\n",
+		bcount * num_mb, secs, (u64)bcount * blen * num_mb);
 
 out:
 	kfree(rc);
@@ -1429,8 +1429,8 @@ static int test_acipher_jiffies(struct skcipher_request *req, int enc,
 			return ret;
 	}
 
-	pr_cont("%d operations in %d seconds (%ld bytes)\n",
-		bcount, secs, (long)bcount * blen);
+	pr_cont("%d operations in %d seconds (%llu bytes)\n",
+		bcount, secs, (u64)bcount * blen);
 	return 0;
 }
From: Guenter Roeck <linux@roeck-us.net>
stable inclusion
from stable-v4.19.256
commit 2ea5c0eb2786cda8043a7f72d54f1bfd5e496460
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6C67X
CVE: NA
--------------------------------
[ Upstream commit 0cc011c576aaa4de505046f7a6c90933d7c749a9 ]
In some circumstances, attempts are made to add entries to or to remove entries from an uninitialized list. A prime example is amdgpu_bo_vm_destroy(): It is indirectly called from ttm_bo_init_reserved() if that function fails, and tries to remove an entry from a list. However, that list is only initialized in amdgpu_bo_create_vm() after the call to ttm_bo_init_reserved() returned success. This results in crashes such as
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP NOPTI
CPU: 1 PID: 1479 Comm: chrome Not tainted 5.10.110-15768-g29a72e65dae5
Hardware name: Google Grunt/Grunt, BIOS Google_Grunt.11031.149.0 07/15/2020
RIP: 0010:__list_del_entry_valid+0x26/0x7d
...
Call Trace:
 amdgpu_bo_vm_destroy+0x48/0x8b
 ttm_bo_init_reserved+0x1d7/0x1e0
 amdgpu_bo_create+0x212/0x476
 ? amdgpu_bo_user_destroy+0x23/0x23
 ? kmem_cache_alloc+0x60/0x271
 amdgpu_bo_create_vm+0x40/0x7d
 amdgpu_vm_pt_create+0xe8/0x24b
 ...
Check if the list's prev and next pointers are NULL to catch such problems.
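A minimal userspace sketch (not from the patch; struct list_head here is a local stand-in for the kernel type) of the failure mode these checks catch, i.e. a zero-initialized list_head that was never passed through INIT_LIST_HEAD():

#include <stdio.h>
#include <stdlib.h>

/* Local stand-in for the kernel's struct list_head. */
struct list_head {
	struct list_head *next, *prev;
};

/* Mirrors the added checks: refuse NULL next/prev before dereferencing. */
static int list_del_entry_valid(struct list_head *entry)
{
	if (!entry->next || !entry->prev) {
		fprintf(stderr, "list_del corruption, %p next/prev is NULL\n",
			(void *)entry);
		return 0;
	}
	return 1;
}

int main(void)
{
	/* Zeroed but never INIT_LIST_HEAD()'d, like the amdgpu_bo_vm_destroy() case. */
	struct list_head *entry = calloc(1, sizeof(*entry));

	if (entry && !list_del_entry_valid(entry))
		printf("caught uninitialized list instead of crashing\n");
	free(entry);
	return 0;
}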
Link: https://lkml.kernel.org/r/20220531222951.92073-1-linux@roeck-us.net
Signed-off-by: Guenter Roeck <linux@roeck-us.net>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
Signed-off-by: Ye Weihua <yeweihua4@huawei.com>
Reviewed-by: Yang Jihong <yangjihong1@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 lib/list_debug.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/lib/list_debug.c b/lib/list_debug.c
index 5d5424b51b74..413daa72a3d8 100644
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -20,7 +20,11 @@ bool __list_add_valid(struct list_head *new, struct list_head *prev,
 		      struct list_head *next)
 {
-	if (CHECK_DATA_CORRUPTION(next->prev != prev,
+	if (CHECK_DATA_CORRUPTION(prev == NULL,
+			"list_add corruption. prev is NULL.\n") ||
+	    CHECK_DATA_CORRUPTION(next == NULL,
+			"list_add corruption. next is NULL.\n") ||
+	    CHECK_DATA_CORRUPTION(next->prev != prev,
 			"list_add corruption. next->prev should be prev (%px), but was %px. (next=%px).\n",
 			prev, next->prev, next) ||
 	    CHECK_DATA_CORRUPTION(prev->next != next,
@@ -42,7 +46,11 @@ bool __list_del_entry_valid(struct list_head *entry)
 	prev = entry->prev;
 	next = entry->next;
 
-	if (CHECK_DATA_CORRUPTION(next == LIST_POISON1,
+	if (CHECK_DATA_CORRUPTION(next == NULL,
+			"list_del corruption, %px->next is NULL\n", entry) ||
+	    CHECK_DATA_CORRUPTION(prev == NULL,
+			"list_del corruption, %px->prev is NULL\n", entry) ||
+	    CHECK_DATA_CORRUPTION(next == LIST_POISON1,
 			"list_del corruption, %px->next is LIST_POISON1 (%px)\n",
 			entry, LIST_POISON1) ||
 	    CHECK_DATA_CORRUPTION(prev == LIST_POISON2,
From: Sean Christopherson <seanjc@google.com>
mainline inclusion
from mainline-v6.2-rc1
commit d6aba4c8e20d4d2bf65d589953f6d891c178f3a3
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6BE56
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
Pass "end - 1" instead of "end" when walking the interval tree in hugetlb_vmdelete_list() to fix an inclusive vs. exclusive bug. The two callers that pass a non-zero "end" treat it as exclusive, whereas the interval tree iterator expects an inclusive "last". E.g. punching a hole in a file that precisely matches the size of a single hugepage, with a vma starting right on the boundary, will result in unmap_hugepage_range() being called twice, with the second call having start==end.
The off-by-one error doesn't cause functional problems as __unmap_hugepage_range() turns into a massive nop due to short-circuiting its for-loop on "address < end". But the mmu_notifier invocations of invalidate_range_{start,end}() are passed a bogus zero-sized range, which may be unexpected behavior for secondary MMUs.
The bug was exposed by commit ed922739c919 ("KVM: Use interval tree to do fast hva lookup in memslots"), currently queued in the KVM tree for 5.17, which added a WARN to detect ranges with start==end.
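For illustration only (invented index values, not the kernel code), the exclusive-end to inclusive-last conversion the fix performs:

#include <limits.h>
#include <stdio.h>

/* end is exclusive; an interval-tree style lookup wants an inclusive "last".
 * end == 0 means "everything after start". */
static unsigned long exclusive_end_to_last(unsigned long end)
{
	return end ? end - 1 : ULONG_MAX;
}

int main(void)
{
	/* Hole punch covering indices [512, 1024): the old code passed 1024
	 * and could also match a vma starting exactly at 1024, producing a
	 * zero-sized unmap; the fix passes 1023. */
	unsigned long start = 512, end = 1024;

	printf("start=%lu last=%lu (previously %lu)\n",
	       start, exclusive_end_to_last(end), end);
	return 0;
}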
Link: https://lkml.kernel.org/r/20211228234257.1926057-1-seanjc@google.com
Fixes: 1bfad99ab425 ("hugetlbfs: hugetlb_vmtruncate_list() needs to take a range to delete")
Signed-off-by: Sean Christopherson <seanjc@google.com>
Reported-by: syzbot+4e697fe80a31aa7efe21@syzkaller.appspotmail.com
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
Reviewed-by: Zhihao Cheng <chengzhihao1@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 fs/hugetlbfs/inode.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 23c86e5b51a9..f29d6d2fb052 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -415,10 +415,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
 	struct vm_area_struct *vma;
 
 	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
+	 * end == 0 indicates that the entire range after start should be
+	 * unmapped. Note, end is exclusive, whereas the interval tree takes
+	 * an inclusive "last".
 	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+	vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
 		unsigned long v_offset;
 		unsigned long v_end;
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
mainline inclusion
from mainline-v5.19-rc1
commit 1e267742283a4b5a8ca65755c44166be27e9aa0f
category: bugfix
bugzilla: 188227, https://gitee.com/openeuler/kernel/issues/I6AG8P
CVE: NA
--------------------------------
Generally, md_unregister_thread() is called with reconfig_mutex held, but raid_message() in dm-raid does not hold reconfig_mutex when unregistering the thread, so md_unregister_thread() can, in theory, be called simultaneously from two call sites.
After the previous commit, which removed the reconfig_mutex protection from md_unregister_thread() completely, the potential issue could be worse than before.
Let's take pers_lock at the beginning of the function to protect against reentrancy.
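A simplified userspace sketch of the pattern (a pthread mutex standing in for pers_lock; not the driver code): the caller that wins the lock clears the shared pointer, so a concurrent second caller sees NULL and returns instead of stopping and freeing the same thread twice:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pers_lock = PTHREAD_MUTEX_INITIALIZER;

struct md_thread_stub { int dummy; };

/* Only one caller gets a non-NULL thread; any other caller bails out early. */
static void unregister_thread(struct md_thread_stub **threadp)
{
	struct md_thread_stub *thread;

	pthread_mutex_lock(&pers_lock);
	thread = *threadp;
	if (!thread) {
		pthread_mutex_unlock(&pers_lock);
		return;
	}
	*threadp = NULL;
	pthread_mutex_unlock(&pers_lock);

	/* Stop and free outside the lock, exactly once. */
	printf("stopping thread %p\n", (void *)thread);
	free(thread);
}

int main(void)
{
	struct md_thread_stub *t = calloc(1, sizeof(*t));

	unregister_thread(&t);   /* frees once */
	unregister_thread(&t);   /* second call is a no-op */
	return 0;
}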
Reported-by: Donald Buczek <buczek@molgen.mpg.de>
Signed-off-by: Guoqing Jiang <guoqing.jiang@linux.dev>
Signed-off-by: Song Liu <song@kernel.org>
Signed-off-by: Li Nan <linan122@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 drivers/md/md.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1088512b9a11..c4b933ea3db1 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7697,17 +7697,22 @@ EXPORT_SYMBOL(md_register_thread);
 
 void md_unregister_thread(struct md_thread **threadp)
 {
-	struct md_thread *thread = *threadp;
-	if (!thread)
-		return;
-	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
-	/* Locking ensures that mddev_unlock does not wake_up a
+	struct md_thread *thread;
+
+	/*
+	 * Locking ensures that mddev_unlock does not wake_up a
 	 * non-existent thread
 	 */
 	spin_lock(&pers_lock);
+	thread = *threadp;
+	if (!thread) {
+		spin_unlock(&pers_lock);
+		return;
+	}
 	*threadp = NULL;
 	spin_unlock(&pers_lock);
 
+	pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
 	kthread_stop(thread->tsk);
 	kfree(thread);
 }
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
mainline inclusion
from mainline-v5.7-rc1
commit b4fb015eeff7f3e5518a7dbe8061169a3e2f2bc7
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I6DZRO
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The group RT scheduler contains protection against setting zero runtime for a cgroup with RT tasks. Right now tg_set_rt_bandwidth() iterates over all CPU cgroups and calls tg_has_rt_tasks() for any cgroup whose runtime is zero (not only for the changed one). The default RT runtime is zero, so tg_has_rt_tasks() is called for almost all CPU cgroups.
This protection is already slightly racy: the runtime limit could be changed between cpu_cgroup_can_attach() and cpu_cgroup_attach() because changing a cgroup attribute does not lock cgroup_mutex while attach does not lock rt_constraints_mutex. Changing a task's scheduler class also races with changing the rt runtime: the check in __sched_setscheduler() isn't protected.
tg_has_rt_tasks() iterates over all threads in the system. This gives NR_CGROUPS * NR_TASKS operations under a single tasklist_lock held for read in tg_set_rt_bandwidth(). Any concurrent attempt to lock tasklist_lock for write (for example, fork) will get stuck with irqs disabled.
This patch makes two optimizations:
1) Remove locking tasklist_lock and iterate only tasks in cgroup
2) Call tg_has_rt_tasks() iff rt runtime changes from non-zero to zero
All changed code is under CONFIG_RT_GROUP_SCHED.
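To make optimization 2) concrete, a small self-contained sketch of the new condition (simplified, with invented types; not the kernel code): the expensive task scan only matters when a previously non-zero runtime is being set to zero:

#include <stdbool.h>
#include <stdio.h>

struct tg_stub {
	unsigned long rt_runtime;   /* current runtime of this group */
	bool has_rt_tasks;          /* stand-in for the expensive task iteration */
};

/* Refuse to starve existing RT tasks, but only bother scanning the group's
 * tasks when the runtime actually transitions from non-zero to zero. */
static int check_runtime_change(struct tg_stub *tg, unsigned long new_runtime)
{
	if (!new_runtime && tg->rt_runtime && tg->has_rt_tasks)
		return -1;              /* -EBUSY in the kernel */
	return 0;
}

int main(void)
{
	struct tg_stub tg = { .rt_runtime = 0, .has_rt_tasks = false };

	/* Default runtime is already zero: no scan, no rejection. */
	printf("%d\n", check_runtime_change(&tg, 0));

	tg.rt_runtime = 950000;
	tg.has_rt_tasks = true;
	/* Dropping a populated group's runtime to zero is rejected. */
	printf("%d\n", check_runtime_change(&tg, 0));
	return 0;
}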
Testcase:
 # mkdir /sys/fs/cgroup/cpu/test{1..10000}
 # echo 0 | tee /sys/fs/cgroup/cpu/test*/cpu.rt_runtime_us
At the same time, without the patch, fork time will be >100ms:
# perf trace -e clone --duration 100 stress-ng --fork 1
Remote ping will also show timings >100ms, caused by the irq latency.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Link: https://lkml.kernel.org/r/157996383820.4651.11292439232549211693.stgit@buzz
Signed-off-by: Zhao Wenhui <zhaowenhui8@huawei.com>
Reviewed-by: chenhui <judy.chenhui@huawei.com>
Signed-off-by: Yongqiang Liu <liuyongqiang13@huawei.com>
---
 kernel/sched/rt.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a500bd78199b..4d6f1bc86b1b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2423,10 +2423,11 @@ const struct sched_class rt_sched_class = {
  */
 static DEFINE_MUTEX(rt_constraints_mutex);
 
-/* Must be called with tasklist_lock held */
 static inline int tg_has_rt_tasks(struct task_group *tg)
 {
-	struct task_struct *g, *p;
+	struct task_struct *task;
+	struct css_task_iter it;
+	int ret = 0;
 
 	/*
 	 * Autogroups do not have RT tasks; see autogroup_create().
@@ -2434,12 +2435,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 	if (task_group_is_autogroup(tg))
 		return 0;
 
-	for_each_process_thread(g, p) {
-		if (rt_task(p) && task_group(p) == tg)
-			return 1;
-	}
+	css_task_iter_start(&tg->css, 0, &it);
+	while (!ret && (task = css_task_iter_next(&it)))
+		ret |= rt_task(task);
+	css_task_iter_end(&it);
 
-	return 0;
+	return ret;
 }
 
 struct rt_schedulable_data {
@@ -2470,9 +2471,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
 		return -EINVAL;
 
 	/*
-	 * Ensure we don't starve existing RT tasks.
+	 * Ensure we don't starve existing RT tasks if runtime turns zero.
 	 */
-	if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+	if (rt_bandwidth_enabled() && !runtime &&
+	    tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
 		return -EBUSY;
 
 	total = to_ratio(period, runtime);
@@ -2538,7 +2540,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 		return -EINVAL;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
 	if (err)
 		goto unlock;
@@ -2556,7 +2557,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 	}
 	raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
 unlock:
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return err;
@@ -2615,9 +2615,7 @@ static int sched_rt_global_constraints(void)
 	int ret = 0;
 
 	mutex_lock(&rt_constraints_mutex);
-	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
-	read_unlock(&tasklist_lock);
 	mutex_unlock(&rt_constraints_mutex);
 
 	return ret;