hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9VPMT
---------------------------
Remove warn printk for checking 'rq->tmp_alone_branch !=
&rq->leaf_cfs_rq_list' to avoid rq deadlock.
Deadlock analysis:
cpu 0
distribute_cfs_runtime --- rq_lock_irqsave(rq, &rf);
->__warn_printk
->try_to_wake_up --- rq_lock(rq, &rf), deadlock
Call Trace:
queued_spin_lock_slowpath at ffff000080173358
try_to_wake_up at ffff000080141068
wake_up_process at ffff00008014113c
insert_work at ffff000080123750
__queue_work at ffff0000801257ac
queue_work_on at ffff000080125c54
drm_fb_helper_dirty at ffff0000806dcd44
drm_fb_helper_sys_imageblit at ffff0000806dcf04
virtio_gpu_3d_imageblit at ffff000000c915d0 [virtio_gpu]
soft_cursor at ffff0000805e3e04
bit_cursor at ffff0000805e3654
fbcon_cursor at ffff0000805df404
hide_cursor at ffff000080677d68
vt_console_print at ffff0000806799dc
console_unlock at ffff000080183d78
vprintk_emit at ffff000080185948
vprintk_default at ffff000080185b80
vprintk_func at ffff000080186c44
printk at ffff000080186394
__warn_printk at ffff000080102d60
unthrottle_cfs_rq at ffff000080155e50
distribute_cfs_runtime at ffff00008015617c
sched_cfs_period_timer at ffff00008015654c
__hrtimer_run_queues at ffff0000801b2c58
hrtimer_interrupt at ffff0000801b3c74
arch_timer_handler_virt at ffff00008089dc3c
handle_percpu_devid_irq at ffff00008018fb3c
generic_handle_irq at ffff000080187140
__handle_domain_irq at ffff000080187adc
gic_handle_irq at ffff000080081814
Fixes: 6e9efc5d870d ("sched/fair: Add tmp_alone_branch assertion")
Signed-off-by: Hui Tang <tanghui20(a)huawei.com>
---
kernel/sched/fair.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3bd5aa6dedb3..aee13d30a7de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -393,9 +393,12 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
}
+/*
+ * There is a possible rq deadlock when the warning is triggered,
+ * since try_to_wake_up may be called by __warn_printk.
+ */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
--
2.34.1
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9VPMT
---------------------------
Remove warn printk for checking 'rq->tmp_alone_branch !=
&rq->leaf_cfs_rq_list' to avoid rq deadlock.
Deadlock analysis:
cpu 0
distribute_cfs_runtime --- rq_lock_irqsave(rq, &rf);
->__warn_printk
->try_to_wake_up --- rq_lock(rq, &rf), deadlock
Call Trace:
queued_spin_lock_slowpath at ffff000080173358
try_to_wake_up at ffff000080141068
wake_up_process at ffff00008014113c
insert_work at ffff000080123750
__queue_work at ffff0000801257ac
queue_work_on at ffff000080125c54
drm_fb_helper_dirty at ffff0000806dcd44
drm_fb_helper_sys_imageblit at ffff0000806dcf04
virtio_gpu_3d_imageblit at ffff000000c915d0 [virtio_gpu]
soft_cursor at ffff0000805e3e04
bit_cursor at ffff0000805e3654
fbcon_cursor at ffff0000805df404
hide_cursor at ffff000080677d68
vt_console_print at ffff0000806799dc
console_unlock at ffff000080183d78
vprintk_emit at ffff000080185948
vprintk_default at ffff000080185b80
vprintk_func at ffff000080186c44
printk at ffff000080186394
__warn_printk at ffff000080102d60
unthrottle_cfs_rq at ffff000080155e50
distribute_cfs_runtime at ffff00008015617c
sched_cfs_period_timer at ffff00008015654c
__hrtimer_run_queues at ffff0000801b2c58
hrtimer_interrupt at ffff0000801b3c74
arch_timer_handler_virt at ffff00008089dc3c
handle_percpu_devid_irq at ffff00008018fb3c
generic_handle_irq at ffff000080187140
__handle_domain_irq at ffff000080187adc
gic_handle_irq at ffff000080081814
Signed-off-by: Hui Tang <tanghui20(a)huawei.com>
---
kernel/sched/fair.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3bd5aa6dedb3..aee13d30a7de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -393,9 +393,12 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
}
+/*
+ * There is a possible rq deadlock when the warning is triggered,
+ * since try_to_wake_up may be called by __warn_printk.
+ */
static inline void assert_list_leaf_cfs_rq(struct rq *rq)
{
- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list);
}
/* Iterate thr' all leaf cfs_rq's on a runqueue */
--
2.34.1
From: Josef Bacik <josef(a)toxicpanda.com>
stable inclusion
from stable-v6.6.24
commit ded566b4637f1b6b4c9ba74e7d0b8493e93f19cf
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9Q8ZZ
CVE: CVE-2024-35784
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
commit b0ad381fa7690244802aed119b478b4bdafc31dd upstream.
While working on the patchset to remove extent locking I got a lockdep
splat with fiemap and pagefaulting with my new extent lock replacement
lock.
This deadlock exists with our normal code, we just don't have lockdep
annotations with the extent locking so we've never noticed it.
Since we're copying the fiemap extent to user space on every iteration
we have the chance of pagefaulting. Because we hold the extent lock for
the entire range we could mkwrite into a range in the file that we have
mmap'ed. This would deadlock with the following stack trace
[<0>] lock_extent+0x28d/0x2f0
[<0>] btrfs_page_mkwrite+0x273/0x8a0
[<0>] do_page_mkwrite+0x50/0xb0
[<0>] do_fault+0xc1/0x7b0
[<0>] __handle_mm_fault+0x2fa/0x460
[<0>] handle_mm_fault+0xa4/0x330
[<0>] do_user_addr_fault+0x1f4/0x800
[<0>] exc_page_fault+0x7c/0x1e0
[<0>] asm_exc_page_fault+0x26/0x30
[<0>] rep_movs_alternative+0x33/0x70
[<0>] _copy_to_user+0x49/0x70
[<0>] fiemap_fill_next_extent+0xc8/0x120
[<0>] emit_fiemap_extent+0x4d/0xa0
[<0>] extent_fiemap+0x7f8/0xad0
[<0>] btrfs_fiemap+0x49/0x80
[<0>] __x64_sys_ioctl+0x3e1/0xb50
[<0>] do_syscall_64+0x94/0x1a0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
I wrote an fstest to reproduce this deadlock without my replacement lock
and verified that the deadlock exists with our existing locking.
To fix this simply don't take the extent lock for the entire duration of
the fiemap. This is safe in general because we keep track of where we
are when we're searching the tree, so if an ordered extent updates in
the middle of our fiemap call we'll still emit the correct extents
because we know what offset we were on before.
The only place we maintain the lock is searching delalloc. Since the
delalloc stuff can change during writeback we want to lock the extent
range so we have a consistent view of delalloc at the time we're
checking to see if we need to set the delalloc flag.
With this patch applied we no longer deadlock with my testcase.
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
fs/btrfs/extent_io.c
[The key to fixing the patch is the fiemap_process_hole function, which
locks only before querying delalloc. Earlier versions do not have this
function, and adaptation requires a lot of refactoring patches. So do
something similar directly in the get_extent_skip_holes function, which
contains the logic to query delalloc.]
Signed-off-by: Zizhi Wo <wozizhi(a)huawei.com>
---
fs/btrfs/extent_io.c | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b20021c501d7..8aa7738d9aad 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4332,12 +4332,31 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode,
return NULL;
while (1) {
+ struct extent_state *cached_state = NULL;
+ u64 lockstart;
+ u64 lockend;
+
len = last - offset;
if (len == 0)
break;
len = ALIGN(len, sectorsize);
+ lockstart = round_down(offset, sectorsize);
+ lockend = round_up(offset + len, sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state);
em = btrfs_get_extent_fiemap(BTRFS_I(inode), NULL, 0, offset,
len, 0);
+ unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4481,7 +4500,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 isize = i_size_read(inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct fiemap_cache cache = { 0 };
@@ -4547,9 +4565,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state);
-
em = get_extent_skip_holes(inode, start, last_for_get_extent);
if (!em)
goto out;
@@ -4662,8 +4677,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
free_extent_map(em);
out:
btrfs_free_path(path);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
- &cached_state);
return ret;
}
--
2.39.2
From: Josef Bacik <josef(a)toxicpanda.com>
stable inclusion
from stable-v6.6.24
commit ded566b4637f1b6b4c9ba74e7d0b8493e93f19cf
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9Q8ZZ
CVE: CVE-2024-35784
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?…
--------------------------------
commit b0ad381fa7690244802aed119b478b4bdafc31dd upstream.
While working on the patchset to remove extent locking I got a lockdep
splat with fiemap and pagefaulting with my new extent lock replacement
lock.
This deadlock exists with our normal code, we just don't have lockdep
annotations with the extent locking so we've never noticed it.
Since we're copying the fiemap extent to user space on every iteration
we have the chance of pagefaulting. Because we hold the extent lock for
the entire range we could mkwrite into a range in the file that we have
mmap'ed. This would deadlock with the following stack trace
[<0>] lock_extent+0x28d/0x2f0
[<0>] btrfs_page_mkwrite+0x273/0x8a0
[<0>] do_page_mkwrite+0x50/0xb0
[<0>] do_fault+0xc1/0x7b0
[<0>] __handle_mm_fault+0x2fa/0x460
[<0>] handle_mm_fault+0xa4/0x330
[<0>] do_user_addr_fault+0x1f4/0x800
[<0>] exc_page_fault+0x7c/0x1e0
[<0>] asm_exc_page_fault+0x26/0x30
[<0>] rep_movs_alternative+0x33/0x70
[<0>] _copy_to_user+0x49/0x70
[<0>] fiemap_fill_next_extent+0xc8/0x120
[<0>] emit_fiemap_extent+0x4d/0xa0
[<0>] extent_fiemap+0x7f8/0xad0
[<0>] btrfs_fiemap+0x49/0x80
[<0>] __x64_sys_ioctl+0x3e1/0xb50
[<0>] do_syscall_64+0x94/0x1a0
[<0>] entry_SYSCALL_64_after_hwframe+0x6e/0x76
I wrote an fstest to reproduce this deadlock without my replacement lock
and verified that the deadlock exists with our existing locking.
To fix this simply don't take the extent lock for the entire duration of
the fiemap. This is safe in general because we keep track of where we
are when we're searching the tree, so if an ordered extent updates in
the middle of our fiemap call we'll still emit the correct extents
because we know what offset we were on before.
The only place we maintain the lock is searching delalloc. Since the
delalloc stuff can change during writeback we want to lock the extent
range so we have a consistent view of delalloc at the time we're
checking to see if we need to set the delalloc flag.
With this patch applied we no longer deadlock with my testcase.
CC: stable(a)vger.kernel.org # 6.1+
Reviewed-by: Filipe Manana <fdmanana(a)suse.com>
Signed-off-by: Josef Bacik <josef(a)toxicpanda.com>
Reviewed-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: David Sterba <dsterba(a)suse.com>
Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org>
Conflicts:
fs/btrfs/extent_io.c
[The key to fixing the patch is the fiemap_process_hole function, which
locks only before querying delalloc. Earlier versions do not have this
function, and adaptation requires a lot of refactoring patches. So do
something similar directly in the get_extent_skip_holes function, which
contains the logic to query delalloc.]
Signed-off-by: Zizhi Wo <wozizhi(a)huawei.com>
---
fs/btrfs/extent_io.c | 26 +++++++++++++++++++-------
1 file changed, 19 insertions(+), 7 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 685a375bb6af..0d7843323930 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4532,11 +4532,30 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
return NULL;
while (1) {
+ struct extent_state *cached_state = NULL;
+ u64 lockstart;
+ u64 lockend;
+
len = last - offset;
if (len == 0)
break;
len = ALIGN(len, sectorsize);
+ lockstart = round_down(offset, sectorsize);
+ lockend = round_up(offset + len, sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
em = btrfs_get_extent_fiemap(inode, offset, len);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (IS_ERR_OR_NULL(em))
return em;
@@ -4679,7 +4698,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 isize = i_size_read(&inode->vfs_inode);
struct btrfs_key found_key;
struct extent_map *em = NULL;
- struct extent_state *cached_state = NULL;
struct btrfs_path *path;
struct btrfs_root *root = inode->root;
struct fiemap_cache cache = { 0 };
@@ -4758,9 +4776,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
last_for_get_extent = isize;
}
- lock_extent_bits(&inode->io_tree, start, start + len - 1,
- &cached_state);
-
em = get_extent_skip_holes(inode, start, last_for_get_extent);
if (!em)
goto out;
@@ -4871,9 +4886,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
ret = emit_last_fiemap_cache(fieinfo, &cache);
free_extent_map(em);
out:
- unlock_extent_cached(&inode->io_tree, start, start + len - 1,
- &cached_state);
-
out_free_ulist:
btrfs_free_path(path);
ulist_free(roots);
--
2.39.2