From: Zhang Yi yi.zhang@huawei.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6D5XF
Reference: https://lore.kernel.org/linux-ext4/20230130111138.76tp6pij3yhh4brh@quack3/T/...
--------------------------------
Currently _ext4_show_options() does not distinguish the MOPT_2 flag, so it mixes the extended sbi->s_mount_opt2 options with sbi->s_mount_opt. This can lead to incorrect options being shown, e.g. fc_debug_force is shown if we mount with errors=continue, and it is missing when we actually set it.
 $ mkfs.ext4 /dev/pmem0
 $ mount -o errors=remount-ro /dev/pmem0 /mnt
 $ cat /proc/fs/ext4/pmem0/options | grep fc_debug_force
 #empty
 $ mount -o remount,errors=continue /mnt
 $ cat /proc/fs/ext4/pmem0/options | grep fc_debug_force
 fc_debug_force
 $ mount -o remount,errors=remount-ro,fc_debug_force /mnt
 $ cat /proc/fs/ext4/pmem0/options | grep fc_debug_force
 #empty
Fixes: 995a3ed67fc8 ("ext4: add fast_commit feature and handling for extended mount options") Signed-off-by: Zhang Yi yi.zhang@huawei.com
Conflict: fs/ext4/super.c Reviewed-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/ext4/ext4.h | 1 + fs/ext4/super.c | 28 +++++++++++++++++++++------- 2 files changed, 22 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 8245b94d8fc6..f1d36671bc2b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1462,6 +1462,7 @@ struct ext4_sb_info { unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; + unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c94ea845ea57..4fd680507948 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2586,7 +2586,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; - int def_errors, def_mount_opt = sbi->s_def_mount_opt; + int def_errors; const struct mount_opts *m; char sep = nodefs ? '\n' : ',';
@@ -2598,15 +2598,28 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
for (m = ext4_mount_opts; m->token != Opt_err; m++) { int want_set = m->flags & MOPT_SET; + int opt_2 = m->flags & MOPT_2; + unsigned int mount_opt, def_mount_opt; + if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) || (m->flags & MOPT_CLEAR_ERR) || m->flags & MOPT_SKIP) continue; - if (!nodefs && !(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt))) - continue; /* skip if same as the default */ + + if (opt_2) { + mount_opt = sbi->s_mount_opt2; + def_mount_opt = sbi->s_def_mount_opt2; + } else { + mount_opt = sbi->s_mount_opt; + def_mount_opt = sbi->s_def_mount_opt; + } + /* skip if same as the default */ + if (!nodefs && !(m->mount_opt & (mount_opt ^ def_mount_opt))) + continue; + /* select Opt_noFoo vs Opt_Foo */ if ((want_set && - (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) || - (!want_set && (sbi->s_mount_opt & m->mount_opt))) - continue; /* select Opt_noFoo vs Opt_Foo */ + (mount_opt & m->mount_opt) != m->mount_opt) || + (!want_set && (mount_opt & m->mount_opt))) + continue; SEQ_OPTS_PRINT("%s", token2str(m->token)); }
@@ -2636,7 +2649,7 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs || sbi->s_stripe) SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe); if (nodefs || EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ def_mount_opt)) { + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) SEQ_OPTS_PUTS("data=journal"); else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) @@ -4341,6 +4354,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; + sbi->s_def_mount_opt2 = sbi->s_mount_opt2; if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, 0)) goto failed_mount;
From: Dave Chinner dchinner@redhat.com
mainline inclusion from mainline-v5.17-rc6 commit d2d7c0473586d2f22e85d615275f34cf19f94447 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4KIAO CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Most buffer io list operations are run with the bp->b_lock held, but xfs_iflush_abort() can be called without the buffer lock being held resulting in inodes being removed from the buffer list while other list operations are occurring. This causes problems with corrupted bp->b_io_list inode lists during filesystem shutdown, leading to traversals that never end, double removals from the AIL, etc.
Fix this by passing the buffer to xfs_iflush_abort() if we have it locked. If the inode is attached to the buffer, we're going to have to remove it from the buffer list and we'd have to get the buffer off the inode log item to do that anyway.
If we don't have a buffer passed in (e.g. from xfs_reclaim_inode()) then we can determine if the inode has a log item and if it is attached to a buffer before we do anything else. If it does have an attached buffer, we can lock it safely (because the inode has a reference to it) and then perform the inode abort.
Signed-off-by: Dave Chinner dchinner@redhat.com Reviewed-by: Darrick J. Wong djwong@kernel.org Signed-off-by: Darrick J. Wong djwong@kernel.org
conflicts: fs/xfs/xfs_icache.c
Signed-off-by: Long Li leo.lilong@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/xfs/xfs_icache.c | 2 +- fs/xfs/xfs_inode.c | 2 +- fs/xfs/xfs_inode_item.c | 162 +++++++++++++++++++++++++++++++++------- fs/xfs/xfs_inode_item.h | 1 + 4 files changed, 136 insertions(+), 31 deletions(-)
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 019e5f019468..cc1e8bd4ae51 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -885,7 +885,7 @@ xfs_reclaim_inode(
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); - xfs_iflush_abort(ip); + xfs_iflush_shutdown_abort(ip); goto reclaim; } if (xfs_ipincount(ip)) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 2cf69cc5f2bf..f8dfb83492cc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3725,7 +3725,7 @@ xfs_iflush_cluster(
/* * We must use the safe variant here as on shutdown xfs_iflush_abort() - * can remove itself from the list. + * will remove itself from the list. */ list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) { iip = (struct xfs_inode_log_item *)lip; diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 3aba4559469f..fec0a75e8121 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -517,10 +517,17 @@ xfs_inode_item_push( uint rval = XFS_ITEM_SUCCESS; int error;
- ASSERT(iip->ili_item.li_buf); + if (!bp || (ip->i_flags & XFS_ISTALE)) { + /* + * Inode item/buffer is being being aborted due to cluster + * buffer deletion. Trigger a log force to have that operation + * completed and items removed from the AIL before the next push + * attempt. + */ + return XFS_ITEM_PINNED; + }
- if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || - (ip->i_flags & XFS_ISTALE)) + if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp)) return XFS_ITEM_PINNED;
if (xfs_iflags_test(ip, XFS_IFLUSHING)) @@ -796,46 +803,143 @@ xfs_buf_inode_io_fail( }
/* - * This is the inode flushing abort routine. It is called when - * the filesystem is shutting down to clean up the inode state. It is - * responsible for removing the inode item from the AIL if it has not been - * re-logged and clearing the inode's flush state. + * Clear the inode logging fields so no more flushes are attempted. If we are + * on a buffer list, it is now safe to remove it because the buffer is + * guaranteed to be locked. The caller will drop the reference to the buffer + * the log item held. + */ +static void +xfs_iflush_abort_clean( + struct xfs_inode_log_item *iip) +{ + iip->ili_last_fields = 0; + iip->ili_fields = 0; + iip->ili_fsync_fields = 0; + iip->ili_flush_lsn = 0; + iip->ili_item.li_buf = NULL; + list_del_init(&iip->ili_item.li_bio_list); +} + +/* + * Abort flushing the inode from a context holding the cluster buffer locked. + * + * This is the normal runtime method of aborting writeback of an inode that is + * attached to a cluster buffer. It occurs when the inode and the backing + * cluster buffer have been freed (i.e. inode is XFS_ISTALE), or when cluster + * flushing or buffer IO completion encounters a log shutdown situation. + * + * If we need to abort inode writeback and we don't already hold the buffer + * locked, call xfs_iflush_shutdown_abort() instead as this should only ever be + * necessary in a shutdown situation. */ void xfs_iflush_abort( struct xfs_inode *ip) { struct xfs_inode_log_item *iip = ip->i_itemp; - struct xfs_buf *bp = NULL; + struct xfs_buf *bp;
- if (iip) { - /* - * Clear the failed bit before removing the item from the AIL so - * xfs_trans_ail_delete() doesn't try to clear and release the - * buffer attached to the log item before we are done with it. - */ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); - xfs_trans_ail_delete(&iip->ili_item, 0); + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + /* + * Remove the inode item from the AIL before we clear its internal + * state. Whilst the inode is in the AIL, it should have a valid buffer + * pointer for push operations to access - it is only safe to remove the + * inode from the buffer once it has been removed from the AIL. + * + * We also clear the failed bit before removing the item from the AIL + * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer + * references the inode item owns and needs to hold until we've fully + * aborted the inode log item and detached it from the buffer. + */ + clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); + xfs_trans_ail_delete(&iip->ili_item, 0); + + /* + * Grab the inode buffer so can we release the reference the inode log + * item holds on it. + */ + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + xfs_iflush_abort_clean(iip); + spin_unlock(&iip->ili_lock);
+ xfs_iflags_clear(ip, XFS_IFLUSHING); + if (bp) + xfs_buf_rele(bp); +} + +/* + * Abort an inode flush in the case of a shutdown filesystem. This can be called + * from anywhere with just an inode reference and does not require holding the + * inode cluster buffer locked. If the inode is attached to a cluster buffer, + * it will grab and lock it safely, then abort the inode flush. + */ +void +xfs_iflush_shutdown_abort( + struct xfs_inode *ip) +{ + struct xfs_inode_log_item *iip = ip->i_itemp; + struct xfs_buf *bp; + + if (!iip) { + /* clean inode, nothing to do */ + xfs_iflags_clear(ip, XFS_IFLUSHING); + return; + } + + spin_lock(&iip->ili_lock); + bp = iip->ili_item.li_buf; + if (!bp) { + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + return; + } + + /* + * We have to take a reference to the buffer so that it doesn't get + * freed when we drop the ili_lock and then wait to lock the buffer. + * We'll clean up the extra reference after we pick up the ili_lock + * again. + */ + xfs_buf_hold(bp); + spin_unlock(&iip->ili_lock); + xfs_buf_lock(bp); + + spin_lock(&iip->ili_lock); + if (!iip->ili_item.li_buf) { /* - * Clear the inode logging fields so no more flushes are - * attempted. + * Raced with another removal, hold the only reference + * to bp now. Inode should not be in the AIL now, so just clean + * up and return; */ - spin_lock(&iip->ili_lock); - iip->ili_last_fields = 0; - iip->ili_fields = 0; - iip->ili_fsync_fields = 0; - iip->ili_flush_lsn = 0; - bp = iip->ili_item.li_buf; - iip->ili_item.li_buf = NULL; - list_del_init(&iip->ili_item.li_bio_list); + ASSERT(list_empty(&iip->ili_item.li_bio_list)); + ASSERT(!test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)); + xfs_iflush_abort_clean(iip); spin_unlock(&iip->ili_lock); + xfs_iflags_clear(ip, XFS_IFLUSHING); + xfs_buf_relse(bp); + return; } - xfs_iflags_clear(ip, XFS_IFLUSHING); - if (bp) - xfs_buf_rele(bp); + + /* + * Got two references to bp. 
The first will get dropped by + * xfs_iflush_abort() when the item is removed from the buffer list, but + * we can't drop our reference until _abort() returns because we have to + * unlock the buffer as well. Hence we abort and then unlock and release + * our reference to the buffer. + */ + ASSERT(iip->ili_item.li_buf == bp); + spin_unlock(&iip->ili_lock); + xfs_iflush_abort(ip); + xfs_buf_relse(bp); }
+ /* * convert an xfs_inode_log_format struct from the old 32 bit version * (which can have different field alignments) to the native 64 bit version diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 403b45ab9aa2..9c829cf5c839 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h @@ -44,6 +44,7 @@ static inline int xfs_inode_clean(struct xfs_inode *ip) extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); extern void xfs_inode_item_destroy(struct xfs_inode *); extern void xfs_iflush_abort(struct xfs_inode *); +extern void xfs_iflush_shutdown_abort(struct xfs_inode *); extern int xfs_inode_item_format_convert(xfs_log_iovec_t *, struct xfs_inode_log_format *);
From: Mike Christie michael.christie@oracle.com
mainline inclusion from mainline-v5.14-rc1 commit a1f3486b3b095ed2259d7a1fc021a8b6e72a5365 category: bugfix bugzilla: 188443, https://gitee.com/openeuler/kernel/issues/I6I8YD CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
This doesn't fix any bugs, but it makes more sense to free the pool after we have removed the session. At that time we know nothing is touching any of the session fields, because all devices have been removed and scans are stopped.
Link: https://lore.kernel.org/r/20210525181821.7617-19-michael.christie@oracle.com Reviewed-by: Lee Duncan lduncan@suse.com Signed-off-by: Mike Christie michael.christie@oracle.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Zhong Jinghua zhongjinghua@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/scsi/libiscsi.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 176842a869f1..6a622c70514a 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2993,10 +2993,9 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) struct module *owner = cls_session->transport->owner; struct Scsi_Host *shost = session->host;
- iscsi_pool_free(&session->cmdpool); - iscsi_remove_session(cls_session);
+ iscsi_pool_free(&session->cmdpool); kfree(session->password); kfree(session->password_in); kfree(session->username);
From: Mike Christie michael.christie@oracle.com
mainline inclusion from mainline-v6.2-rc6 commit 6f1d64b13097e85abda0f91b5638000afc5f9a06 category: bugfix bugzilla: 188443, https://gitee.com/openeuler/kernel/issues/I6I8YD CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
Bug report and analysis from Ding Hui.
During iSCSI session logout, if another task accesses the shost ipaddress attr, we can get a KASAN UAF report like this:
[ 276.942144] BUG: KASAN: use-after-free in _raw_spin_lock_bh+0x78/0xe0 [ 276.942535] Write of size 4 at addr ffff8881053b45b8 by task cat/4088 [ 276.943511] CPU: 2 PID: 4088 Comm: cat Tainted: G E 6.1.0-rc8+ #3 [ 276.943997] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 [ 276.944470] Call Trace: [ 276.944943] <TASK> [ 276.945397] dump_stack_lvl+0x34/0x48 [ 276.945887] print_address_description.constprop.0+0x86/0x1e7 [ 276.946421] print_report+0x36/0x4f [ 276.947358] kasan_report+0xad/0x130 [ 276.948234] kasan_check_range+0x35/0x1c0 [ 276.948674] _raw_spin_lock_bh+0x78/0xe0 [ 276.949989] iscsi_sw_tcp_host_get_param+0xad/0x2e0 [iscsi_tcp] [ 276.951765] show_host_param_ISCSI_HOST_PARAM_IPADDRESS+0xe9/0x130 [scsi_transport_iscsi] [ 276.952185] dev_attr_show+0x3f/0x80 [ 276.953005] sysfs_kf_seq_show+0x1fb/0x3e0 [ 276.953401] seq_read_iter+0x402/0x1020 [ 276.954260] vfs_read+0x532/0x7b0 [ 276.955113] ksys_read+0xed/0x1c0 [ 276.955952] do_syscall_64+0x38/0x90 [ 276.956347] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 276.956769] RIP: 0033:0x7f5d3a679222 [ 276.957161] Code: c0 e9 b2 fe ff ff 50 48 8d 3d 32 c0 0b 00 e8 a5 fe 01 00 0f 1f 44 00 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 0f 05 <48> 3d 00 f0 ff ff 77 56 c3 0f 1f 44 00 00 48 83 ec 28 48 89 54 24 [ 276.958009] RSP: 002b:00007ffc864d16a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 276.958431] RAX: ffffffffffffffda RBX: 0000000000020000 RCX: 00007f5d3a679222 [ 276.958857] RDX: 0000000000020000 RSI: 00007f5d3a4fe000 RDI: 0000000000000003 [ 276.959281] RBP: 00007f5d3a4fe000 R08: 00000000ffffffff R09: 0000000000000000 [ 276.959682] R10: 0000000000000022 R11: 0000000000000246 R12: 0000000000020000 [ 276.960126] R13: 0000000000000003 R14: 0000000000000000 R15: 0000557a26dada58 [ 276.960536] </TASK> [ 276.961357] Allocated by task 2209: [ 276.961756] kasan_save_stack+0x1e/0x40 [ 276.962170] kasan_set_track+0x21/0x30 [ 276.962557] 
__kasan_kmalloc+0x7e/0x90 [ 276.962923] __kmalloc+0x5b/0x140 [ 276.963308] iscsi_alloc_session+0x28/0x840 [scsi_transport_iscsi] [ 276.963712] iscsi_session_setup+0xda/0xba0 [libiscsi] [ 276.964078] iscsi_sw_tcp_session_create+0x1fd/0x330 [iscsi_tcp] [ 276.964431] iscsi_if_create_session.isra.0+0x50/0x260 [scsi_transport_iscsi] [ 276.964793] iscsi_if_recv_msg+0xc5a/0x2660 [scsi_transport_iscsi] [ 276.965153] iscsi_if_rx+0x198/0x4b0 [scsi_transport_iscsi] [ 276.965546] netlink_unicast+0x4d5/0x7b0 [ 276.965905] netlink_sendmsg+0x78d/0xc30 [ 276.966236] sock_sendmsg+0xe5/0x120 [ 276.966576] ____sys_sendmsg+0x5fe/0x860 [ 276.966923] ___sys_sendmsg+0xe0/0x170 [ 276.967300] __sys_sendmsg+0xc8/0x170 [ 276.967666] do_syscall_64+0x38/0x90 [ 276.968028] entry_SYSCALL_64_after_hwframe+0x63/0xcd [ 276.968773] Freed by task 2209: [ 276.969111] kasan_save_stack+0x1e/0x40 [ 276.969449] kasan_set_track+0x21/0x30 [ 276.969789] kasan_save_free_info+0x2a/0x50 [ 276.970146] __kasan_slab_free+0x106/0x190 [ 276.970470] __kmem_cache_free+0x133/0x270 [ 276.970816] device_release+0x98/0x210 [ 276.971145] kobject_cleanup+0x101/0x360 [ 276.971462] iscsi_session_teardown+0x3fb/0x530 [libiscsi] [ 276.971775] iscsi_sw_tcp_session_destroy+0xd8/0x130 [iscsi_tcp] [ 276.972143] iscsi_if_recv_msg+0x1bf1/0x2660 [scsi_transport_iscsi] [ 276.972485] iscsi_if_rx+0x198/0x4b0 [scsi_transport_iscsi] [ 276.972808] netlink_unicast+0x4d5/0x7b0 [ 276.973201] netlink_sendmsg+0x78d/0xc30 [ 276.973544] sock_sendmsg+0xe5/0x120 [ 276.973864] ____sys_sendmsg+0x5fe/0x860 [ 276.974248] ___sys_sendmsg+0xe0/0x170 [ 276.974583] __sys_sendmsg+0xc8/0x170 [ 276.974891] do_syscall_64+0x38/0x90 [ 276.975216] entry_SYSCALL_64_after_hwframe+0x63/0xcd
We can easily reproduce this with two concurrent tasks:
 1. while :; do iscsiadm -m node --login; iscsiadm -m node --logout; done
 2. while :; do cat /sys/devices/platform/host*/iscsi_host/host*/ipaddress; done
            iscsid              |        cat
--------------------------------+---------------------------------------
|- iscsi_sw_tcp_session_destroy |
  |- iscsi_session_teardown     |
    |- device_release           |
      |- iscsi_session_release  ||- dev_attr_show
        |- kfree                |  |- show_host_param_
                                |             ISCSI_HOST_PARAM_IPADDRESS
                                |    |- iscsi_sw_tcp_host_get_param
                                |      |- r/w tcp_sw_host->session (UAF)
  |- iscsi_host_remove          |
  |- iscsi_host_free            |
Fix the above bug by splitting the session removal into 2 parts:
1. removal from iSCSI class which includes sysfs and removal from host tracking.
2. freeing of session.
During iscsi_tcp host and session removal we can remove the session from sysfs then remove the host from sysfs. At this point we know userspace is not accessing the kernel via sysfs so we can free the session and host.
Link: https://lore.kernel.org/r/20230117193937.21244-2-michael.christie@oracle.com Signed-off-by: Mike Christie michael.christie@oracle.com Reviewed-by: Lee Duncan lduncan@suse.com Acked-by: Ding Hui dinghui@sangfor.com.cn Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Zhong Jinghua zhongjinghua@huawei.com conflicts: drivers/scsi/iscsi_tcp.c Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/scsi/iscsi_tcp.c | 11 +++++++++-- drivers/scsi/libiscsi.c | 38 +++++++++++++++++++++++++++++++------- include/scsi/libiscsi.h | 2 ++ 3 files changed, 42 insertions(+), 9 deletions(-)
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index a226a040647a..8c04ceee52d4 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -913,10 +913,17 @@ static void iscsi_sw_tcp_session_destroy(struct iscsi_cls_session *cls_session) if (WARN_ON_ONCE(session->leadconn)) return;
+ iscsi_session_remove(cls_session); + /* + * Our get_host_param needs to access the session, so remove the + * host from sysfs before freeing the session to make sure userspace + * is no longer accessing the callout. + */ + iscsi_host_remove(shost); + iscsi_tcp_r2tpool_free(cls_session->dd_data); - iscsi_session_teardown(cls_session);
- iscsi_host_remove(shost); + iscsi_session_free(cls_session); iscsi_host_free(shost); }
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 6a622c70514a..39d5067f804b 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -2983,17 +2983,32 @@ iscsi_session_setup(struct iscsi_transport *iscsit, struct Scsi_Host *shost, } EXPORT_SYMBOL_GPL(iscsi_session_setup);
-/** - * iscsi_session_teardown - destroy session, host, and cls_session - * @cls_session: iscsi session +/* + * issi_session_remove - Remove session from iSCSI class. */ -void iscsi_session_teardown(struct iscsi_cls_session *cls_session) +void iscsi_session_remove(struct iscsi_cls_session *cls_session) { struct iscsi_session *session = cls_session->dd_data; - struct module *owner = cls_session->transport->owner; struct Scsi_Host *shost = session->host;
iscsi_remove_session(cls_session); + /* + * host removal only has to wait for its children to be removed from + * sysfs, and iscsi_tcp needs to do iscsi_host_remove before freeing + * the session, so drop the session count here. + */ + iscsi_host_dec_session_cnt(shost); +} +EXPORT_SYMBOL_GPL(iscsi_session_remove); + +/** + * iscsi_session_free - Free iscsi session and it's resources + * @cls_session: iscsi session + */ +void iscsi_session_free(struct iscsi_cls_session *cls_session) +{ + struct iscsi_session *session = cls_session->dd_data; + struct module *owner = cls_session->transport->owner;
iscsi_pool_free(&session->cmdpool); kfree(session->password); @@ -3011,10 +3026,19 @@ void iscsi_session_teardown(struct iscsi_cls_session *cls_session) kfree(session->discovery_parent_type);
iscsi_free_session(cls_session); - - iscsi_host_dec_session_cnt(shost); module_put(owner); } +EXPORT_SYMBOL_GPL(iscsi_session_free); + +/** + * iscsi_session_teardown - destroy session and cls_session + * @cls_session: iscsi session + */ +void iscsi_session_teardown(struct iscsi_cls_session *cls_session) +{ + iscsi_session_remove(cls_session); + iscsi_session_free(cls_session); +} EXPORT_SYMBOL_GPL(iscsi_session_teardown);
/** diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 881e4762d626..fb7c5f82dcf3 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -404,6 +404,8 @@ extern int iscsi_host_get_max_scsi_cmds(struct Scsi_Host *shost, extern struct iscsi_cls_session * iscsi_session_setup(struct iscsi_transport *, struct Scsi_Host *shost, uint16_t, int, int, uint32_t, unsigned int); +void iscsi_session_remove(struct iscsi_cls_session *cls_session); +void iscsi_session_free(struct iscsi_cls_session *cls_session); extern void iscsi_session_teardown(struct iscsi_cls_session *); extern void iscsi_session_recovery_timedout(struct iscsi_cls_session *); extern int iscsi_set_param(struct iscsi_cls_conn *cls_conn,
From: Mike Christie michael.christie@oracle.com
mainline inclusion from mainline-v6.2-rc6 commit f484a794e4ee2a9ce61f52a78e810ac45f3fe3b3 category: bugfix bugzilla: 188443, https://gitee.com/openeuler/kernel/issues/I6I8YD CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
----------------------------------------
If during iscsi_sw_tcp_session_create() iscsi_tcp_r2tpool_alloc() fails, userspace could be accessing the host's ipaddress attr. If we then free the session via iscsi_session_teardown() while userspace is still accessing the session we will hit a use after free bug.
Set the tcp_sw_host->session after we have completed session creation and can no longer fail.
Link: https://lore.kernel.org/r/20230117193937.21244-3-michael.christie@oracle.com Signed-off-by: Mike Christie michael.christie@oracle.com Reviewed-by: Lee Duncan lduncan@suse.com Acked-by: Ding Hui dinghui@sangfor.com.cn Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Zhong Jinghua zhongjinghua@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/scsi/iscsi_tcp.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c index 8c04ceee52d4..4575d0f0dd6c 100644 --- a/drivers/scsi/iscsi_tcp.c +++ b/drivers/scsi/iscsi_tcp.c @@ -783,7 +783,7 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost, enum iscsi_host_param param, char *buf) { struct iscsi_sw_tcp_host *tcp_sw_host = iscsi_host_priv(shost); - struct iscsi_session *session = tcp_sw_host->session; + struct iscsi_session *session; struct iscsi_conn *conn; struct iscsi_tcp_conn *tcp_conn; struct iscsi_sw_tcp_conn *tcp_sw_conn; @@ -793,6 +793,7 @@ static int iscsi_sw_tcp_host_get_param(struct Scsi_Host *shost,
switch (param) { case ISCSI_HOST_PARAM_IPADDRESS: + session = tcp_sw_host->session; if (!session) return -ENOTCONN;
@@ -889,11 +890,13 @@ iscsi_sw_tcp_session_create(struct iscsi_endpoint *ep, uint16_t cmds_max, if (!cls_session) goto remove_host; session = cls_session->dd_data; - tcp_sw_host = iscsi_host_priv(shost); - tcp_sw_host->session = session;
if (iscsi_tcp_r2tpool_alloc(session)) goto remove_session; + + /* We are now fully setup so expose the session to sysfs. */ + tcp_sw_host = iscsi_host_priv(shost); + tcp_sw_host->session = session; return cls_session;
remove_session:
From: Ye Bin yebin10@huawei.com
maillist inclusion category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6K53I
Reference: https://patchwork.ozlabs.org/project/linux-ext4/patch/20230116020015.1506120...
--------------------------------
Syzbot found the following issue:
EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, O_DIRECT and fast_commit support! EXT4-fs (loop0): orphan cleanup on readonly fs ------------[ cut here ]------------ WARNING: CPU: 1 PID: 5067 at fs/ext4/mballoc.c:1869 mb_find_extent+0x8a1/0xe30 Modules linked in: CPU: 1 PID: 5067 Comm: syz-executor307 Not tainted 6.2.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 RIP: 0010:mb_find_extent+0x8a1/0xe30 fs/ext4/mballoc.c:1869 RSP: 0018:ffffc90003c9e098 EFLAGS: 00010293 RAX: ffffffff82405731 RBX: 0000000000000041 RCX: ffff8880783457c0 RDX: 0000000000000000 RSI: 0000000000000041 RDI: 0000000000000040 RBP: 0000000000000040 R08: ffffffff82405723 R09: ffffed10053c9402 R10: ffffed10053c9402 R11: 1ffff110053c9401 R12: 0000000000000000 R13: ffffc90003c9e538 R14: dffffc0000000000 R15: ffffc90003c9e2cc FS: 0000555556665300(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000056312f6796f8 CR3: 0000000022437000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: <TASK> ext4_mb_complex_scan_group+0x353/0x1100 fs/ext4/mballoc.c:2307 ext4_mb_regular_allocator+0x1533/0x3860 fs/ext4/mballoc.c:2735 ext4_mb_new_blocks+0xddf/0x3db0 fs/ext4/mballoc.c:5605 ext4_ext_map_blocks+0x1868/0x6880 fs/ext4/extents.c:4286 ext4_map_blocks+0xa49/0x1cc0 fs/ext4/inode.c:651 ext4_getblk+0x1b9/0x770 fs/ext4/inode.c:864 ext4_bread+0x2a/0x170 fs/ext4/inode.c:920 ext4_quota_write+0x225/0x570 fs/ext4/super.c:7105 write_blk fs/quota/quota_tree.c:64 [inline] get_free_dqblk+0x34a/0x6d0 fs/quota/quota_tree.c:130 do_insert_tree+0x26b/0x1aa0 fs/quota/quota_tree.c:340 do_insert_tree+0x722/0x1aa0 fs/quota/quota_tree.c:375 do_insert_tree+0x722/0x1aa0 fs/quota/quota_tree.c:375 do_insert_tree+0x722/0x1aa0 fs/quota/quota_tree.c:375 dq_insert_tree 
fs/quota/quota_tree.c:401 [inline] qtree_write_dquot+0x3b6/0x530 fs/quota/quota_tree.c:420 v2_write_dquot+0x11b/0x190 fs/quota/quota_v2.c:358 dquot_acquire+0x348/0x670 fs/quota/dquot.c:444 ext4_acquire_dquot+0x2dc/0x400 fs/ext4/super.c:6740 dqget+0x999/0xdc0 fs/quota/dquot.c:914 __dquot_initialize+0x3d0/0xcf0 fs/quota/dquot.c:1492 ext4_process_orphan+0x57/0x2d0 fs/ext4/orphan.c:329 ext4_orphan_cleanup+0xb60/0x1340 fs/ext4/orphan.c:474 __ext4_fill_super fs/ext4/super.c:5516 [inline] ext4_fill_super+0x81cd/0x8700 fs/ext4/super.c:5644 get_tree_bdev+0x400/0x620 fs/super.c:1282 vfs_get_tree+0x88/0x270 fs/super.c:1489 do_new_mount+0x289/0xad0 fs/namespace.c:3145 do_mount fs/namespace.c:3488 [inline] __do_sys_mount fs/namespace.c:3697 [inline] __se_sys_mount+0x2d3/0x3c0 fs/namespace.c:3674 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd
Add some debug information: mb_find_extent: mb_find_extent block=41, order=0 needed=64 next=0 ex=0/41/1@3735929054 64 64 7 block_bitmap: ff 3f 0c 00 fc 01 00 00 d2 3d 00 00 00 00 00 00 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
Actually, blocks per group is 64, but the block bitmap indicates that the group has at least 128 blocks. Currently, ext4_validate_block_bitmap() does not check whether the bitmap bits for invalid blocks (the padding past the end of the group) are set. To resolve the above issue, add a check like fsck's "Padding at end of block bitmap is not set".
Reported-by: syzbot+68223fe9f6c95ad43bed@syzkaller.appspotmail.com Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Yang Erkun yangerkun@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- fs/ext4/balloc.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1afd60fcd772..50a0e90e8af9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -303,6 +303,22 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, return desc; }
+static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb, + ext4_group_t block_group, + struct buffer_head *bh) +{ + ext4_grpblk_t next_zero_bit; + unsigned long bitmap_size = sb->s_blocksize * 8; + unsigned int offset = num_clusters_in_group(sb, block_group); + + if (bitmap_size <= offset) + return 0; + + next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset); + + return (next_zero_bit < bitmap_size ? next_zero_bit : 0); +} + /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. @@ -401,6 +417,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } + blk = ext4_valid_block_bitmap_padding(sb, block_group, bh); + if (unlikely(blk != 0)) { + ext4_unlock_group(sb, block_group); + ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set", + block_group, blk); + ext4_mark_group_bitmap_corrupted(sb, block_group, + EXT4_GROUP_INFO_BBITMAP_CORRUPT); + return -EFSCORRUPTED; + } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group);
From: Marcelo Ricardo Leitner marcelo.leitner@gmail.com
stable inclusion from stable-v5.10.165 commit 6ef652f35dcfaa1ab2b2cf6c1694718595148eee category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6I7U3 CVE: CVE-2023-1074
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
[ Upstream commit 458e279f861d3f61796894cd158b780765a1569f ]
Currently, if you bind the socket to something like: servaddr.sin6_family = AF_INET6; servaddr.sin6_port = htons(0); servaddr.sin6_scope_id = 0; inet_pton(AF_INET6, "::1", &servaddr.sin6_addr);
And then request a connect to: connaddr.sin6_family = AF_INET6; connaddr.sin6_port = htons(20000); connaddr.sin6_scope_id = if_nametoindex("lo"); inet_pton(AF_INET6, "fe88::1", &connaddr.sin6_addr);
What the stack does is: - bind the socket - create a new asoc - to handle the connect - copy the addresses that can be used for the given scope - try to connect
But the copy returns 0 addresses, and the effect is that it ends up trying to connect as if the socket wasn't bound, which is not the desired behavior. This unexpected behavior also allows KASLR leaks through SCTP diag interface.
The fix here, then, is to bail out if copying the addresses that can be used for the scope used in connect() returns 0 addresses. This is what TCP does with a similar reproducer.
Reported-by: Pietro Borrello borrello@diag.uniroma1.it Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Reviewed-by: Xin Long lucien.xin@gmail.com Link: https://lore.kernel.org/r/9fcd182f1099f86c6661f3717f63712ddd1c676c.167449673... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Dong Chenchen dongchenchen2@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- net/sctp/bind_addr.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c index 59e653b528b1..6b95d3ba8fe1 100644 --- a/net/sctp/bind_addr.c +++ b/net/sctp/bind_addr.c @@ -73,6 +73,12 @@ int sctp_bind_addr_copy(struct net *net, struct sctp_bind_addr *dest, } }
+ /* If somehow no addresses were found that can be used with this + * scope, it's an error. + */ + if (list_empty(&dest->address_list)) + error = -ENETUNREACH; + out: if (error) sctp_bind_addr_clean(dest);
From: Zhong Jinghua zhongjinghua@huawei.com
hulk inclusion category: bugfix bugzilla: 188355, https://gitee.com/openeuler/kernel/issues/I6E4JF
----------------------------------------
A use-after-free problem like below:
BUG: KASAN: use-after-free in scsi_target_reap+0x6c/0x70
Workqueue: scsi_wq_1 __iscsi_unbind_session [scsi_transport_iscsi] Call trace: dump_backtrace+0x0/0x320 show_stack+0x24/0x30 dump_stack+0xdc/0x128 print_address_description+0x68/0x278 kasan_report+0x1e4/0x308 __asan_report_load4_noabort+0x30/0x40 scsi_target_reap+0x6c/0x70 scsi_remove_target+0x430/0x640 __iscsi_unbind_session+0x164/0x268 [scsi_transport_iscsi] process_one_work+0x67c/0x1350 worker_thread+0x370/0xf90 kthread+0x2a4/0x320 ret_from_fork+0x10/0x18
The problem is caused by a concurrency scenario:
T0: delete target // echo 1 > /sys/devices/platform/host1/session1/target1:0:0/1:0:0:1/delete T1: logout // iscsiadm -m node --logout
T0 T1 sdev_store_delete scsi_remove_device device_remove_file __scsi_remove_device __iscsi_unbind_session scsi_remove_target spin_lock_irqsave list_for_each_entry scsi_target_reap // starget->reap_ref 1 -> 0 kref_get(&starget->reap_ref); // warn use-after-free. spin_unlock_irqrestore scsi_target_reap_ref_release scsi_target_destroy ... // delete starget scsi_target_reap // UAF
When T0 has reduced the reference count to 0 but starget has not yet been released, T1 can still enter list_for_each_entry, and then kref_get reports UAF.
Fix it by using kref_get_unless_zero() to check for a reference count of 0.
Signed-off-by: Zhong Jinghua zhongjinghua@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/scsi/scsi_sysfs.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 42db9c52208e..dc448def9e80 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1556,7 +1556,16 @@ void scsi_remove_target(struct device *dev) starget->state == STARGET_CREATED_REMOVE) continue; if (starget->dev.parent == dev || &starget->dev == dev) { - kref_get(&starget->reap_ref); + /* + * If the reference count is already zero, skip + * this target. Calling kref_get_unless_zero() if + * the reference count is zero is safe because + * scsi_target_destroy() will wait until the host + * lock has been released before freeing starget. + */ + if (!kref_get_unless_zero(&starget->reap_ref)) + continue; + if (starget->state == STARGET_CREATED) starget->state = STARGET_CREATED_REMOVE; else
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L4UU CVE: NA
--------------------------------
In the error path of raid10_run(), 'conf' needs to be freed; however, 'conf->bio_split' is missed and its memory will be leaked.
Since there are 3 places to free 'conf', factor out a helper to fix the problem.
Fixes: fc9977dd069e ("md/raid10: simplify the splitting of requests.") Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/md/raid10.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index beaada3e87ba..4b65efd413f9 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3624,6 +3624,20 @@ static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) return nc*fc; }
+static void raid10_free_conf(struct r10conf *conf) +{ + if (!conf) + return; + + mempool_exit(&conf->r10bio_pool); + kfree(conf->mirrors); + kfree(conf->mirrors_old); + kfree(conf->mirrors_new); + safe_put_page(conf->tmppage); + bioset_exit(&conf->bio_split); + kfree(conf); +} + static struct r10conf *setup_conf(struct mddev *mddev) { struct r10conf *conf = NULL; @@ -3706,13 +3720,7 @@ static struct r10conf *setup_conf(struct mddev *mddev) return conf;
out: - if (conf) { - mempool_exit(&conf->r10bio_pool); - kfree(conf->mirrors); - safe_put_page(conf->tmppage); - bioset_exit(&conf->bio_split); - kfree(conf); - } + raid10_free_conf(conf); return ERR_PTR(err); }
@@ -3918,10 +3926,7 @@ static int raid10_run(struct mddev *mddev)
out_free_conf: md_unregister_thread(&mddev->thread); - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf); + raid10_free_conf(conf); mddev->private = NULL; out: return -EIO; @@ -3929,15 +3934,7 @@ static int raid10_run(struct mddev *mddev)
static void raid10_free(struct mddev *mddev, void *priv) { - struct r10conf *conf = priv; - - mempool_exit(&conf->r10bio_pool); - safe_put_page(conf->tmppage); - kfree(conf->mirrors); - kfree(conf->mirrors_old); - kfree(conf->mirrors_new); - bioset_exit(&conf->bio_split); - kfree(conf); + raid10_free_conf(priv); }
static void raid10_quiesce(struct mddev *mddev, int quiesce)
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L586 CVE: NA
--------------------------------
Currently, for bio-based devices, 'ios' and 'sectors' are counted when io is started, while 'nsecs' is counted when io is done.
This behaviour is obviously wrong; however, we can't fix the existing kapis because that would require a new parameter, which would break kapi. Hence this patch adds some new apis, which make sure io accounting for bio-based devices is precise.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- block/blk-core.c | 59 +++++++++++++++++++++++++++++++++++------- include/linux/blkdev.h | 21 +++++++++++++++ 2 files changed, 71 insertions(+), 9 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 71c5cf508127..f0e28624ef9c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1374,15 +1374,18 @@ void blk_account_io_start(struct request *rq) }
static unsigned long __part_start_io_acct(struct hd_struct *part, - unsigned int sectors, unsigned int op) + unsigned int sectors, unsigned int op, + bool precise) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies);
part_stat_lock(); update_io_ticks(part, now, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); + if (!precise) { + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + } part_stat_local_inc(part, in_flight[op_is_write(op)]); part_stat_unlock();
@@ -1394,19 +1397,21 @@ unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
- return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio), + false); } EXPORT_SYMBOL_GPL(part_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { - return __part_start_io_acct(&disk->part0, sectors, op); + return __part_start_io_acct(&disk->part0, sectors, op, false); } EXPORT_SYMBOL(disk_start_io_acct);
-static void __part_end_io_acct(struct hd_struct *part, unsigned int op, - unsigned long start_time) +static void __part_end_io_acct(struct hd_struct *part, unsigned int sectors, + unsigned int op, unsigned long start_time, + bool precise) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -1414,6 +1419,10 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
part_stat_lock(); update_io_ticks(part, now, true); + if (precise) { + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + } part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -1422,7 +1431,7 @@ static void __part_end_io_acct(struct hd_struct *part, unsigned int op, void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { - __part_end_io_acct(part, bio_op(bio), start_time); + __part_end_io_acct(part, 0, bio_op(bio), start_time, false); hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); @@ -1430,10 +1439,42 @@ EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { - __part_end_io_acct(&disk->part0, op, start_time); + __part_end_io_acct(&disk->part0, 0, op, start_time, false); } EXPORT_SYMBOL(disk_end_io_acct);
+unsigned long part_start_precise_io_acct(struct gendisk *disk, + struct hd_struct **part, + struct bio *bio) +{ + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); + + return __part_start_io_acct(*part, 0, bio_op(bio), true); +} +EXPORT_SYMBOL_GPL(part_start_precise_io_acct); + +unsigned long disk_start_precise_io_acct(struct gendisk *disk, unsigned int op) +{ + return __part_start_io_acct(&disk->part0, 0, op, true); +} +EXPORT_SYMBOL(disk_start_precise_io_acct); + +void part_end_precise_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time) +{ + __part_end_io_acct(part, bio_sectors(bio), bio_op(bio), start_time, + true); + hd_struct_put(part); +} +EXPORT_SYMBOL_GPL(part_end_precise_io_acct); + +void disk_end_precise_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op, unsigned long start_time) +{ + __part_end_io_acct(&disk->part0, sectors, op, start_time, true); +} +EXPORT_SYMBOL(disk_end_precise_io_acct); + /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 171884608cad..b04613bc3ed5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -2027,6 +2027,27 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); }
+unsigned long disk_start_precise_io_acct(struct gendisk *disk, unsigned int op); +void disk_end_precise_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op, unsigned long start_time); +unsigned long part_start_precise_io_acct(struct gendisk *disk, + struct hd_struct **part, + struct bio *bio); +void part_end_precise_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time); + +static inline unsigned long bio_start_precise_io_acct(struct bio *bio) +{ + return disk_start_precise_io_acct(bio->bi_disk, bio_op(bio)); +} + +static inline void bio_end_precise_io_acct(struct bio *bio, + unsigned long start_time) +{ + return disk_end_precise_io_acct(bio->bi_disk, bio_sectors(bio), + bio_op(bio), start_time); +} + int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size);
From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6L586 CVE: NA
--------------------------------
'ios' and 'sectors' are counted in bio_start_io_acct() when io is started instead of when io is done. Hence switch to precise io accounting to count them when io is done.
Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7c01f8487427..8780c95f9b86 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -301,7 +301,7 @@ static void call_bio_endio(struct r1bio *r1_bio) bio->bi_status = BLK_STS_IOERR;
if (blk_queue_io_stat(bio->bi_disk->queue)) - bio_end_io_acct(bio, r1_bio->start_time); + bio_end_precise_io_acct(bio, r1_bio->start_time); bio_endio(bio); }
@@ -1295,7 +1295,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, r1_bio->read_disk = rdisk;
if (!r1bio_existed && blk_queue_io_stat(bio->bi_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + r1_bio->start_time = bio_start_precise_io_acct(bio);
read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
@@ -1487,7 +1487,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, }
if (blk_queue_io_stat(bio->bi_disk->queue)) - r1_bio->start_time = bio_start_io_acct(bio); + r1_bio->start_time = bio_start_precise_io_acct(bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0);
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 4b65efd413f9..04869394e345 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -298,7 +298,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio) bio->bi_status = BLK_STS_IOERR;
if (blk_queue_io_stat(bio->bi_disk->queue)) - bio_end_io_acct(bio, r10_bio->start_time); + bio_end_precise_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1188,7 +1188,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, slot = r10_bio->read_slot;
if (!handle_error && blk_queue_io_stat(bio->bi_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + r10_bio->start_time = bio_start_precise_io_acct(bio); read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio; @@ -1473,7 +1473,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, }
if (blk_queue_io_stat(bio->bi_disk->queue)) - r10_bio->start_time = bio_start_io_acct(bio); + r10_bio->start_time = bio_start_precise_io_acct(bio); atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
From: "Steven Rostedt (Google)" rostedt@goodmis.org
stable inclusion from stable-v5.10.150 commit 0cf6c09dafeeb6f3d92cc19ea9e024640448c42e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6LCWQ CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit a0fcaaed0c46cf9399d3a2d6e0c87ddb3df0e044 upstream.
The ring buffer is broken up into sub buffers (currently of page size). Each sub buffer has a pointer to its "tail" (the last event written to the sub buffer). When a new event is requested, the tail is locally incremented to cover the size of the new event. This is done in a way that there is no need for locking.
If the tail goes past the end of the sub buffer, the process of moving to the next sub buffer takes place. After setting the current sub buffer to the next one, the previous one that had the tail go past the end of the sub buffer needs to be reset back to the original tail location (before the new event was requested) and the rest of the sub buffer needs to be "padded".
The race happens when a reader takes control of the sub buffer. As readers do a "swap" of sub buffers from the ring buffer to get exclusive access to the sub buffer, it replaces the "head" sub buffer with an empty sub buffer that goes back into the writable portion of the ring buffer. This swap can happen as soon as the writer moves to the next sub buffer and before it updates the last sub buffer with padding.
Because the sub buffer can be released to the reader while the writer is still updating the padding, it is possible for the reader to see the event that goes past the end of the sub buffer. This can cause obvious issues.
To fix this, add a few memory barriers so that the reader definitely sees the updates to the sub buffer, and also waits until the writer has put back the "tail" of the sub buffer back to the last event that was written on it.
To be paranoid, it will only spin for 1 second, otherwise it will warn and shut down the ring buffer code. 1 second should be enough as the writer does have preemption disabled. If the writer doesn't move within 1 second (with preemption disabled) something is horribly wrong. No interrupt should last 1 second!
Link: https://lore.kernel.org/all/20220830120854.7545-1-jiazi.li@transsion.com/ Link: https://bugzilla.kernel.org/show_bug.cgi?id=216369 Link: https://lkml.kernel.org/r/20220929104909.0650a36c@gandalf.local.home
Cc: Ingo Molnar mingo@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: stable@vger.kernel.org Fixes: c7b0930857e22 ("ring-buffer: prevent adding write in discarded area") Reported-by: Jiazi.Li jiazi.li@transsion.com Signed-off-by: Steven Rostedt (Google) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Jiahao chenjiahao16@huawei.com Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- kernel/trace/ring_buffer.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 4a9b16dfb5cd..c64a654e213e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2469,6 +2469,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* Mark the rest of the page with padding */ rb_event_set_padding(event);
+ /* Make sure the padding is visible before the write update */ + smp_wmb(); + /* Set the write back to the previous setting */ local_sub(length, &tail_page->write); return; @@ -2480,6 +2483,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, /* time delta must be non zero */ event->time_delta = 1;
+ /* Make sure the padding is visible before the tail_page->write update */ + smp_wmb(); + /* Set write to end of buffer */ length = (tail + length) - BUF_PAGE_SIZE; local_sub(length, &tail_page->write); @@ -4294,6 +4300,33 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags);
+ /* + * The writer has preempt disable, wait for it. But not forever + * Although, 1 second is pretty much "forever" + */ +#define USECS_WAIT 1000000 + for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) { + /* If the write is past the end of page, a writer is still updating it */ + if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE)) + break; + + udelay(1); + + /* Get the latest version of the reader write value */ + smp_rmb(); + } + + /* The writer is not moving forward? Something is wrong */ + if (RB_WARN_ON(cpu_buffer, nr_loops == USECS_WAIT)) + reader = NULL; + + /* + * Make sure we see any padding after the write update + * (see rb_reset_tail()) + */ + smp_rmb(); + + return reader; }
From: Hou Tao houtao1@huawei.com
hulk inclusion category: bugfix bugzilla: 188150, https://gitee.com/openeuler/kernel/issues/I643OL
----------------------------------------
Cancel the inflight async device probe when removing scsi_target, so no new disk will be added when __scsi_target_remove() returns.
Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Zhong Jinghua zhongjinghua@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com --- drivers/scsi/scsi_sysfs.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+)
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index dc448def9e80..8a24d144395d 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -1503,6 +1503,40 @@ void scsi_remove_device(struct scsi_device *sdev) } EXPORT_SYMBOL(scsi_remove_device);
+/* Cancel the inflight async probe for scsi_device */ +static void __scsi_kill_devices(struct scsi_target *starget) +{ + struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); + struct scsi_device *sdev, *to_put = NULL; + unsigned long flags; + + spin_lock_irqsave(shost->host_lock, flags); + list_for_each_entry(sdev, &shost->__devices, siblings) { + if (sdev->channel != starget->channel || + sdev->id != starget->id) + continue; + + if ((sdev->sdev_state != SDEV_DEL && + sdev->sdev_state != SDEV_CANCEL) || !sdev->is_visible) + continue; + if (!kobject_get_unless_zero(&sdev->sdev_gendev.kobj)) + continue; + spin_unlock_irqrestore(shost->host_lock, flags); + + if (to_put) + put_device(&to_put->sdev_gendev); + device_lock(&sdev->sdev_gendev); + kill_device(&sdev->sdev_gendev); + device_unlock(&sdev->sdev_gendev); + to_put = sdev; + + spin_lock_irqsave(shost->host_lock, flags); + } + spin_unlock_irqrestore(shost->host_lock, flags); + if (to_put) + put_device(&to_put->sdev_gendev); +} + static void __scsi_remove_target(struct scsi_target *starget) { struct Scsi_Host *shost = dev_to_shost(starget->dev.parent); @@ -1532,6 +1566,8 @@ static void __scsi_remove_target(struct scsi_target *starget) goto restart; } spin_unlock_irqrestore(shost->host_lock, flags); + + __scsi_kill_devices(starget); }
/**