April 2023 - Kernel - mailweb.openeuler.org

[PATCH openEuler-22.03-LTS] net/smc: align the connect behaviour with TCP
by Litao Jiao 07 Apr '23

07 Apr '23

mainline inclusion from mainline-v5.19.rc1 commit 3aba103006bcc4a7472b7c9506b3bc065ffb7992 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6TK1U CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?… -------------------------------- Connect with O_NONBLOCK will not be completed immediately and returns -EINPROGRESS. It is possible to use selector/poll for completion by selecting the socket for writing. After select indicates writability, a second connect function call will return 0 to indicate connected successfully as TCP does, but smc returns -EISCONN. Use socket state for smc to indicate connect state, which can help smc aligning the connect behaviour with TCP. Signed-off-by: Guangguan Wang <guangguan.wang(a)linux.alibaba.com> Acked-by: Karsten Graul <kgraul(a)linux.ibm.com> Signed-off-by: David S. Miller <davem(a)davemloft.net> Signed-off-by: Litao Jiao <jiaolitao(a)sangfor.com.cn> --- net/smc/af_smc.c | 51 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 5d7710dd9514..8f73da1ee7b4 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1097,9 +1097,29 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out_err; lock_sock(sk); + switch (sock->state) { + default: + rc = -EINVAL; + goto out; + case SS_CONNECTED: + rc = sk->sk_state == SMC_ACTIVE ? -EISCONN : -EINVAL; + goto out; + case SS_CONNECTING: + if (sk->sk_state == SMC_ACTIVE) + goto connected; + break; + case SS_UNCONNECTED: + sock->state = SS_CONNECTING; + break; + } + switch (sk->sk_state) { default: goto out; + case SMC_CLOSED: + rc = sock_error(sk) ? 
: -ECONNABORTED; + sock->state = SS_UNCONNECTED; + goto out; case SMC_ACTIVE: rc = -EISCONN; goto out; @@ -1118,21 +1138,26 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, if (rc && rc != -EINPROGRESS) goto out; - if (smc->use_fallback) + if (smc->use_fallback) { + sock->state = rc ? SS_CONNECTING : SS_CONNECTED; goto out; + } sock_hold(&smc->sk); /* sock put in passive closing */ + if (flags & O_NONBLOCK) { if (queue_work(smc_hs_wq, &smc->connect_work)) smc->connect_nonblock = 1; rc = -EINPROGRESS; + goto out; } else { rc = __smc_connect(smc); if (rc < 0) goto out; - else - rc = 0; /* success cases including fallback */ } +connected: + rc = 0; + sock->state = SS_CONNECTED; out: release_sock(sk); out_err: @@ -1234,6 +1259,7 @@ struct sock *smc_accept_dequeue(struct sock *parent, } if (new_sock) { sock_graft(new_sk, new_sock); + new_sock->state = SS_CONNECTED; if (isk->use_fallback) { smc_sk(new_sk)->clcsock->file = new_sock->file; isk->clcsock->file->private_data = isk->clcsock; @@ -1865,7 +1891,7 @@ static int smc_listen(struct socket *sock, int backlog) rc = -EINVAL; if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) || - smc->connect_nonblock) + smc->connect_nonblock || sock->state != SS_UNCONNECTED) goto out; rc = 0; @@ -2135,6 +2161,17 @@ static int smc_shutdown(struct socket *sock, int how) lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if (sk->sk_state == SMC_ACTIVE) + sock->state = SS_CONNECTED; + else if (sk->sk_state == SMC_PEERCLOSEWAIT1 || + sk->sk_state == SMC_PEERCLOSEWAIT2 || + sk->sk_state == SMC_APPCLOSEWAIT1 || + sk->sk_state == SMC_APPCLOSEWAIT2 || + sk->sk_state == SMC_APPFINCLOSEWAIT) + sock->state = SS_DISCONNECTING; + } + rc = -ENOTCONN; if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_PEERCLOSEWAIT1) && @@ -2148,6 +2185,7 @@ static int smc_shutdown(struct socket *sock, int how) sk->sk_shutdown = smc->clcsock->sk->sk_shutdown; if (sk->sk_shutdown == SHUTDOWN_MASK) { sk->sk_state = SMC_CLOSED; + 
sk->sk_socket->state = SS_UNCONNECTED; sock_put(sk); } goto out; @@ -2173,6 +2211,10 @@ static int smc_shutdown(struct socket *sock, int how) /* map sock_shutdown_cmd constants to sk_shutdown value range */ sk->sk_shutdown |= how + 1; + if (sk->sk_state == SMC_CLOSED) + sock->state = SS_UNCONNECTED; + else + sock->state = SS_DISCONNECTING; out: release_sock(sk); return rc ? rc : rc1; @@ -2464,6 +2506,7 @@ static int smc_create(struct net *net, struct socket *sock, int protocol, rc = -ENOBUFS; sock->ops = &smc_sock_ops; + sock->state = SS_UNCONNECTED; sk = smc_sock_alloc(net, sock, protocol); if (!sk) goto out; -- 2.18.0.windows.1

1 0

[PATCH openEuler-1.0-LTS 01/13] mm: mem_reliable: Initialize reliable_nr_page when mm_init()
by Yongqiang Liu 07 Apr '23

07 Apr '23

From: Ma Wupeng <mawupeng1(a)huawei.com> hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6RKHX CVE: NA -------------------------------- After the fork operation, it is erroneous for the child process to have a reliable page size twice that of its parent process. Upon examining the mm_struct structure, it was discovered that reliable_nr_page should be initialized to 0, similar to how RSS is initialized during mm_init(). This particular problem that arises during forking is merely one such example. To resolve this issue, it is recommended to set reliable_nr_page to 0 during the mm_init() operation. Fixes: 094eaabb3fe8 ("proc: Count reliable memory usage of reliable tasks") Signed-off-by: Ma Wupeng <mawupeng1(a)huawei.com> Reviewed-by: Kefeng Wang <wangkefeng.wang(a)huawei.com> Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com> --- include/linux/mem_reliable.h | 8 ++++++++ kernel/fork.c | 1 + 2 files changed, 9 insertions(+) diff --git a/include/linux/mem_reliable.h b/include/linux/mem_reliable.h index 6d57c36fb676..aa3fe77c8a72 100644 --- a/include/linux/mem_reliable.h +++ b/include/linux/mem_reliable.h @@ -123,6 +123,13 @@ static inline bool mem_reliable_shmem_limit_check(void) shmem_reliable_nr_page; } +static inline void reliable_clear_page_counter(struct mm_struct *mm) +{ + if (!mem_reliable_is_enabled()) + return; + + atomic_long_set(&mm->reliable_nr_page, 0); +} #else #define reliable_enabled 0 #define reliable_allow_fb_enabled() false @@ -171,6 +178,7 @@ static inline void reliable_lru_add_batch(int zid, enum lru_list lru, int val) {} static inline bool mem_reliable_counter_initialized(void) { return false; } +static inline void reliable_clear_page_counter(struct mm_struct *mm) {} #endif #endif diff --git a/kernel/fork.c b/kernel/fork.c index b5453a26655e..c256525d4ce5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, 
atomic_long_set(&mm->locked_vm, 0); mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); + reliable_clear_page_counter(mm); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); -- 2.25.1

1 12

[PATCH openEuler-1.0-LTS 1/5] loop: Add parm check in loop_control_ioctl
by Yongqiang Liu 07 Apr '23

07 Apr '23

From: Zhong Jinghua <zhongjinghua(a)huawei.com> hulk inclusion category: bugfix bugzilla: 188586, https://gitee.com/openeuler/kernel/issues/I6TFPJ CVE: NA ---------------------------------------- We found that a kernel panic can easily be triggered in loop_control_ioctl: 1. syscall(__NR_ioctl, r[1], 0x4c80, 0x80000200000ul); Create a loop device 0x80000200000ul. In fact, in the code, it is used as the first_minor number, and the resulting first_minor number is 0. So the created loop device number is 7:0. 2. syscall(__NR_ioctl, r[2], 0x4c80, 0ul); Create a loop device 0x0ul. Since the 7:0 device has been created in step 1, add_disk will fail because the major and first_minor numbers are the same as those of the existing device. 3. syscall(__NR_ioctl, r[5], 0x4c81, 0ul); Delete the device that failed to be created, and the kernel panics. The panic looks like this: BUG: KASAN: null-ptr-deref in device_del+0xb3/0x840 drivers/base/core.c:3107 Call Trace: kill_device drivers/base/core.c:3079 [inline] device_del+0xb3/0x840 drivers/base/core.c:3107 del_gendisk+0x463/0x5f0 block/genhd.c:971 loop_remove drivers/block/loop.c:2190 [inline] loop_control_ioctl drivers/block/loop.c:2289 [inline] The call stacks are as follows: Create loop device: loop_control_ioctl loop_add add_disk device_add_disk bdi_register bdi_register_va device_create device_create_groups_vargs device_add kfree(dev->p); dev->p = NULL; Remove loop device: loop_control_ioctl loop_remove del_gendisk device_del kill_device if (dev->p->dead) // p is null Fix it by adding a check for the parameter. 
Fixes: 770fe30a46a1 ("loop: add management interface for on-demand device allocation") Signed-off-by: Zhong Jinghua <zhongjinghua(a)huawei.com> Reviewed-by: Yu Kuai <yukuai3(a)huawei.com> Reviewed-by: Hou Tao <houtao1(a)huawei.com> Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com> --- drivers/block/loop.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 108a4ff27bcd..826633aa328c 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1972,6 +1972,17 @@ static int loop_add(struct loop_device **l, int i) struct gendisk *disk; int err; + /* + * i << part_shift is actually used as the first_minor. + * So here should avoid i << part_shift overflow. + * And, MKDEV() expect that the max bits of + * first_minor is 20. + */ + if (i > 0 && i > MINORMASK >> part_shift) { + err = -EINVAL; + goto out; + } + err = -ENOMEM; lo = kzalloc(sizeof(*lo), GFP_KERNEL); if (!lo) @@ -1985,7 +1996,8 @@ static int loop_add(struct loop_device **l, int i) if (err == -ENOSPC) err = -EEXIST; } else { - err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL); + err = idr_alloc(&loop_index_idr, lo, 0, + (MINORMASK >> part_shift) + 1, GFP_KERNEL); } if (err < 0) goto out_free_dev; -- 2.25.1

1 4

[PATCH openEuler-1.0-LTS] block/wbt: enable wbt after switching cfq to other schedulers
by Zhang Changzhong 07 Apr '23

07 Apr '23

From: Li Lingfeng <lilingfeng3(a)huawei.com> hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6LH5K CVE: NA -------------------------------- Commit 80061216078b ("block/wbt: fix negative inflight counter when remove scsi device") moved wbt_enable_default() from elv_unregister_queue() to bfq_exit_queue(). As a result, wbt can't be enabled when we switch from cfq to another scheduler. Fixes: 80061216078b ("block/wbt: fix negative inflight counter when remove scsi device") Signed-off-by: Li Lingfeng <lilingfeng3(a)huawei.com> Reviewed-by: Yu Kuai <yukuai3(a)huawei.com> Reviewed-by: Hou Tao <houtao1(a)huawei.com> Signed-off-by: Zhang Changzhong <zhangchangzhong(a)huawei.com> --- block/cfq-iosched.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 88bae55..130854a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -4563,6 +4563,7 @@ static void cfq_exit_queue(struct elevator_queue *e) kfree(cfqd->root_group); #endif kfree(cfqd); + wbt_enable_default(q); } static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) -- 2.9.5

1 0

[PATCH openEuler-1.0-LTS] Fix double fget() in vhost_net_set_backend()
by Yongqiang Liu 06 Apr '23

06 Apr '23

From: Al Viro <viro(a)zeniv.linux.org.uk> stable inclusion from stable-v4.19.245 commit 6ca70982c646cc32e458150ee7f2530a24369b8c category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6T1EY CVE: CVE-2023-1838 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id… -------------------------------- commit fb4554c2232e44d595920f4d5c66cf8f7d13f9bc upstream. The descriptor table is a shared resource; two fget() calls on the same descriptor may return different struct file references. get_tap_ptr_ring() is called after we'd found (and pinned) the socket we'll be using and it tries to find the private tun/tap data structures associated with it. Redoing the lookup by the same file descriptor we'd used to get the socket is racy - we need to use the same struct file. Thanks to Jason for spotting a braino in the original variant of the patch - I'd missed the use of fd == -1 for disabling the backend, and in that case we can end up with sock == NULL and sock != oldsock. Cc: stable(a)kernel.org Acked-by: Michael S. 
Tsirkin <mst(a)redhat.com> Signed-off-by: Jason Wang <jasowang(a)redhat.com> Signed-off-by: Al Viro <viro(a)zeniv.linux.org.uk> Signed-off-by: Greg Kroah-Hartman <gregkh(a)linuxfoundation.org> Signed-off-by: Zhengchao Shao <shaozhengchao(a)huawei.com> Reviewed-by: Yue Haibing <yuehaibing(a)huawei.com> Reviewed-by: Xiu Jianfeng <xiujianfeng(a)huawei.com> Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com> --- drivers/vhost/net.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 1d99f5c443ee..4b9151474a24 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -1211,13 +1211,9 @@ static struct socket *get_raw_socket(int fd) return ERR_PTR(r); } -static struct ptr_ring *get_tap_ptr_ring(int fd) +static struct ptr_ring *get_tap_ptr_ring(struct file *file) { struct ptr_ring *ring; - struct file *file = fget(fd); - - if (!file) - return NULL; ring = tun_get_tx_ring(file); if (!IS_ERR(ring)) goto out; @@ -1226,7 +1222,6 @@ static struct ptr_ring *get_tap_ptr_ring(int fd) goto out; ring = NULL; out: - fput(file); return ring; } @@ -1313,8 +1308,12 @@ static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd) r = vhost_net_enable_vq(n, vq); if (r) goto err_used; - if (index == VHOST_NET_VQ_RX) - nvq->rx_ring = get_tap_ptr_ring(fd); + if (index == VHOST_NET_VQ_RX) { + if (sock) + nvq->rx_ring = get_tap_ptr_ring(sock->file); + else + nvq->rx_ring = NULL; + } oldubufs = nvq->ubufs; nvq->ubufs = ubufs; -- 2.25.1

1 0

[PATCH openEuler-1.0-LTS 1/4] btrfs: fix race between quota disable and quota assign ioctls
by Yongqiang Liu 06 Apr '23

06 Apr '23

From: Filipe Manana <fdmanana(a)suse.com> mainline inclusion from mainline-v6.2-rc8 commit 2f1a6be12ab6c8470d5776e68644726c94257c54 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I6PQCT CVE: CVE-2023-1611 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?… -------------------------------- The quota assign ioctl can currently run in parallel with a quota disable ioctl call. The assign ioctl uses the quota root, while the disable ioctl frees that root, and therefore we can have a use-after-free triggered in the assign ioctl, leading to a trace like the following when KASAN is enabled: [672.723][T736] BUG: KASAN: slab-use-after-free in btrfs_search_slot+0x2962/0x2db0 [672.723][T736] Read of size 8 at addr ffff888022ec0208 by task btrfs_search_sl/27736 [672.724][T736] [672.725][T736] CPU: 1 PID: 27736 Comm: btrfs_search_sl Not tainted 6.3.0-rc3 #37 [672.723][T736] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [672.727][T736] Call Trace: [672.728][T736] <TASK> [672.728][T736] dump_stack_lvl+0xd9/0x150 [672.725][T736] print_report+0xc1/0x5e0 [672.720][T736] ? __virt_addr_valid+0x61/0x2e0 [672.727][T736] ? __phys_addr+0xc9/0x150 [672.725][T736] ? btrfs_search_slot+0x2962/0x2db0 [672.722][T736] kasan_report+0xc0/0xf0 [672.729][T736] ? btrfs_search_slot+0x2962/0x2db0 [672.724][T736] btrfs_search_slot+0x2962/0x2db0 [672.723][T736] ? fs_reclaim_acquire+0xba/0x160 [672.722][T736] ? split_leaf+0x13d0/0x13d0 [672.726][T736] ? rcu_is_watching+0x12/0xb0 [672.723][T736] ? kmem_cache_alloc+0x338/0x3c0 [672.722][T736] update_qgroup_status_item+0xf7/0x320 [672.724][T736] ? add_qgroup_rb+0x3d0/0x3d0 [672.739][T736] ? do_raw_spin_lock+0x12d/0x2b0 [672.730][T736] ? spin_bug+0x1d0/0x1d0 [672.737][T736] btrfs_run_qgroups+0x5de/0x840 [672.730][T736] ? btrfs_qgroup_rescan_worker+0xa70/0xa70 [672.738][T736] ? 
__del_qgroup_relation+0x4ba/0xe00 [672.738][T736] btrfs_ioctl+0x3d58/0x5d80 [672.735][T736] ? tomoyo_path_number_perm+0x16a/0x550 [672.737][T736] ? tomoyo_execute_permission+0x4a0/0x4a0 [672.731][T736] ? btrfs_ioctl_get_supported_features+0x50/0x50 [672.737][T736] ? __sanitizer_cov_trace_switch+0x54/0x90 [672.734][T736] ? do_vfs_ioctl+0x132/0x1660 [672.730][T736] ? vfs_fileattr_set+0xc40/0xc40 [672.730][T736] ? _raw_spin_unlock_irq+0x2e/0x50 [672.732][T736] ? sigprocmask+0xf2/0x340 [672.737][T736] ? __fget_files+0x26a/0x480 [672.732][T736] ? bpf_lsm_file_ioctl+0x9/0x10 [672.738][T736] ? btrfs_ioctl_get_supported_features+0x50/0x50 [672.736][T736] __x64_sys_ioctl+0x198/0x210 [672.736][T736] do_syscall_64+0x39/0xb0 [672.731][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.739][T736] RIP: 0033:0x4556ad [672.742][T736] </TASK> [672.743][T736] [672.748][T736] Allocated by task 27677: [672.743][T736] kasan_save_stack+0x22/0x40 [672.741][T736] kasan_set_track+0x25/0x30 [672.741][T736] __kasan_kmalloc+0xa4/0xb0 [672.749][T736] btrfs_alloc_root+0x48/0x90 [672.746][T736] btrfs_create_tree+0x146/0xa20 [672.744][T736] btrfs_quota_enable+0x461/0x1d20 [672.743][T736] btrfs_ioctl+0x4a1c/0x5d80 [672.747][T736] __x64_sys_ioctl+0x198/0x210 [672.749][T736] do_syscall_64+0x39/0xb0 [672.744][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.756][T736] [672.757][T736] Freed by task 27677: [672.759][T736] kasan_save_stack+0x22/0x40 [672.759][T736] kasan_set_track+0x25/0x30 [672.756][T736] kasan_save_free_info+0x2e/0x50 [672.751][T736] ____kasan_slab_free+0x162/0x1c0 [672.758][T736] slab_free_freelist_hook+0x89/0x1c0 [672.752][T736] __kmem_cache_free+0xaf/0x2e0 [672.752][T736] btrfs_put_root+0x1ff/0x2b0 [672.759][T736] btrfs_quota_disable+0x80a/0xbc0 [672.752][T736] btrfs_ioctl+0x3e5f/0x5d80 [672.756][T736] __x64_sys_ioctl+0x198/0x210 [672.753][T736] do_syscall_64+0x39/0xb0 [672.765][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.769][T736] [672.768][T736] The buggy address 
belongs to the object at ffff888022ec0000 [672.768][T736] which belongs to the cache kmalloc-4k of size 4096 [672.769][T736] The buggy address is located 520 bytes inside of [672.769][T736] freed 4096-byte region [ffff888022ec0000, ffff888022ec1000) [672.760][T736] [672.764][T736] The buggy address belongs to the physical page: [672.761][T736] page:ffffea00008bb000 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x22ec0 [672.766][T736] head:ffffea00008bb000 order:3 entire_mapcount:0 nr_pages_mapped:0 pincount:0 [672.779][T736] flags: 0xfff00000010200(slab|head|node=0|zone=1|lastcpupid=0x7ff) [672.770][T736] raw: 00fff00000010200 ffff888012842140 ffffea000054ba00 dead000000000002 [672.770][T736] raw: 0000000000000000 0000000000040004 00000001ffffffff 0000000000000000 [672.771][T736] page dumped because: kasan: bad access detected [672.778][T736] page_owner tracks the page as allocated [672.777][T736] page last allocated via order 3, migratetype Unmovable, gfp_mask 0xd2040(__GFP_IO|__GFP_NOWARN|__GFP_NORETRY|__GFP_COMP|__GFP_NOMEMALLOC), pid 88 [672.779][T736] get_page_from_freelist+0x119c/0x2d50 [672.779][T736] __alloc_pages+0x1cb/0x4a0 [672.776][T736] alloc_pages+0x1aa/0x270 [672.773][T736] allocate_slab+0x260/0x390 [672.771][T736] ___slab_alloc+0xa9a/0x13e0 [672.778][T736] __slab_alloc.constprop.0+0x56/0xb0 [672.771][T736] __kmem_cache_alloc_node+0x136/0x320 [672.789][T736] __kmalloc+0x4e/0x1a0 [672.783][T736] tomoyo_realpath_from_path+0xc3/0x600 [672.781][T736] tomoyo_path_perm+0x22f/0x420 [672.782][T736] tomoyo_path_unlink+0x92/0xd0 [672.780][T736] security_path_unlink+0xdb/0x150 [672.788][T736] do_unlinkat+0x377/0x680 [672.788][T736] __x64_sys_unlink+0xca/0x110 [672.789][T736] do_syscall_64+0x39/0xb0 [672.783][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.784][T736] page last free stack trace: [672.787][T736] free_pcp_prepare+0x4e5/0x920 [672.787][T736] free_unref_page+0x1d/0x4e0 [672.784][T736] __unfreeze_partials+0x17c/0x1a0 [672.797][T736] 
qlist_free_all+0x6a/0x180 [672.796][T736] kasan_quarantine_reduce+0x189/0x1d0 [672.797][T736] __kasan_slab_alloc+0x64/0x90 [672.793][T736] kmem_cache_alloc+0x17c/0x3c0 [672.799][T736] getname_flags.part.0+0x50/0x4e0 [672.799][T736] getname_flags+0x9e/0xe0 [672.792][T736] vfs_fstatat+0x77/0xb0 [672.791][T736] __do_sys_newlstat+0x84/0x100 [672.798][T736] do_syscall_64+0x39/0xb0 [672.796][T736] entry_SYSCALL_64_after_hwframe+0x63/0xcd [672.790][T736] [672.791][T736] Memory state around the buggy address: [672.799][T736] ffff888022ec0100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.805][T736] ffff888022ec0180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.802][T736] >ffff888022ec0200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.809][T736] ^ [672.809][T736] ffff888022ec0280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [672.809][T736] ffff888022ec0300: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fix this by having the qgroup assign ioctl take the qgroup ioctl mutex before calling btrfs_run_qgroups(), which is what all qgroup ioctls should call. 
Reported-by: butt3rflyh4ck <butterflyhuangxx(a)gmail.com> Link: https://lore.kernel.org/linux-btrfs/CAFcO6XN3VD8ogmHwqRk4kbiwtpUSNySu2VAxN8… CC: stable(a)vger.kernel.org # 5.10+ Reviewed-by: Qu Wenruo <wqu(a)suse.com> Signed-off-by: Filipe Manana <fdmanana(a)suse.com> Reviewed-by: David Sterba <dsterba(a)suse.com> Signed-off-by: David Sterba <dsterba(a)suse.com> Conflicts: fs/btrfs/qgroup.c Signed-off-by: Long Li <leo.lilong(a)huawei.com> Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com> Reviewed-by: Wang Weiyang <wangweiyang2(a)huawei.com> Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com> --- fs/btrfs/ioctl.c | 2 ++ fs/btrfs/qgroup.c | 11 ++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5ae02bf3652..00424d3f3464 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -5240,7 +5240,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) } /* update qgroup status and info */ + mutex_lock(&fs_info->qgroup_ioctl_lock); err = btrfs_run_qgroups(trans); + mutex_unlock(&fs_info->qgroup_ioctl_lock); if (err < 0) btrfs_handle_fs_error(fs_info, err, "failed to update qgroup status and info"); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 7916f711daf5..8e58c58f73a3 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2196,7 +2196,8 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) } /* - * called from commit_transaction. Writes all changed qgroups to disk. + * Writes all changed qgroups to disk. + * Called by the transaction commit path and the qgroup assign ioctl. 
*/ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { @@ -2204,6 +2205,14 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans) struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; + /* + * In case we are called from the qgroup assign ioctl, assert that we + * are holding the qgroup_ioctl_lock, otherwise we can race with a quota + * disable operation (ioctl) and access a freed quota root. + */ + if (trans->transaction->state != TRANS_STATE_COMMIT_DOING) + lockdep_assert_held(&fs_info->qgroup_ioctl_lock); + if (!quota_root) return ret; -- 2.25.1

1 3

Kernel sig 4.7 14:00-15:30 例会议题情况//答复: openEuler Kernel SIG双周例会
by liaotao (C) 06 Apr '23

06 Apr '23

当前例会议题：议题一：进展update --- 张伽琳 & 郑增凯议题二：openEuler-22.03-LTS-SP2需求评审 ---张伽琳欢迎大家继续申报~ -----原始约会----- 发件人: openEuler conference <public(a)openeuler.org> 发送时间: 2023年4月7日 9:30 收件人: dev@openeuler.org,kernel-discuss@openeuler.org,kernel@openeuler.org 主题: openEuler Kernel SIG双周例会时间: 2023年4月7日星期五 14:00-15:30(UTC+08:00) 北京，重庆，香港特别行政区，乌鲁木齐。地点: 您好！ Kernel SIG 邀请您参加 2023-04-07 14:00 召开的Zoom会议(自动录制) 会议主题：openEuler Kernel SIG双周例会会议内容： 1.进展update 2.议题征集中欢迎大家积极申报议题（新增议题可以直接回复邮件，或录入会议看板）会议链接：https://us06web.zoom.us/j/88353599877?pwd=aFZtMjJwUHl1UmNTZFV4eUJQM2xVdz09 会议纪要：https://etherpad.openeuler.org/p/Kernel-meetings 温馨提醒：建议接入会议后修改参会人的姓名，也可以使用您在gitee.com的ID 更多资讯尽在：https://openeuler.org/zh/ Hello! openEuler Kernel SIG invites you to attend the Zoom conference (auto recording), which will be held at 2023-04-07 14:00. The subject of the conference is openEuler Kernel SIG双周例会. Summary: 1.进展update 2.议题征集中欢迎大家积极申报议题（新增议题可以直接回复邮件，或录入会议看板） You can join the meeting at https://us06web.zoom.us/j/88353599877?pwd=aFZtMjJwUHl1UmNTZFV4eUJQM2xVdz09. Add topics at https://etherpad.openeuler.org/p/Kernel-meetings. Note: You are advised to change the participant name after joining the conference or use your ID at gitee.com. More information: https://openeuler.org/en/

1 0

[PATCH openEuler-1.0-LTS 1/4] ext4: fix race between writepages and remount
by Yongqiang Liu 04 Apr '23

04 Apr '23

From: Baokun Li <libaokun1(a)huawei.com> hulk inclusion category: bugfix bugzilla: 188500, https://gitee.com/openeuler/kernel/issues/I6RJ0V CVE: NA -------------------------------- We got a WARNING in ext4_add_complete_io: ================================================================== WARNING: at fs/ext4/page-io.c:231 ext4_put_io_end_defer+0x182/0x250 CPU: 10 PID: 77 Comm: ksoftirqd/10 Tainted: 6.3.0-rc2 #85 RIP: 0010:ext4_put_io_end_defer+0x182/0x250 [ext4] [...] Call Trace: <TASK> ext4_end_bio+0xa8/0x240 [ext4] bio_endio+0x195/0x310 blk_update_request+0x184/0x770 scsi_end_request+0x2f/0x240 scsi_io_completion+0x75/0x450 scsi_finish_command+0xef/0x160 scsi_complete+0xa3/0x180 blk_complete_reqs+0x60/0x80 blk_done_softirq+0x25/0x40 __do_softirq+0x119/0x4c8 run_ksoftirqd+0x42/0x70 smpboot_thread_fn+0x136/0x3c0 kthread+0x140/0x1a0 ret_from_fork+0x2c/0x50 ================================================================== Above issue may happen as follows: cpu1 cpu2 ----------------------------|---------------------------- mount -o dioread_lock ext4_writepages ext4_do_writepages *if (ext4_should_dioread_nolock(inode))* // rsv_blocks is not assigned here mount -o remount,dioread_nolock ext4_journal_start_with_reserve __ext4_journal_start __ext4_journal_start_sb jbd2__journal_start *if (rsv_blocks)* // h_rsv_handle is not initialized here mpage_map_and_submit_extent mpage_map_one_extent dioread_nolock = ext4_should_dioread_nolock(inode) if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) mpd->io_submit.io_end->handle = handle->h_rsv_handle ext4_set_io_unwritten_flag io_end->flag |= EXT4_IO_END_UNWRITTEN // now io_end->handle is NULL but has EXT4_IO_END_UNWRITTEN flag scsi_finish_command scsi_io_completion scsi_io_completion_action scsi_end_request blk_update_request req_bio_endio bio_endio bio->bi_end_io > ext4_end_bio ext4_put_io_end_defer ext4_add_complete_io // trigger WARN_ON(!io_end->handle && sbi->s_journal); The immediate cause of this problem is that 
ext4_should_dioread_nolock() function returns inconsistent values in the ext4_do_writepages() and mpage_map_one_extent(). There are four conditions in this function that can be changed at mount time to cause this problem. These four conditions can be divided into two categories: (1) journal_data and EXT4_EXTENTS_FL, which can be changed by ioctl (2) DELALLOC and DIOREAD_NOLOCK, which can be changed by remount The two in the first category have been fixed by commit c8585c6fcaf2 ("ext4: fix races between changing inode journal mode and ext4_writepages") and commit cb85f4d23f79 ("ext4: fix race between writepages and enabling EXT4_EXTENTS_FL") respectively. Two cases in the other category have not yet been fixed, and the above issue is caused by this situation. We refer to the fix for the first category, when applying options during remount, we grab s_writepages_rwsem to avoid racing with writepages ops to trigger this problem. Fixes: 6b523df4fb5a ("ext4: use transaction reservation for extent conversion in ext4_end_io") Cc: stable(a)vger.kernel.org Signed-off-by: Baokun Li <libaokun1(a)huawei.com> Reviewed-by: Zhang Yi <yi.zhang(a)huawei.com> Reviewed-by: Yang Erkun <yangerkun(a)huawei.com> Signed-off-by: Yongqiang Liu <liuyongqiang13(a)huawei.com> --- fs/ext4/ext4.h | 3 ++- fs/ext4/super.c | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4c88e75180a2..6df919b154b4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1530,7 +1530,8 @@ struct ext4_sb_info { /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA - * or EXTENTS flag. + * or EXTENTS flag or between writepages ops and changing DIOREAD_NOLOCK + * mount option on remount. 
*/ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8029a6f6471c..df07222f1cc5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5605,10 +5605,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) vfs_flags = SB_LAZYTIME | SB_I_VERSION; sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); + /* + * Changing the DIOREAD_NOLOCK mount option may cause two calls to + * ext4_should_dioread_nolock() to return inconsistent values, + * triggering WARN_ON in ext4_add_complete_io(). we grab here + * s_writepages_rwsem to avoid race between writepages ops and + * remount. + */ + percpu_down_write(&sbi->s_writepages_rwsem); if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; + percpu_up_write(&sbi->s_writepages_rwsem); goto restore_opts; } + percpu_up_write(&sbi->s_writepages_rwsem); if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { @@ -5833,6 +5843,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) return 0; restore_opts: + percpu_down_write(&sbi->s_writepages_rwsem); sb->s_flags = old_sb_flags; sbi->s_mount_opt = old_opts.s_mount_opt; sbi->s_mount_opt2 = old_opts.s_mount_opt2; @@ -5841,6 +5852,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sbi->s_commit_interval = old_opts.s_commit_interval; sbi->s_min_batch_time = old_opts.s_min_batch_time; sbi->s_max_batch_time = old_opts.s_max_batch_time; + percpu_up_write(&sbi->s_writepages_rwsem); + if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks) ext4_release_system_zone(sb); #ifdef CONFIG_QUOTA -- 2.25.1

1 6

[PATCH openEuler-5.10-LTS 01/15] scsi: scsi_dh_alua: fix memleak for 'qdata' in alua_activate()
by Jialin Zhang 04 Apr '23

04 Apr '23

From: Yu Kuai <yukuai3(a)huawei.com> hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6NBAZ CVE: NA -------------------------------- If alua_rtpg_queue() failed from alua_activate(), then 'qdata' is not freed, which will cause following memleak: unreferenced object 0xffff88810b2c6980 (size 32): comm "kworker/u16:2", pid 635322, jiffies 4355801099 (age 1216426.076s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 40 39 24 c1 ff ff ff ff 00 f8 ea 0a 81 88 ff ff @9$............. backtrace: [<0000000098f3a26d>] alua_activate+0xb0/0x320 [<000000003b529641>] scsi_dh_activate+0xb2/0x140 [<000000007b296db3>] activate_path_work+0xc6/0xe0 [dm_multipath] [<000000007adc9ace>] process_one_work+0x3c5/0x730 [<00000000c457a985>] worker_thread+0x93/0x650 [<00000000cb80e628>] kthread+0x1ba/0x210 [<00000000a1e61077>] ret_from_fork+0x22/0x30 Fix the problem by freeing 'qdata' in error path. Fixes: 625fe857e4fa ("scsi: scsi_dh_alua: Check scsi_device_get() return value") Signed-off-by: Yu Kuai <yukuai3(a)huawei.com> Reviewed-by: Hou Tao <houtao1(a)huawei.com> Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com> --- drivers/scsi/device_handler/scsi_dh_alua.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index fe8a5e5c0df8..bf0b3178f84d 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -1036,10 +1036,12 @@ static int alua_activate(struct scsi_device *sdev, rcu_read_unlock(); mutex_unlock(&h->init_mutex); - if (alua_rtpg_queue(pg, sdev, qdata, true)) + if (alua_rtpg_queue(pg, sdev, qdata, true)) { fn = NULL; - else + } else { + kfree(qdata); err = SCSI_DH_DEV_OFFLINED; + } kref_put(&pg->kref, release_port_group); out: if (fn) -- 2.25.1

1 14

[PATCH openEuler-5.10-LTS-SP1 01/16] scsi: scsi_dh_alua: fix memleak for 'qdata' in alua_activate()
by Jialin Zhang 04 Apr '23

04 Apr '23

From: Yu Kuai <yukuai3(a)huawei.com> hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I6NBAZ CVE: NA -------------------------------- If alua_rtpg_queue() failed from alua_activate(), then 'qdata' is not freed, which will cause following memleak: unreferenced object 0xffff88810b2c6980 (size 32): comm "kworker/u16:2", pid 635322, jiffies 4355801099 (age 1216426.076s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 40 39 24 c1 ff ff ff ff 00 f8 ea 0a 81 88 ff ff @9$............. backtrace: [<0000000098f3a26d>] alua_activate+0xb0/0x320 [<000000003b529641>] scsi_dh_activate+0xb2/0x140 [<000000007b296db3>] activate_path_work+0xc6/0xe0 [dm_multipath] [<000000007adc9ace>] process_one_work+0x3c5/0x730 [<00000000c457a985>] worker_thread+0x93/0x650 [<00000000cb80e628>] kthread+0x1ba/0x210 [<00000000a1e61077>] ret_from_fork+0x22/0x30 Fix the problem by freeing 'qdata' in error path. Fixes: 625fe857e4fa ("scsi: scsi_dh_alua: Check scsi_device_get() return value") Signed-off-by: Yu Kuai <yukuai3(a)huawei.com> Reviewed-by: Hou Tao <houtao1(a)huawei.com> Signed-off-by: Jialin Zhang <zhangjialin11(a)huawei.com> --- drivers/scsi/device_handler/scsi_dh_alua.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c b/drivers/scsi/device_handler/scsi_dh_alua.c index fe8a5e5c0df8..bf0b3178f84d 100644 --- a/drivers/scsi/device_handler/scsi_dh_alua.c +++ b/drivers/scsi/device_handler/scsi_dh_alua.c @@ -1036,10 +1036,12 @@ static int alua_activate(struct scsi_device *sdev, rcu_read_unlock(); mutex_unlock(&h->init_mutex); - if (alua_rtpg_queue(pg, sdev, qdata, true)) + if (alua_rtpg_queue(pg, sdev, qdata, true)) { fn = NULL; - else + } else { + kfree(qdata); err = SCSI_DH_DEV_OFFLINED; + } kref_put(&pg->kref, release_port_group); out: if (fn) -- 2.25.1

1 15