From: Eric Dumazet edumazet@google.com
mainline inclusion
from mainline-v5.16-rc7
commit 8f905c0e7354ef261360fb7535ea079b1082c105
category: bugfix
bugzilla: 186714 https://gitee.com/src-openeuler/kernel/issues/I57QUK
--------------------------------
syzbot reported various issues around early demux, one being included in this changelog [1]
sk->sk_rx_dst is using RCU protection without clearly documenting it.
And the following sequences in tcp_v4_do_rcv()/tcp_v6_do_rcv() do not follow standard RCU rules:
[a] dst_release(dst);
[b] sk->sk_rx_dst = NULL;
They look wrong because a delete operation on an RCU-protected pointer is supposed to clear the pointer before the call_rcu()/synchronize_rcu() that guards the actual memory freeing.
In some cases indeed, dst could be freed before [b] is done.
We could cheat by clearing sk_rx_dst before calling dst_release(), but this seems the right time to stick to standard RCU annotations and debugging facilities.
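For reference, the standard pattern the patch switches to can be sketched as below. This is a minimal sketch, not the exact tcp_v4_do_rcv()/tcp_v6_do_rcv() code; sk_drop_rx_dst() is a made-up helper name. The point is that [b] must be published before [a], so lockless readers under rcu_read_lock() can never pick up a dst whose grace period has already started.

#include <net/sock.h>
#include <net/dst.h>

/* Minimal sketch of the clear-then-release pattern (made-up helper name). */
static void sk_drop_rx_dst(struct sock *sk)
{
	struct dst_entry *dst;

	dst = rcu_dereference_protected(sk->sk_rx_dst,
					lockdep_sock_is_held(sk));
	if (dst) {
		RCU_INIT_POINTER(sk->sk_rx_dst, NULL);	/* [b] comes first */
		dst_release(dst);			/* [a] comes second */
	}
}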
[1] BUG: KASAN: use-after-free in dst_check include/net/dst.h:470 [inline] BUG: KASAN: use-after-free in tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792 Read of size 2 at addr ffff88807f1cb73a by task syz-executor.5/9204
CPU: 0 PID: 9204 Comm: syz-executor.5 Not tainted 5.16.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: <TASK> __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 print_address_description.constprop.0.cold+0x8d/0x320 mm/kasan/report.c:247 __kasan_report mm/kasan/report.c:433 [inline] kasan_report.cold+0x83/0xdf mm/kasan/report.c:450 dst_check include/net/dst.h:470 [inline] tcp_v4_early_demux+0x95b/0x960 net/ipv4/tcp_ipv4.c:1792 ip_rcv_finish_core.constprop.0+0x15de/0x1e80 net/ipv4/ip_input.c:340 ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583 ip_sublist_rcv net/ipv4/ip_input.c:609 [inline] ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5508 [inline] __netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556 __netif_receive_skb_list net/core/dev.c:5608 [inline] netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699 gro_normal_list net/core/dev.c:5853 [inline] gro_normal_list net/core/dev.c:5849 [inline] napi_complete_done+0x1f1/0x880 net/core/dev.c:6590 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557 __napi_poll+0xaf/0x440 net/core/dev.c:7023 napi_poll net/core/dev.c:7090 [inline] net_rx_action+0x801/0xb40 net/core/dev.c:7177 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x123/0x180 kernel/softirq.c:637 irq_exit_rcu+0x5/0x20 kernel/softirq.c:649 common_interrupt+0x52/0xc0 arch/x86/kernel/irq.c:240 asm_common_interrupt+0x1e/0x40 arch/x86/include/asm/idtentry.h:629 RIP: 0033:0x7f5e972bfd57 Code: 39 d1 73 14 0f 1f 80 00 00 00 00 48 8b 50 f8 48 83 e8 08 48 39 ca 77 f3 48 39 c3 73 3e 48 89 13 48 8b 50 f8 48 89 38 49 8b 0e <48> 8b 3e 48 83 c3 08 48 83 c6 08 eb bc 48 39 d1 72 9e 48 39 d0 73 RSP: 002b:00007fff8a413210 EFLAGS: 00000283 RAX: 00007f5e97108990 RBX: 00007f5e97108338 RCX: ffffffff81d3aa45 RDX: ffffffff81d3aa45 RSI: 00007f5e97108340 RDI: ffffffff81d3aa45 RBP: 00007f5e97107eb8 R08: 00007f5e97108d88 R09: 0000000093c2e8d9 R10: 0000000000000000 R11: 0000000000000000 R12: 00007f5e97107eb0 R13: 00007f5e97108338 R14: 00007f5e97107ea8 R15: 0000000000000019 </TASK>
Allocated by task 13: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track mm/kasan/common.c:46 [inline] set_alloc_info mm/kasan/common.c:434 [inline] __kasan_slab_alloc+0x90/0xc0 mm/kasan/common.c:467 kasan_slab_alloc include/linux/kasan.h:259 [inline] slab_post_alloc_hook mm/slab.h:519 [inline] slab_alloc_node mm/slub.c:3234 [inline] slab_alloc mm/slub.c:3242 [inline] kmem_cache_alloc+0x202/0x3a0 mm/slub.c:3247 dst_alloc+0x146/0x1f0 net/core/dst.c:92 rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613 ip_route_input_slow+0x1817/0x3a20 net/ipv4/route.c:2340 ip_route_input_rcu net/ipv4/route.c:2470 [inline] ip_route_input_noref+0x116/0x2a0 net/ipv4/route.c:2415 ip_rcv_finish_core.constprop.0+0x288/0x1e80 net/ipv4/ip_input.c:354 ip_list_rcv_finish.constprop.0+0x1b2/0x6e0 net/ipv4/ip_input.c:583 ip_sublist_rcv net/ipv4/ip_input.c:609 [inline] ip_list_rcv+0x34e/0x490 net/ipv4/ip_input.c:644 __netif_receive_skb_list_ptype net/core/dev.c:5508 [inline] __netif_receive_skb_list_core+0x549/0x8e0 net/core/dev.c:5556 __netif_receive_skb_list net/core/dev.c:5608 [inline] netif_receive_skb_list_internal+0x75e/0xd80 net/core/dev.c:5699 gro_normal_list net/core/dev.c:5853 [inline] gro_normal_list net/core/dev.c:5849 [inline] napi_complete_done+0x1f1/0x880 net/core/dev.c:6590 virtqueue_napi_complete drivers/net/virtio_net.c:339 [inline] virtnet_poll+0xca2/0x11b0 drivers/net/virtio_net.c:1557 __napi_poll+0xaf/0x440 net/core/dev.c:7023 napi_poll net/core/dev.c:7090 [inline] net_rx_action+0x801/0xb40 net/core/dev.c:7177 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Freed by task 13: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 kasan_set_track+0x21/0x30 mm/kasan/common.c:46 kasan_set_free_info+0x20/0x30 mm/kasan/generic.c:370 ____kasan_slab_free mm/kasan/common.c:366 [inline] ____kasan_slab_free mm/kasan/common.c:328 [inline] __kasan_slab_free+0xff/0x130 mm/kasan/common.c:374 kasan_slab_free include/linux/kasan.h:235 [inline] slab_free_hook mm/slub.c:1723 [inline] slab_free_freelist_hook+0x8b/0x1c0 mm/slub.c:1749 slab_free mm/slub.c:3513 [inline] kmem_cache_free+0xbd/0x5d0 mm/slub.c:3530 dst_destroy+0x2d6/0x3f0 net/core/dst.c:127 rcu_do_batch kernel/rcu/tree.c:2506 [inline] rcu_core+0x7ab/0x1470 kernel/rcu/tree.c:2741 __do_softirq+0x29b/0x9c2 kernel/softirq.c:558
Last potentially related work creation: kasan_save_stack+0x1e/0x50 mm/kasan/common.c:38 __kasan_record_aux_stack+0xf5/0x120 mm/kasan/generic.c:348 __call_rcu kernel/rcu/tree.c:2985 [inline] call_rcu+0xb1/0x740 kernel/rcu/tree.c:3065 dst_release net/core/dst.c:177 [inline] dst_release+0x79/0xe0 net/core/dst.c:167 tcp_v4_do_rcv+0x612/0x8d0 net/ipv4/tcp_ipv4.c:1712 sk_backlog_rcv include/net/sock.h:1030 [inline] __release_sock+0x134/0x3b0 net/core/sock.c:2768 release_sock+0x54/0x1b0 net/core/sock.c:3300 tcp_sendmsg+0x36/0x40 net/ipv4/tcp.c:1441 inet_sendmsg+0x99/0xe0 net/ipv4/af_inet.c:819 sock_sendmsg_nosec net/socket.c:704 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:724 sock_write_iter+0x289/0x3c0 net/socket.c:1057 call_write_iter include/linux/fs.h:2162 [inline] new_sync_write+0x429/0x660 fs/read_write.c:503 vfs_write+0x7cd/0xae0 fs/read_write.c:590 ksys_write+0x1ee/0x250 fs/read_write.c:643 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae
The buggy address belongs to the object at ffff88807f1cb700 which belongs to the cache ip_dst_cache of size 176 The buggy address is located 58 bytes inside of 176-byte region [ffff88807f1cb700, ffff88807f1cb7b0) The buggy address belongs to the page: page:ffffea0001fc72c0 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7f1cb flags: 0xfff00000000200(slab|node=0|zone=1|lastcpupid=0x7ff) raw: 00fff00000000200 dead000000000100 dead000000000122 ffff8881413bb780 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected page_owner tracks the page as allocated page last allocated via order 0, migratetype Unmovable, gfp_mask 0x112a20(GFP_ATOMIC|__GFP_NOWARN|__GFP_NORETRY|__GFP_HARDWALL), pid 5, ts 108466983062, free_ts 108048976062 prep_new_page mm/page_alloc.c:2418 [inline] get_page_from_freelist+0xa72/0x2f50 mm/page_alloc.c:4149 __alloc_pages+0x1b2/0x500 mm/page_alloc.c:5369 alloc_pages+0x1a7/0x300 mm/mempolicy.c:2191 alloc_slab_page mm/slub.c:1793 [inline] allocate_slab mm/slub.c:1930 [inline] new_slab+0x32d/0x4a0 mm/slub.c:1993 ___slab_alloc+0x918/0xfe0 mm/slub.c:3022 __slab_alloc.constprop.0+0x4d/0xa0 mm/slub.c:3109 slab_alloc_node mm/slub.c:3200 [inline] slab_alloc mm/slub.c:3242 [inline] kmem_cache_alloc+0x35c/0x3a0 mm/slub.c:3247 dst_alloc+0x146/0x1f0 net/core/dst.c:92 rt_dst_alloc+0x73/0x430 net/ipv4/route.c:1613 __mkroute_output net/ipv4/route.c:2564 [inline] ip_route_output_key_hash_rcu+0x921/0x2d00 net/ipv4/route.c:2791 ip_route_output_key_hash+0x18b/0x300 net/ipv4/route.c:2619 __ip_route_output_key include/net/route.h:126 [inline] ip_route_output_flow+0x23/0x150 net/ipv4/route.c:2850 ip_route_output_key include/net/route.h:142 [inline] geneve_get_v4_rt+0x3a6/0x830 drivers/net/geneve.c:809 geneve_xmit_skb drivers/net/geneve.c:899 [inline] geneve_xmit+0xc4a/0x3540 drivers/net/geneve.c:1082 __netdev_start_xmit include/linux/netdevice.h:4994 [inline] netdev_start_xmit include/linux/netdevice.h:5008 [inline] xmit_one net/core/dev.c:3590 [inline] dev_hard_start_xmit+0x1eb/0x920 net/core/dev.c:3606 __dev_queue_xmit+0x299a/0x3650 net/core/dev.c:4229 page last free stack trace: reset_page_owner include/linux/page_owner.h:24 [inline] free_pages_prepare mm/page_alloc.c:1338 [inline] free_pcp_prepare+0x374/0x870 mm/page_alloc.c:1389 free_unref_page_prepare mm/page_alloc.c:3309 [inline] free_unref_page+0x19/0x690 mm/page_alloc.c:3388 qlink_free mm/kasan/quarantine.c:146 [inline] qlist_free_all+0x5a/0xc0 mm/kasan/quarantine.c:165 kasan_quarantine_reduce+0x180/0x200 mm/kasan/quarantine.c:272 __kasan_slab_alloc+0xa2/0xc0 mm/kasan/common.c:444 kasan_slab_alloc include/linux/kasan.h:259 [inline] slab_post_alloc_hook mm/slab.h:519 [inline] slab_alloc_node mm/slub.c:3234 [inline] kmem_cache_alloc_node+0x255/0x3f0 mm/slub.c:3270 __alloc_skb+0x215/0x340 net/core/skbuff.c:414 alloc_skb include/linux/skbuff.h:1126 [inline] alloc_skb_with_frags+0x93/0x620 net/core/skbuff.c:6078 sock_alloc_send_pskb+0x783/0x910 net/core/sock.c:2575 mld_newpack+0x1df/0x770 net/ipv6/mcast.c:1754 add_grhead+0x265/0x330 net/ipv6/mcast.c:1857 add_grec+0x1053/0x14e0 net/ipv6/mcast.c:1995 mld_send_initial_cr.part.0+0xf6/0x230 net/ipv6/mcast.c:2242 mld_send_initial_cr net/ipv6/mcast.c:1232 [inline] mld_dad_work+0x1d3/0x690 net/ipv6/mcast.c:2268 process_one_work+0x9b2/0x1690 kernel/workqueue.c:2298 worker_thread+0x658/0x11f0 kernel/workqueue.c:2445
Memory state around the buggy address: ffff88807f1cb600: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88807f1cb680: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
ffff88807f1cb700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff88807f1cb780: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc ffff88807f1cb800: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
Fixes: 41063e9dd119 ("ipv4: Early TCP socket demux.") Signed-off-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20211220143330.680945-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Conflict: include/net/sock.h net/ipv4/tcp_ipv4.c net/ipv6/tcp_ipv6.c net/ipv6/udp.c Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/sock.h | 2 +- net/ipv4/af_inet.c | 2 +- net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_ipv4.c | 11 +++++++---- net/ipv4/udp.c | 6 +++--- net/ipv6/tcp_ipv6.c | 11 +++++++---- net/ipv6/udp.c | 4 ++-- 8 files changed, 23 insertions(+), 18 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h index c958be11d172..bd34faf53b1f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -428,7 +428,7 @@ struct sock { #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif - struct dst_entry *sk_rx_dst; + struct dst_entry __rcu *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; atomic_t sk_omem_alloc; int sk_sndbuf; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8bf43ca7139a..911ad595dbb9 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -158,7 +158,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); - dst_release(sk->sk_rx_dst); + dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); sk_refcnt_debug_dec(sk); } EXPORT_SYMBOL(inet_sock_destruct); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index fcd792816756..c19428e97876 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2814,8 +2814,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - dst_release(sk->sk_rx_dst); - sk->sk_rx_dst = NULL; + dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 11f3f8e23a8a..6f4ac0f10f57 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5744,7 +5744,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) trace_tcp_probe(sk, skb);
tcp_mstamp_refresh(tp); - if (unlikely(!sk->sk_rx_dst)) + if (unlikely(!rcu_access_pointer(sk->sk_rx_dst))) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* * Header prediction. diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index ebfeeeadd47c..078f3a5d65b3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1670,15 +1670,18 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) struct sock *rsk;
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || !dst->ops->check(dst, 0)) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } } tcp_rcv_established(sk, skb); @@ -1753,7 +1756,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst) dst = dst_check(dst, 0); @@ -2160,7 +2163,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb);
if (dst && dst_hold_safe(dst)) { - sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 8cca8bda0612..121c08d4255c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2186,7 +2186,7 @@ bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) struct dst_entry *old;
if (dst_hold_safe(dst)) { - old = xchg(&sk->sk_rx_dst, dst); + old = xchg((__force struct dst_entry **)&sk->sk_rx_dst, dst); dst_release(old); return old != dst; } @@ -2376,7 +2376,7 @@ int __udp4_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret;
- if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp_sk_rx_dst_set(sk, dst);
ret = udp_unicast_rcv_skb(sk, skb, uh); @@ -2535,7 +2535,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst);
if (dst) dst = dst_check(dst, 0); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index df33145b876c..b87b04526e65 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -107,7 +107,7 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst;
- sk->sk_rx_dst = dst; + rcu_assign_pointer(sk->sk_rx_dst, dst); inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; tcp_inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt); } @@ -1482,15 +1482,18 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ - struct dst_entry *dst = sk->sk_rx_dst; + struct dst_entry *dst; + + dst = rcu_dereference_protected(sk->sk_rx_dst, + lockdep_sock_is_held(sk));
sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (inet_sk(sk)->rx_dst_ifindex != skb->skb_iif || dst->ops->check(dst, np->rx_dst_cookie) == NULL) { + RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); - sk->sk_rx_dst = NULL; } }
@@ -1842,7 +1845,7 @@ INDIRECT_CALLABLE_SCOPE void tcp_v6_early_demux(struct sk_buff *skb) skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { - struct dst_entry *dst = READ_ONCE(sk->sk_rx_dst); + struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
if (dst) dst = dst_check(dst, tcp_inet6_sk(sk)->rx_dst_cookie); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 503706397839..7022dcbb5a11 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -940,7 +940,7 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, struct dst_entry *dst = skb_dst(skb); int ret;
- if (unlikely(sk->sk_rx_dst != dst)) + if (unlikely(rcu_dereference(sk->sk_rx_dst) != dst)) udp6_sk_rx_dst_set(sk, dst);
if (!uh->check && !udp_sk(sk)->no_check6_rx) { @@ -1054,7 +1054,7 @@ INDIRECT_CALLABLE_SCOPE void udp_v6_early_demux(struct sk_buff *skb)
skb->sk = sk; skb->destructor = sock_efree; - dst = READ_ONCE(sk->sk_rx_dst); + dst = rcu_dereference(sk->sk_rx_dst);
if (dst) dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie);
From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I58CRT
CVE: NA
--------------------------------
In the __device_attach() function, the lock-holding logic is as follows:

...
__device_attach
  device_lock(dev)    // take lock dev
  async_schedule_dev(__device_attach_async_helper, dev);    // func
    async_schedule_node
      async_schedule_node_domain(func)
        entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
        /* On allocation failure or when the work limit is hit, func is
         * executed synchronously, but __device_attach_async_helper takes
         * lock dev as well, which leads to an A-A deadlock. */
        if (!entry || atomic_read(&entry_count) > MAX_WORK)
          func;
        else
          queue_work_node(node, system_unbound_wq, &entry->work)
  device_unlock(dev)
As shown above, when asynchronous probing is allowed but the async work cannot be queued because of an out-of-memory condition or the work limit, the helper is executed synchronously instead. This leads to an A-A deadlock, because __device_attach_async_helper() also takes the dev lock.
To fix the deadlock, move async_schedule_dev() outside of device_lock(). As we can see in async_schedule_node_domain(), the work is queued on system_unbound_wq, which accepts concurrent operations, so this does not change the code logic and no longer deadlocks.
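The shape of the fix can be sketched as below. This is an illustrative sketch, not the exact driver-core code; attach_example() and example_async_helper() are made-up names. The decision to probe asynchronously is recorded under the lock, but the helper is only scheduled after the lock has been dropped, so a synchronous fallback inside async_schedule_dev() can no longer re-acquire the same lock.

#include <linux/async.h>
#include <linux/device.h>

static void example_async_helper(void *data, async_cookie_t cookie);

static int attach_example(struct device *dev, bool allow_async)
{
	bool async = false;

	device_lock(dev);
	if (allow_async && !dev->driver) {
		get_device(dev);	/* keep dev alive for the helper */
		async = true;		/* only remember the decision here */
	}
	device_unlock(dev);

	if (async)			/* the lock is already released */
		async_schedule_dev(example_async_helper, dev);
	return 0;
}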
Fixes: 765230b5f084 ("driver-core: add asynchronous probing support for drivers") Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/dd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 64ff137408b8..3e62b2f24bfb 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -897,6 +897,7 @@ static void __device_attach_async_helper(void *_dev, async_cookie_t cookie) static int __device_attach(struct device *dev, bool allow_async) { int ret = 0; + bool async = false;
device_lock(dev); if (dev->p->dead) { @@ -935,7 +936,7 @@ static int __device_attach(struct device *dev, bool allow_async) */ dev_dbg(dev, "scheduling asynchronous probe\n"); get_device(dev); - async_schedule_dev(__device_attach_async_helper, dev); + async = true; } else { pm_request_idle(dev); } @@ -945,6 +946,8 @@ static int __device_attach(struct device *dev, bool allow_async) } out_unlock: device_unlock(dev); + if (async) + async_schedule_dev(__device_attach_async_helper, dev); return ret; }
From: Zhang Wensheng zhangwensheng5@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I58CRT
CVE: NA
--------------------------------
In the __driver_attach() function, the lock-holding logic is as follows:

...
__driver_attach
  if (driver_allows_async_probing(drv))
    device_lock(dev)    // take lock dev
    async_schedule_dev(__driver_attach_async_helper, dev);    // func
      async_schedule_node
        async_schedule_node_domain(func)
          entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
          /* On allocation failure or when the work limit is hit, func is
           * executed synchronously, but __driver_attach_async_helper takes
           * lock dev as well, which leads to an A-A deadlock. */
          if (!entry || atomic_read(&entry_count) > MAX_WORK)
            func;
          else
            queue_work_node(node, system_unbound_wq, &entry->work)
    device_unlock(dev)
As shown above, when asynchronous probing is allowed but the async work cannot be queued because of an out-of-memory condition or the work limit, the helper is executed synchronously instead. This leads to an A-A deadlock, because __driver_attach_async_helper() also takes the dev lock.
To fix the deadlock, move async_schedule_dev() outside of device_lock(). As we can see in async_schedule_node_domain(), the work is queued on system_unbound_wq, which accepts concurrent operations, so this does not change the code logic and no longer deadlocks.
Fixes: ef0ff68351be ("driver core: Probe devices asynchronously instead of the driver") Signed-off-by: Zhang Wensheng zhangwensheng5@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/base/dd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 3e62b2f24bfb..bde049c998de 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -1062,6 +1062,7 @@ static int __driver_attach(struct device *dev, void *data) { struct device_driver *drv = data; int ret; + bool async = false;
/* * Lock device and try to bind to it. We drop the error @@ -1098,9 +1099,11 @@ static int __driver_attach(struct device *dev, void *data) if (!dev->driver) { get_device(dev); dev->p->async_driver = drv; - async_schedule_dev(__driver_attach_async_helper, dev); + async = true; } device_unlock(dev); + if (async) + async_schedule_dev(__driver_attach_async_helper, dev); return 0; }
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit 5ea00fc60676c0eebfa8560ec461209d638bca9d
bugzilla: 186602, https://gitee.com/src-openeuler/kernel/issues/I5783H
CVE: CVE-2022-1204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit d01ffb9eee4af165d83b08dd73ebdf9fe94a519b upstream.
If we dereference ax25_dev after kfree(ax25_dev) has been called in ax25_dev_device_down(), concurrent use-after-free (UAF) bugs occur. Eight syscall functions suffer from these UAF bugs: ax25_bind(), ax25_release(), ax25_connect(), ax25_ioctl(), ax25_getname(), ax25_sendmsg(), ax25_getsockopt() and ax25_info_show().
One of the concurrency UAF can be shown as below:
            (USE)                       |      (FREE)
                                        | ax25_device_event
                                        |   ax25_dev_device_down
ax25_bind                               |     ...
  ...                                   |     kfree(ax25_dev)
  ax25_fillin_cb()                      |     ...
    ax25_fillin_cb_from_dev()           |
  ...                                   |
The root cause of the UAF bugs is that kfree(ax25_dev) in ax25_dev_device_down() is not protected by any lock. When ax25_dev is released while other code still holds pointers to it, the concurrent UAF bug happens.
This patch introduces a refcount into ax25_dev in order to guarantee that no pointers to it remain when ax25_dev is released.
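The idiom the patch introduces can be sketched as below; this is a generic refcount_t sketch with a made-up example_dev type, not the real ax25_dev code. The creator starts with one reference, every stored pointer takes an extra hold, and the memory is only freed when the last holder calls put.

#include <linux/refcount.h>
#include <linux/slab.h>

struct example_dev {
	refcount_t refcount;
};

static struct example_dev *example_dev_alloc(void)
{
	struct example_dev *edev = kzalloc(sizeof(*edev), GFP_KERNEL);

	if (edev)
		refcount_set(&edev->refcount, 1);	/* creator's reference */
	return edev;
}

static void example_dev_hold(struct example_dev *edev)
{
	refcount_inc(&edev->refcount);		/* one hold per stored pointer */
}

static void example_dev_put(struct example_dev *edev)
{
	/* Freed only after the last pointer holder has dropped its hold. */
	if (refcount_dec_and_test(&edev->refcount))
		kfree(edev);
}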
Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net [OP: backport to 5.10: adjusted context] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
conflict: net/ax25/af_ax25.c
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/ax25.h | 10 ++++++++++ net/ax25/af_ax25.c | 2 ++ net/ax25/ax25_dev.c | 12 ++++++++++-- net/ax25/ax25_route.c | 3 +++ 4 files changed, 25 insertions(+), 2 deletions(-)
diff --git a/include/net/ax25.h b/include/net/ax25.h index 8b7eb46ad72d..d81bfb674906 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -236,6 +236,7 @@ typedef struct ax25_dev { #if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_dama_info dama; #endif + refcount_t refcount; } ax25_dev;
typedef struct ax25_cb { @@ -290,6 +291,15 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } }
+#define ax25_dev_hold(__ax25_dev) \ + refcount_inc(&((__ax25_dev)->refcount)) + +static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +{ + if (refcount_dec_and_test(&ax25_dev->refcount)) { + kfree(ax25_dev); + } +} static inline __be16 ax25_type_trans(struct sk_buff *skb, struct net_device *dev) { skb->dev = dev; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 556bf1a8ea3f..d169db7f7521 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -99,6 +99,7 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; + ax25_dev_put(ax25_dev); release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); @@ -446,6 +447,7 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) }
out_put: + ax25_dev_put(ax25_dev); ax25_cb_put(ax25); return ret;
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 4ac2e0847652..2c845ff1d036 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -37,6 +37,7 @@ ax25_dev *ax25_addr_ax25dev(ax25_address *addr) for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { res = ax25_dev; + ax25_dev_hold(ax25_dev); } spin_unlock_bh(&ax25_dev_lock);
@@ -56,6 +57,7 @@ void ax25_dev_device_up(struct net_device *dev) return; }
+ refcount_set(&ax25_dev->refcount, 1); dev->ax25_ptr = ax25_dev; ax25_dev->dev = dev; dev_hold(dev); @@ -83,6 +85,7 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; + ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock);
ax25_register_dev_sysctl(ax25_dev); @@ -112,20 +115,22 @@ void ax25_dev_device_down(struct net_device *dev)
if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; }
while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; + ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; dev_put(dev); - kfree(ax25_dev); + ax25_dev_put(ax25_dev); return; }
@@ -133,6 +138,7 @@ void ax25_dev_device_down(struct net_device *dev) } spin_unlock_bh(&ax25_dev_lock); dev->ax25_ptr = NULL; + ax25_dev_put(ax25_dev); }
int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) @@ -149,6 +155,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) if (ax25_dev->forward != NULL) return -EINVAL; ax25_dev->forward = fwd_dev->dev; + ax25_dev_put(fwd_dev); break;
case SIOCAX25DELFWD: @@ -161,6 +168,7 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) return -EINVAL; }
+ ax25_dev_put(ax25_dev); return 0; }
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index b40e0bce67ea..ed8cf2983f8a 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -116,6 +116,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; + ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); @@ -172,6 +173,7 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock);
return 0; @@ -214,6 +216,7 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) }
out: + ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); return err; }
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit 5ddae8d064412ed868610127561652e90acabeea
bugzilla: 186602, https://gitee.com/src-openeuler/kernel/issues/I5783H
CVE: CVE-2022-1204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 87563a043cef044fed5db7967a75741cc16ad2b1 upstream.
The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") introduces refcount into ax25_dev, but there are reference leak paths in ax25_ctl_ioctl(), ax25_fwd_ioctl(), ax25_rt_add(), ax25_rt_del() and ax25_rt_opt().
This patch uses ax25_dev_put() and adjusts the position of ax25_addr_ax25dev() to fix reference count leaks of ax25_dev.
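The rule the fix enforces can be sketched as below. This is a hypothetical example, not the ax25 code: lookup_example_dev() and example_dev_put() stand in for ax25_addr_ax25dev() and ax25_dev_put(). A lookup that takes a reference must be paired with a put on every exit path, including the error paths, and argument checks that do not need the device are done before the lookup so early errors never leak a reference.

struct example_dev;					/* refcounted object */
struct example_dev *lookup_example_dev(int port);	/* takes a ref on success */
void example_dev_put(struct example_dev *edev);

static int example_ioctl(int port, int cmd)
{
	struct example_dev *edev;

	if (cmd < 0)			/* checks that need no device come first */
		return -EINVAL;

	edev = lookup_example_dev(port);
	if (!edev)
		return -ENODEV;

	if (cmd == 0) {
		example_dev_put(edev);	/* error paths must also put */
		return -EINVAL;
	}

	/* ... use edev ... */

	example_dev_put(edev);		/* paired with lookup_example_dev() */
	return 0;
}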
Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Signed-off-by: Duoming Zhou duoming@zju.edu.cn Reviewed-by: Dan Carpenter dan.carpenter@oracle.com Link: https://lore.kernel.org/r/20220203150811.42256-1-duoming@zju.edu.cn Signed-off-by: Jakub Kicinski kuba@kernel.org [OP: backport to 5.10: adjust context] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/ax25.h | 8 +++++--- net/ax25/af_ax25.c | 12 ++++++++---- net/ax25/ax25_dev.c | 24 +++++++++++++++++------- net/ax25/ax25_route.c | 16 +++++++++++----- 4 files changed, 41 insertions(+), 19 deletions(-)
diff --git a/include/net/ax25.h b/include/net/ax25.h index d81bfb674906..aadff553e4b7 100644 --- a/include/net/ax25.h +++ b/include/net/ax25.h @@ -291,10 +291,12 @@ static __inline__ void ax25_cb_put(ax25_cb *ax25) } }
-#define ax25_dev_hold(__ax25_dev) \ - refcount_inc(&((__ax25_dev)->refcount)) +static inline void ax25_dev_hold(ax25_dev *ax25_dev) +{ + refcount_inc(&ax25_dev->refcount); +}
-static __inline__ void ax25_dev_put(ax25_dev *ax25_dev) +static inline void ax25_dev_put(ax25_dev *ax25_dev) { if (refcount_dec_and_test(&ax25_dev->refcount)) { kfree(ax25_dev); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index d169db7f7521..c684478aa3a1 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -366,21 +366,25 @@ static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) return -EFAULT;
- if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) - return -ENODEV; - if (ax25_ctl.digi_count > AX25_MAX_DIGIS) return -EINVAL;
if (ax25_ctl.arg > ULONG_MAX / HZ && ax25_ctl.cmd != AX25_KILL) return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr); + if (!ax25_dev) + return -ENODEV; + digi.ndigi = ax25_ctl.digi_count; for (k = 0; k < digi.ndigi; k++) digi.calls[k] = ax25_ctl.digi_addr[k];
- if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev); + if (!ax25) { + ax25_dev_put(ax25_dev); return -ENOTCONN; + }
switch (ax25_ctl.cmd) { case AX25_KILL: diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 2c845ff1d036..d2e0cc67d91a 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -85,8 +85,8 @@ void ax25_dev_device_up(struct net_device *dev) spin_lock_bh(&ax25_dev_lock); ax25_dev->next = ax25_dev_list; ax25_dev_list = ax25_dev; - ax25_dev_hold(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_hold(ax25_dev);
ax25_register_dev_sysctl(ax25_dev); } @@ -115,8 +115,8 @@ void ax25_dev_device_down(struct net_device *dev)
if ((s = ax25_dev_list) == ax25_dev) { ax25_dev_list = s->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -126,8 +126,8 @@ void ax25_dev_device_down(struct net_device *dev) while (s != NULL && s->next != NULL) { if (s->next == ax25_dev) { s->next = ax25_dev->next; - ax25_dev_put(ax25_dev); spin_unlock_bh(&ax25_dev_lock); + ax25_dev_put(ax25_dev); dev->ax25_ptr = NULL; dev_put(dev); ax25_dev_put(ax25_dev); @@ -150,25 +150,35 @@ int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd)
switch (cmd) { case SIOCAX25ADDFWD: - if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + fwd_dev = ax25_addr_ax25dev(&fwd->port_to); + if (!fwd_dev) { + ax25_dev_put(ax25_dev); return -EINVAL; - if (ax25_dev->forward != NULL) + } + if (ax25_dev->forward) { + ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = fwd_dev->dev; ax25_dev_put(fwd_dev); + ax25_dev_put(ax25_dev); break;
case SIOCAX25DELFWD: - if (ax25_dev->forward == NULL) + if (!ax25_dev->forward) { + ax25_dev_put(ax25_dev); return -EINVAL; + } ax25_dev->forward = NULL; + ax25_dev_put(ax25_dev); break;
default: + ax25_dev_put(ax25_dev); return -EINVAL; }
- ax25_dev_put(ax25_dev); return 0; }
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index ed8cf2983f8a..dc2168d2a32a 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -75,11 +75,13 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_dev *ax25_dev; int i;
- if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) - return -EINVAL; if (route->digi_count > AX25_MAX_DIGIS) return -EINVAL;
+ ax25_dev = ax25_addr_ax25dev(&route->port_addr); + if (!ax25_dev) + return -EINVAL; + write_lock_bh(&ax25_route_lock);
ax25_rt = ax25_route_list; @@ -91,6 +93,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -101,6 +104,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) } } write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return 0; } ax25_rt = ax25_rt->next; @@ -108,6 +112,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route)
if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return -ENOMEM; }
@@ -116,11 +121,11 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->dev = ax25_dev->dev; ax25_rt->digipeat = NULL; ax25_rt->ip_mode = ' '; - ax25_dev_put(ax25_dev); if (route->digi_count != 0) { if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { write_unlock_bh(&ax25_route_lock); kfree(ax25_rt); + ax25_dev_put(ax25_dev); return -ENOMEM; } ax25_rt->digipeat->lastrepeat = -1; @@ -133,6 +138,7 @@ static int __must_check ax25_rt_add(struct ax25_routes_struct *route) ax25_rt->next = ax25_route_list; ax25_route_list = ax25_rt; write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev);
return 0; } @@ -173,8 +179,8 @@ static int ax25_rt_del(struct ax25_routes_struct *route) } } } - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev);
return 0; } @@ -216,8 +222,8 @@ static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) }
out: - ax25_dev_put(ax25_dev); write_unlock_bh(&ax25_route_lock); + ax25_dev_put(ax25_dev); return err; }
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit 57cc15f5fd550316e4104eaf84b90fbc640fd7a5
bugzilla: 186602, https://gitee.com/src-openeuler/kernel/issues/I5783H
CVE: CVE-2022-1204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit feef318c855a361a1eccd880f33e88c460eb63b4 upstream.
If we call ax25_bind() before ax25_kill_by_device(), ax25_kill_by_device() will set s->ax25_dev = NULL and call ax25_disconnect() to change the states of the ax25_cb and the sock.
However, if we call ax25_bind() again in the window between ax25_kill_by_device() and ax25_dev_device_down(), the values and states changed by ax25_kill_by_device() will be reassigned.
Finally, ax25_dev_device_down() will deallocate the net_device. If we then dereference the net_device in syscall functions such as ax25_release(), ax25_sendmsg(), ax25_getsockopt(), ax25_getname() and ax25_info_show(), a UAF bug will occur.
One of the possible race conditions is shown below:
            (USE)                       |      (FREE)
ax25_bind()                             |
                                        | ax25_kill_by_device()
ax25_bind()                             |
ax25_connect()                          |   ...
                                        | ax25_dev_device_down()
                                        |   ...
                                        |   dev_put_track(dev, ...) //FREE
ax25_release()                          |   ...
  ax25_send_control()                   |
    alloc_skb()             //USE       |
the corresponding fail log is shown below:
BUG: KASAN: use-after-free in ax25_send_control+0x43/0x210 ... Call Trace: ... ax25_send_control+0x43/0x210 ax25_release+0x2db/0x3b0 __sock_release+0x6d/0x120 sock_close+0xf/0x20 __fput+0x11f/0x420 ... Allocated by task 1283: ... __kasan_kmalloc+0x81/0xa0 alloc_netdev_mqs+0x5a/0x680 mkiss_open+0x6c/0x380 tty_ldisc_open+0x55/0x90 ... Freed by task 1969: ... kfree+0xa3/0x2c0 device_release+0x54/0xe0 kobject_put+0xa5/0x120 tty_ldisc_kill+0x3e/0x80 ...
In order to fix these UAF bugs caused by the rebinding operation, this patch adds dev_hold_track() to ax25_bind() and a corresponding dev_put_track() to ax25_kill_by_device().
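The pairing added by the patch can be sketched as below; example_sock and its bound_dev field are made-up names, and dev_hold()/dev_put() are the plain 5.10 counterparts of dev_hold_track()/dev_put_track(). A bound socket pins the net_device, and the matching put is issued exactly once when the binding is torn down.

#include <linux/netdevice.h>

struct example_sock {
	struct net_device *bound_dev;
};

static void example_bind(struct example_sock *es, struct net_device *dev)
{
	dev_hold(dev);			/* keep the device alive while bound */
	es->bound_dev = dev;
}

static void example_unbind(struct example_sock *es)
{
	if (es->bound_dev) {
		dev_put(es->bound_dev);	/* matching put on teardown */
		es->bound_dev = NULL;
	}
}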
Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net [OP: backport to 5.10: adjust dev_put_track()->dev_put() and dev_hold_track()->dev_hold()] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
conflict: net/ax25/af_ax25.c
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c684478aa3a1..2ec490904d86 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -99,6 +99,7 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; + dev_put(ax25_dev->dev); ax25_dev_put(ax25_dev); release_sock(sk); spin_lock_bh(&ax25_list_lock); @@ -1122,8 +1123,10 @@ static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } }
- if (ax25_dev != NULL) + if (ax25_dev) { ax25_fillin_cb(ax25, ax25_dev); + dev_hold(ax25_dev->dev); + }
done: ax25_cb_add(ax25);
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit b20a5ab0f5fb175750c6bafd4cf12daccf00c738
bugzilla: 186602, https://gitee.com/src-openeuler/kernel/issues/I5783H
CVE: CVE-2022-1204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 9fd75b66b8f68498454d685dc4ba13192ae069b0 upstream.
The previous commit d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") and commit feef318c855a ("ax25: fix UAF bugs of net_device caused by rebinding operation") increase the refcounts of ax25_dev and net_device in ax25_bind() and decrease the matching refcounts in ax25_kill_by_device() in order to prevent UAF bugs, but there are reference count leaks.
The root cause of refcount leaks is shown below:
         (Thread 1)                     |     (Thread 2)
ax25_bind()                             |
  ...                                   |
  ax25_addr_ax25dev()                   |
    ax25_dev_hold()        //(1)        |
  ...                                   |
  dev_hold_track()         //(2)        |
  ...                                   |
                                        | ax25_destroy_socket()
                                        |   ax25_cb_del()
                                        |   ...
                                        |   hlist_del_init()    //(3)
                                        |
         (Thread 3)                     |
ax25_kill_by_device()                   |
  ...                                   |
  ax25_for_each(s, &ax25_list) {        |
    if (s->ax25_dev == ax25_dev) //(4)  |
  ...                                   |
First, ax25_bind() increases the refcount of ax25_dev at position (1) and the refcount of the net_device at position (2). Then, ax25_cb_del(), invoked by ax25_destroy_socket(), deletes the ax25_cb from the hlist at position (3) before ax25_kill_by_device() is called. Finally, the refcount decrements in ax25_kill_by_device() are never executed, because no s->ax25_dev equals ax25_dev at position (4).
This patch adds refcount decrements in ax25_release() and uses lock_sock() for synchronization. If the refcounts are decremented in ax25_release(), the decrements in ax25_kill_by_device() will not be executed, and vice versa.
Fixes: d01ffb9eee4a ("ax25: add refcount in ax25_dev to avoid UAF bugs") Fixes: 87563a043cef ("ax25: fix reference count leaks of ax25_dev") Fixes: feef318c855a ("ax25: fix UAF bugs of net_device caused by rebinding operation") Reported-by: Thomas Osterried thomas@osterried.de Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: David S. Miller davem@davemloft.net [OP: backport to 5.10: adjust dev_put_track()->dev_put()] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
conflict: net/ax25/af_ax25.c
Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 2ec490904d86..b16d466b3983 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -99,8 +99,10 @@ static void ax25_kill_by_device(struct net_device *dev) lock_sock(sk); ax25_disconnect(s, ENETUNREACH); s->ax25_dev = NULL; - dev_put(ax25_dev->dev); - ax25_dev_put(ax25_dev); + if (sk->sk_socket) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + } release_sock(sk); spin_lock_bh(&ax25_list_lock); sock_put(sk); @@ -978,14 +980,20 @@ static int ax25_release(struct socket *sock) { struct sock *sk = sock->sk; ax25_cb *ax25; + ax25_dev *ax25_dev;
if (sk == NULL) return 0;
sock_hold(sk); - sock_orphan(sk); lock_sock(sk); + sock_orphan(sk); ax25 = sk_to_ax25(sk); + ax25_dev = ax25->ax25_dev; + if (ax25_dev) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + }
if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) {
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit a4942c6fea879972a7fee50f7e92e2e10f3fc23e
bugzilla: 186602, https://gitee.com/src-openeuler/kernel/issues/I5783H
CVE: CVE-2022-1204
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 5352a761308397a0e6250fdc629bb3f615b94747 upstream.
There are UAF bugs in ax25_send_control(), when we call ax25_release() to deallocate ax25_dev. The possible race condition is shown below:
        (Thread 1)                      |        (Thread 2)
ax25_dev_device_up()     //(1)          |
                                        | ax25_kill_by_device()
ax25_bind()              //(2)          |
ax25_connect()                          |   ...
  ax25->state = AX25_STATE_1            |   ...
                                        | ax25_dev_device_down() //(3)

        (Thread 3)
ax25_release()                          |
  ax25_dev_put()         //(4) FREE     |
  case AX25_STATE_1:                    |
    ax25_send_control()                 |
      alloc_skb()        //USE          |
The refcount of ax25_dev increases in position (1) and (2), and decreases in position (3) and (4). The ax25_dev will be freed before dereference sites in ax25_send_control().
The following is part of the report:
[ 102.297448] BUG: KASAN: use-after-free in ax25_send_control+0x33/0x210 [ 102.297448] Read of size 8 at addr ffff888009e6e408 by task ax25_close/602 [ 102.297448] Call Trace: [ 102.303751] ax25_send_control+0x33/0x210 [ 102.303751] ax25_release+0x356/0x450 [ 102.305431] __sock_release+0x6d/0x120 [ 102.305431] sock_close+0xf/0x20 [ 102.305431] __fput+0x11f/0x420 [ 102.305431] task_work_run+0x86/0xd0 [ 102.307130] get_signal+0x1075/0x1220 [ 102.308253] arch_do_signal_or_restart+0x1df/0xc00 [ 102.308253] exit_to_user_mode_prepare+0x150/0x1e0 [ 102.308253] syscall_exit_to_user_mode+0x19/0x50 [ 102.308253] do_syscall_64+0x48/0x90 [ 102.308253] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 102.308253] RIP: 0033:0x405ae7
This patch defers the freeing of ax25_dev and the net_device until after all corresponding dereference sites in ax25_release(), to avoid the UAF.
Fixes: 9fd75b66b8f6 ("ax25: Fix refcount leaks caused by ax25_cb_del()") Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: Paolo Abeni pabeni@redhat.com [OP: backport to 5.10: adjust dev_put_track()->dev_put()] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index b16d466b3983..a665454d6770 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -990,10 +990,6 @@ static int ax25_release(struct socket *sock) sock_orphan(sk); ax25 = sk_to_ax25(sk); ax25_dev = ax25->ax25_dev; - if (ax25_dev) { - dev_put(ax25_dev->dev); - ax25_dev_put(ax25_dev); - }
if (sk->sk_type == SOCK_SEQPACKET) { switch (ax25->state) { @@ -1055,6 +1051,10 @@ static int ax25_release(struct socket *sock) sk->sk_state_change(sk); ax25_destroy_socket(ax25); } + if (ax25_dev) { + dev_put(ax25_dev->dev); + ax25_dev_put(ax25_dev); + }
sock->sk = NULL; release_sock(sk);
From: Duoming Zhou duoming@zju.edu.cn
stable inclusion
from stable-v5.10.112
commit 5c62d3bf14100a88d30888b925fcb61a8c11c012
bugzilla: 186594, https://gitee.com/src-openeuler/kernel/issues/I54QIC
CVE: CVE-2022-1205
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 82e31755e55fbcea6a9dfaae5fe4860ade17cbc0 upstream.
There are race conditions that may lead to UAF bugs in ax25_heartbeat_expiry(), ax25_t1timer_expiry(), ax25_t2timer_expiry(), ax25_t3timer_expiry() and ax25_idletimer_expiry(), when we call ax25_release() to deallocate ax25_dev.
One of the UAF bugs caused by ax25_release() is shown below:
        (Thread 1)                      |        (Thread 2)
ax25_dev_device_up()    //(1)           |
  ...                                   | ax25_kill_by_device()
ax25_bind()             //(2)           |
ax25_connect()                          |   ...
  ax25_std_establish_data_link()        |
    ax25_start_t1timer()                | ax25_dev_device_down() //(3)
      mod_timer(&ax25->t1timer,..)      |
                                        | ax25_release()
      (wait a time)                     |   ...
                                        |   ax25_dev_put(ax25_dev) //(4) FREE
ax25_t1timer_expiry()                   |
  ax25->ax25_dev->values[..]  //USE     |   ...
  ...                                   |
We increase the refcount of ax25_dev at positions (1) and (2), and decrease it at positions (3) and (4). The ax25_dev is freed at position (4) and then used in ax25_t1timer_expiry().
The fail log is shown below:
[ 106.116942] BUG: KASAN: use-after-free in ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] Read of size 8 at addr ffff88800bda9028 by task swapper/0/0 [ 106.116942] CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.17.0-06123-g0905eec574 [ 106.116942] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-14 [ 106.116942] Call Trace: ... [ 106.116942] ax25_t1timer_expiry+0x1c/0x60 [ 106.116942] call_timer_fn+0x122/0x3d0 [ 106.116942] __run_timers.part.0+0x3f6/0x520 [ 106.116942] run_timer_softirq+0x4f/0xb0 [ 106.116942] __do_softirq+0x1c2/0x651 ...
This patch adds del_timer_sync() calls in ax25_release(), which ensures that all timers have stopped before we deallocate ax25_dev.
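The ordering the patch establishes can be sketched as below; example_conn, its timers and example_dev_put() are made-up names, not the real ax25_cb fields. All timers whose handlers may dereference the device are stopped with del_timer_sync(), which also waits for a handler that is already running, before the references that can free the device are dropped.

#include <linux/timer.h>

struct example_dev;				/* refcounted, freed on last put */
void example_dev_put(struct example_dev *edev);

struct example_conn {
	struct timer_list t1timer;
	struct timer_list idletimer;
	struct example_dev *edev;
};

static void example_teardown(struct example_conn *ec)
{
	if (ec->edev) {
		del_timer_sync(&ec->t1timer);	/* waits for running handlers */
		del_timer_sync(&ec->idletimer);
		example_dev_put(ec->edev);	/* may free the device now */
		ec->edev = NULL;
	}
}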
Signed-off-by: Duoming Zhou duoming@zju.edu.cn Signed-off-by: Paolo Abeni pabeni@redhat.com [OP: backport to 5.10: adjust context] Signed-off-by: Ovidiu Panait ovidiu.panait@windriver.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ax25/af_ax25.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index a665454d6770..5fff027f25fa 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1052,6 +1052,11 @@ static int ax25_release(struct socket *sock) ax25_destroy_socket(ax25); } if (ax25_dev) { + del_timer_sync(&ax25->timer); + del_timer_sync(&ax25->t1timer); + del_timer_sync(&ax25->t2timer); + del_timer_sync(&ax25->t3timer); + del_timer_sync(&ax25->idletimer); dev_put(ax25_dev->dev); ax25_dev_put(ax25_dev); }
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion
category: bugfix
bugzilla: 186835, https://gitee.com/openeuler/kernel/issues/I59KJ1
CVE: NA
---------------------------
We hit a NULL pointer dereference when resizing a corrupt ext4 image whose resize_inode feature has just been cleared without running the requested e2fsck. It can be reproduced with the following steps. Because the resize_inode feature is cleared, ext4_resize_fs() converts the filesystem to meta_bg mode, but es->s_reserved_gdt_blocks was not cleared along with it, so we may mistakenly call reserve_backup_gdb() and pass an uninitialized resize_inode to it when adding new group descriptors.
mkfs.ext4 /dev/sda 3G
tune2fs -O ^resize_inode /dev/sda   #forget to run requested e2fsck
mount /dev/sda /mnt
resize2fs /dev/sda 8G
======== BUG: kernel NULL pointer dereference, address: 0000000000000028 CPU: 19 PID: 3243 Comm: resize2fs Not tainted 5.18.0-rc7-00001-gfde086c5ebfd #748 ... RIP: 0010:ext4_flex_group_add+0xe08/0x2570 ... Call Trace: <TASK> ext4_resize_fs+0xbec/0x1660 __ext4_ioctl+0x1749/0x24e0 ext4_ioctl+0x12/0x20 __x64_sys_ioctl+0xa6/0x110 do_syscall_64+0x3b/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7f2dd739617b ========
The fix is simple: add a check in ext4_resize_fs() to make sure that es->s_reserved_gdt_blocks is zero when the resize_inode feature is disabled.
Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/resize.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bd0d185654f3..4a0a9fd7ee2a 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -2006,6 +2006,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) ext4_warning(sb, "Error opening resize inode"); return PTR_ERR(resize_inode); } + } else if (es->s_reserved_gdt_blocks) { + ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero"); + return -EFSCORRUPTED; }
if ((!resize_inode && !meta_bg) || n_blocks_count == o_blocks_count) {
From: Peng Liu liupeng256@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I59AN8
CVE: NA
--------------------------------
The user-specified memmap-reserve range may overlap an in-use memory region, and users can hardly avoid this due to KASLR. Thus, such an overlapping memmap-reserve range should be ignored. Furthermore, to be consistent with INITRD, a range that is not inside a memory region will also be ignored.
Fixes: d05cfbd95ab2 ("arm64: Add support for memmap kernel parameters") Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 3 ++- arch/arm64/mm/init.c | 22 ++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a70ca42d3251..5ac05b4ed804 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2838,7 +2838,8 @@ [KNL,ACPI] Mark specific memory as reserved. Region of memory to be reserved is from ss to ss+nn. For ARM64, reserved memory must be in the range of - existed memory. + existed memory and do not overlap in-use memory region, + otherwise request will be ignored. Example: Exclude memory from 0x18690000-0x1869ffff memmap=64K$0x18690000 or diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index b19bdd48cc43..f5bd046f9e19 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -298,7 +298,27 @@ static void __init reserve_memmap_regions(void) for (i = 0; i < mbk_memmap_cnt; i++) { base = mbk_memmap_regions[i].base; size = mbk_memmap_regions[i].size; - memblock_reserve(base, size); + + if (!memblock_is_region_memory(base, size)) { + pr_warn("memmap reserve: 0x%08llx - 0x%08llx is not a memory region - ignore\n", + base, base + size); + continue; + } + + if (memblock_is_region_reserved(base, size)) { + pr_warn("memmap reserve: 0x%08llx - 0x%08llx overlaps in-use memory region - ignore\n", + base, base + size); + continue; + } + + if (memblock_reserve(base, size)) { + pr_warn("memmap reserve: 0x%08llx - 0x%08llx failed\n", + base, base + size); + continue; + } + + pr_info("memmap reserved: 0x%08llx - 0x%08llx (%lld MB)", + base, base + size, size >> 20); memblock_mark_memmap(base, size); } }
From: John Garry john.garry@huawei.com
mainline inclusion
from mainline-v5.14-rc1
commit 56b68085e536eff2676108f2f8356889a7dbbf55
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I597XM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The tag allocation code to alloc the sbitmap pairs is common for regular bitmaps tags and shared sbitmap, so refactor into a common function.
Also remove superfluous "flags" argument from blk_mq_init_shared_sbitmap().
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1620907258-30910-2-git-send-email-john.garry@huawe... Signed-off-by: Jens Axboe axboe@kernel.dk conflict: block/blk-mq.c Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-tag.c | 54 ++++++++++++++++++++++++++++------------------ block/blk-mq-tag.h | 9 +++++--- block/blk-mq.c | 2 +- 3 files changed, 40 insertions(+), 25 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 64c9633d5d5a..58022b8f3e6d 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -538,39 +538,54 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); }
-static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) + if (bt_alloc(bitmap_tags, depth, round_robin, node)) return -ENOMEM; - if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, - round_robin, node)) + if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags;
+ return 0; + +free_bitmap_tags: + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; +} + +static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) +{ + int ret; + + ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, + &tags->__breserved_tags, + tags->nr_tags, tags->nr_reserved_tags, + node, alloc_policy); + if (ret) + return ret; + tags->bitmap_tags = &tags->__bitmap_tags; tags->breserved_tags = &tags->__breserved_tags;
return 0; -free_bitmap_tags: - sbitmap_queue_free(&tags->__bitmap_tags); - return -ENOMEM; }
-int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) { - unsigned int depth = set->queue_depth - set->reserved_tags; int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - int i, node = set->numa_node; + int i, ret;
- if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) - return -ENOMEM; - if (bt_alloc(&set->__breserved_tags, set->reserved_tags, - round_robin, node)) - goto free_bitmap_tags; + ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, + set->queue_depth, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret;
for (i = 0; i < set->nr_hw_queues; i++) { struct blk_mq_tags *tags = set->tags[i]; @@ -580,9 +595,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) }
return 0; -free_bitmap_tags: - sbitmap_queue_free(&set->__bitmap_tags); - return -ENOMEM; }
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 25f30fa99857..baa36e5f495d 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -45,11 +45,14 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); +extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, + unsigned int reserved, + int node, int alloc_policy);
-extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, - unsigned int flags); +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); - extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); diff --git a/block/blk-mq.c b/block/blk-mq.c index cedc355218db..78160d1f677a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3621,7 +3621,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) atomic_set(&set->active_queues_shared_sbitmap, 0); atomic_set(&set->pending_queues_shared_sbitmap, 0);
- if (blk_mq_init_shared_sbitmap(set, set->flags)) { + if (blk_mq_init_shared_sbitmap(set)) { ret = -ENOMEM; goto out_free_mq_rq_maps; }
From: John Garry john.garry@huawei.com
mainline inclusion from mainline-v5.14-rc1 commit d97e594c51660bea510a387731637b894651e4b5 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I597XM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The tags used for an IO scheduler are currently per hctx.
As such, when q->nr_hw_queues grows, so does the request queue total IO scheduler tag depth.
This may cause problems for SCSI MQ HBAs whose total driver depth is fixed.
Ming and Yanhui report higher CPU usage and lower throughput in scenarios where the fixed total driver tag depth is appreciably lower than the total scheduler tag depth: https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@hua...
In that scenario, since the scheduler tag is acquired first, much contention is introduced because a driver tag may not be available after we have got the sched tag.
Improve this scenario by introducing request queue-wide tags for when a tagset-wide sbitmap is used. The static sched requests are still allocated per hctx, as requests are initialised per hctx, as in blk_mq_init_request(..., hctx_idx, ...) -> set->ops->init_request(.., hctx_idx, ...).
For simplicity of resizing the request queue sbitmap when updating the request queue depth, just init at the max possible size, so we don't need to deal with possibly swapping out a new sbitmap for the old one if we need to grow.
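As a rough sketch only (error handling trimmed, mirroring the blk_mq_init_sched_shared_sbitmap() hunk below): the queue-wide bitmaps are allocated once at MAX_SCHED_RQ and then resized down to the current request count.

	/* allocate at the maximum depth up front ... */
	ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
				  &queue->sched_breserved_tags,
				  MAX_SCHED_RQ, set->reserved_tags,
				  set->numa_node,
				  BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
	if (ret)
		return ret;

	/* ... then only expose the currently configured depth */
	sbitmap_queue_resize(&queue->sched_bitmap_tags,
			     queue->nr_requests - set->reserved_tags);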
Signed-off-by: John Garry john.garry@huawei.com Reviewed-by: Ming Lei ming.lei@redhat.com Link: https://lore.kernel.org/r/1620907258-30910-3-git-send-email-john.garry@huawe... Signed-off-by: Jens Axboe axboe@kernel.dk conflict: block/blk-mq-sched.c block/blk-mq-sched.h block/blk-mq-tag.c Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 65 +++++++++++++++++++++++++++++++++++------- block/blk-mq-sched.h | 2 ++ block/blk-mq-tag.c | 11 ++++--- block/blk-mq.c | 13 +++++++-- include/linux/blkdev.h | 4 +++ 5 files changed, 76 insertions(+), 19 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 606bef13f1c2..b2545b98c94a 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -512,18 +512,16 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, unsigned int hctx_idx) { struct blk_mq_tag_set *set = q->tag_set; - /* Clear HCTX_SHARED so tags are init'ed */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, flags); + set->reserved_tags, set->flags); if (!hctx->sched_tags) return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, set->flags); hctx->sched_tags = NULL; }
@@ -537,16 +535,50 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) int i;
queue_for_each_hw_ctx(q, hctx, i) { - /* Clear HCTX_SHARED so tags are freed */ - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, flags); + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); hctx->sched_tags = NULL; } } }
+static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +{ + struct blk_mq_tag_set *set = queue->tag_set; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + struct blk_mq_hw_ctx *hctx; + int ret, i; + + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. + */ + ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, + &queue->sched_breserved_tags, + MAX_SCHED_RQ, set->reserved_tags, + set->numa_node, alloc_policy); + if (ret) + return ret; + + queue_for_each_hw_ctx(queue, hctx, i) { + hctx->sched_tags->bitmap_tags = + &queue->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &queue->sched_breserved_tags; + } + + sbitmap_queue_resize(&queue->sched_bitmap_tags, + queue->nr_requests - set->reserved_tags); + + return 0; +} + +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) +{ + sbitmap_queue_free(&queue->sched_bitmap_tags); + sbitmap_queue_free(&queue->sched_breserved_tags); +} + int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { struct blk_mq_hw_ctx *hctx; @@ -571,12 +603,18 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_sched_alloc_tags(q, hctx, i); if (ret) - goto err; + goto err_free_tags; + } + + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { + ret = blk_mq_init_sched_shared_sbitmap(q); + if (ret) + goto err_free_tags; }
ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_sbitmap;
blk_mq_debugfs_register_sched(q);
@@ -596,7 +634,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
-err: +err_free_sbitmap: + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); +err_free_tags: blk_mq_sched_free_requests(q); blk_mq_sched_tags_teardown(q); q->elevator = NULL; @@ -634,5 +675,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 15f3d611db10..b228ee067491 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -5,6 +5,8 @@ #include "blk-mq.h" #include "blk-mq-tag.h"
+#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) + void blk_mq_sched_assign_ioc(struct request *rq);
void blk_mq_sched_request_inserted(struct request *rq); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 58022b8f3e6d..98e4edd03ad4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -13,6 +13,7 @@ #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h"
#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ) @@ -657,8 +658,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; - /* Only sched tags can grow, so clear HCTX_SHARED flag */ - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; struct blk_mq_tags *new; bool ret;
@@ -669,21 +668,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, flags); + tags->nr_reserved_tags, set->flags); if (!new) return -ENOMEM; ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); if (ret) { - blk_mq_free_rq_map(new, flags); + blk_mq_free_rq_map(new, set->flags); return -ENOMEM; }
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, flags); + blk_mq_free_rq_map(*tagsptr, set->flags); *tagsptr = new; } else { /* diff --git a/block/blk-mq.c b/block/blk-mq.c index 78160d1f677a..1e4448c1e042 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3697,15 +3697,24 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } else { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); + if (blk_mq_is_sbitmap_shared(set->flags)) { + hctx->sched_tags->bitmap_tags = + &q->sched_bitmap_tags; + hctx->sched_tags->breserved_tags = + &q->sched_breserved_tags; + } } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) + sbitmap_queue_resize(&q->sched_bitmap_tags, + nr - set->reserved_tags); + }
blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 23dfe7608e79..dc274953e661 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -27,6 +27,7 @@ #include <linux/scatterlist.h> #include <linux/blkzoned.h> #include <linux/pm.h> +#include <linux/sbitmap.h>
struct module; struct scsi_ioctl_command; @@ -499,6 +500,9 @@ struct request_queue {
atomic_t nr_active_requests_shared_sbitmap;
+ struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; + struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
From: Ming Lei ming.lei@redhat.com
mainline inclusion from mainline-v5.14-rc1 commit f0c1c4d2864ed614f90d2da1bab1a1c42907b940 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I597XM CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The tagset can't be used after blk_cleanup_queue() has returned, because freeing the tagset usually follows blk_cleanup_queue(). Commit d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") added a check on q->tag_set->flags in blk_mq_exit_sched(), which causes a use-after-free.
Fix it by using hctx->flags instead, as sketched below.
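A minimal sketch of the idea (matching the hunk below): cache the flags from a hardware context while walking them, and use that cached value for the final check so q->tag_set is never dereferenced.

	struct blk_mq_hw_ctx *hctx;
	unsigned int i, flags = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		flags = hctx->flags;		/* same flags on every hctx */

	blk_mq_sched_tags_teardown(q);
	if (blk_mq_is_sbitmap_shared(flags))	/* not q->tag_set->flags */
		blk_mq_exit_sched_shared_sbitmap(q);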
Reported-by: syzbot+77ba3d171a25c56756ea@syzkaller.appspotmail.com Fixes: d97e594c5166 ("blk-mq: Use request queue-wide tags for tagset-wide sbitmap") Cc: John Garry john.garry@huawei.com Signed-off-by: Ming Lei ming.lei@redhat.com Tested-by: John Garry john.garry@huawei.com Reviewed-by: John Garry john.garry@huawei.com Link: https://lore.kernel.org/r/20210609063046.122843-1-ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-mq-sched.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index b2545b98c94a..972bd9a3a977 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -663,6 +663,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; unsigned int i; + unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); @@ -670,12 +671,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } + flags = hctx->flags; } blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) + if (blk_mq_is_sbitmap_shared(flags)) blk_mq_exit_sched_shared_sbitmap(q); q->elevator = NULL; }
From: Yufen Yu yuyufen@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I597XM CVE: NA
---------------------------
Signed-off-by: Yufen Yu yuyufen@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/blk-core.c | 10 ++++++---- block/blk-mq-sched.c | 17 ++++++++++------- block/blk-mq.c | 7 ++++--- block/blk-sysfs.c | 2 +- block/blk.h | 13 +++++++++++++ include/linux/blkdev.h | 3 --- 6 files changed, 34 insertions(+), 18 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c index 019d583b355c..9e43501d7759 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -517,13 +517,15 @@ static void blk_timeout_work(struct work_struct *work) struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; + struct request_queue_wrapper *q_wrapper; int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep, + q_wrapper = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); - if (!q) + if (!q_wrapper) return NULL;
+ q = &q_wrapper->q; q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); @@ -594,7 +596,7 @@ struct request_queue *blk_alloc_queue(int node_id) fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_requestq_cachep, q_wrapper); return NULL; } EXPORT_SYMBOL(blk_alloc_queue); @@ -1796,7 +1798,7 @@ int __init blk_dev_init(void) panic("Failed to create kblockd\n");
blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + sizeof(struct request_queue_wrapper), 0, SLAB_PANIC, NULL);
blk_debugfs_root = debugfs_create_dir("block", NULL);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 972bd9a3a977..324b59505894 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -548,13 +548,14 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); struct blk_mq_hw_ctx *hctx; int ret, i; + struct request_queue_wrapper *q_wrapper = queue_to_wrapper(queue);
/* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. */ - ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, - &queue->sched_breserved_tags, + ret = blk_mq_init_bitmaps(&q_wrapper->sched_bitmap_tags, + &q_wrapper->sched_breserved_tags, MAX_SCHED_RQ, set->reserved_tags, set->numa_node, alloc_policy); if (ret) @@ -562,12 +563,12 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
queue_for_each_hw_ctx(queue, hctx, i) { hctx->sched_tags->bitmap_tags = - &queue->sched_bitmap_tags; + &q_wrapper->sched_bitmap_tags; hctx->sched_tags->breserved_tags = - &queue->sched_breserved_tags; + &q_wrapper->sched_breserved_tags; }
- sbitmap_queue_resize(&queue->sched_bitmap_tags, + sbitmap_queue_resize(&q_wrapper->sched_bitmap_tags, queue->nr_requests - set->reserved_tags);
return 0; @@ -575,8 +576,10 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) { - sbitmap_queue_free(&queue->sched_bitmap_tags); - sbitmap_queue_free(&queue->sched_breserved_tags); + struct request_queue_wrapper *q_wrapper = queue_to_wrapper(queue); + + sbitmap_queue_free(&q_wrapper->sched_bitmap_tags); + sbitmap_queue_free(&q_wrapper->sched_breserved_tags); }
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) diff --git a/block/blk-mq.c b/block/blk-mq.c index 1e4448c1e042..c654981a6d50 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3671,6 +3671,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int i, ret; + struct request_queue_wrapper *q_wrapper = queue_to_wrapper(q);
if (!set) return -EINVAL; @@ -3699,9 +3700,9 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) nr, true); if (blk_mq_is_sbitmap_shared(set->flags)) { hctx->sched_tags->bitmap_tags = - &q->sched_bitmap_tags; + &q_wrapper->sched_bitmap_tags; hctx->sched_tags->breserved_tags = - &q->sched_breserved_tags; + &q_wrapper->sched_breserved_tags; } } if (ret) @@ -3712,7 +3713,7 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) if (!ret) { q->nr_requests = nr; if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q->sched_bitmap_tags, + sbitmap_queue_resize(&q_wrapper->sched_bitmap_tags, nr - set->reserved_tags); }
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 66765740902b..751b7b0bec88 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -726,7 +726,7 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_requestq_cachep, queue_to_wrapper(q)); }
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ diff --git a/block/blk.h b/block/blk.h index 3165c16725d5..b8948fda06e1 100644 --- a/block/blk.h +++ b/block/blk.h @@ -28,6 +28,19 @@ struct blk_flush_queue { spinlock_t mq_flush_lock; };
+/* + * The wrapper of request_queue to fix kabi while adding members. + */ +struct request_queue_wrapper { + struct request_queue q; + + struct sbitmap_queue sched_bitmap_tags; + struct sbitmap_queue sched_breserved_tags; +}; + +#define queue_to_wrapper(queue) \ + container_of(queue, struct request_queue_wrapper, q) + extern struct kmem_cache *blk_requestq_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index dc274953e661..6a4b2a01a462 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -500,9 +500,6 @@ struct request_queue {
atomic_t nr_active_requests_shared_sbitmap;
- struct sbitmap_queue sched_bitmap_tags; - struct sbitmap_queue sched_breserved_tags; - struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
From: Peter Zijlstra peterz@infradead.org
stable inclusion from stable-v5.10.118 commit 3ee8e109c3c316073a3e0f83ec0769c7ee8a7375 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I593PQ CVE: CVE-2022-1729
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 3ac6487e584a1eb54071dbe1212e05b884136704 upstream.
Norbert reported that it's possible to race sys_perf_event_open() such that the loser ends up in a different context from the group leader, triggering many WARNs.
The move_group case checks for races against itself, but the !move_group case doesn't, seemingly relying on the previous group_leader->ctx == ctx check. However, that check is racy due to not holding any locks at that time.
Therefore, re-check the result after acquiring locks and bailing if they no longer match.
Additionally, clarify the not_move_group case from the move_group-vs-move_group race.
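The gist of the fix, as a sketch of the !move_group path (matching the hunk below): re-validate the leader's context once ctx->mutex is held and bail out if it no longer matches.

	} else {
		mutex_lock(&ctx->mutex);

		/* the earlier unlocked group_leader->ctx == ctx check was racy */
		if (group_leader && group_leader->ctx != ctx) {
			err = -EINVAL;
			goto err_locked;
		}
	}
not_move_group: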
Fixes: f63a8daa5812 ("perf: Fix event->ctx locking") Reported-by: Norbert Slusarek nslusarek@gmx.net Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Li Huafei lihuafei1@huawei.com Reviewed-by: Yang Jihong yangjihong1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/events/core.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c index a59fe1c5dd9e..9d12872c73f9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11897,6 +11897,9 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different task * or CPU context. If we're moving SW events, we'll fix * this up later, so allow that. + * + * Racy, not holding group_leader->ctx->mutex, see comment with + * perf_event_ctx_lock(). */ if (!move_group && group_leader->ctx != ctx) goto err_context; @@ -11964,6 +11967,7 @@ SYSCALL_DEFINE5(perf_event_open, } else { perf_event_ctx_unlock(group_leader, gctx); move_group = 0; + goto not_move_group; } }
@@ -11980,7 +11984,17 @@ SYSCALL_DEFINE5(perf_event_open, } } else { mutex_lock(&ctx->mutex); + + /* + * Now that we hold ctx->lock, (re)validate group_leader->ctx == ctx, + * see the group_leader && !move_group test earlier. + */ + if (group_leader && group_leader->ctx != ctx) { + err = -EINVAL; + goto err_locked; + } } +not_move_group:
if (ctx->task == TASK_TOMBSTONE) { err = -ESRCH;
From: Chao Liu liuchao173@huawei.com
euler inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I59Q07 CVE: NA
--------------------------------
CONFIG_DMATEST is a simple DMA test client; say N unless you're debugging a DMA device driver. It doesn't need to be built into vmlinux, so change it to m (build it as a module).
Signed-off-by: Chao Liu liuchao173@huawei.com Reviewed-by: Kai Liu kai.liu@suse.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/configs/openeuler_defconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 61c4be815462..f9c94b618ad4 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -6370,7 +6370,7 @@ CONFIG_HSU_DMA=y # DMA Clients # CONFIG_ASYNC_TX_DMA=y -CONFIG_DMATEST=y +CONFIG_DMATEST=m CONFIG_DMA_ENGINE_RAID=y
#
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
--------------------------------
The cgroup used to update PSI info is obtained differently under cgroup v1 and cgroup v2.
task_cgroup() can only be used under cgroup v1, so add a branch to handle both cases, as sketched below.
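A sketch of the resulting branch (mirroring the hunk below): on the default hierarchy (cgroup v2) use the task's dfl_cgrp, otherwise look the task up in the cpuacct hierarchy.

	if (cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) {
		/* cgroup v2: use the task's default-hierarchy cgroup */
		cgroup = task->cgroups->dfl_cgrp;
	} else {
		/* cgroup v1: task_cgroup() is only valid here */
		rcu_read_lock();
		cgroup = task_cgroup(task, cpuacct_cgrp_id);
		rcu_read_unlock();
	}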
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/psi.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 0c9a596692a5..c84c6f7b59fd 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -757,9 +757,13 @@ static struct psi_group *iterate_groups(struct task_struct *task, void **iter) cgroup = task->cgroups->dfl_cgrp; else { #ifdef CONFIG_CGROUP_CPUACCT - rcu_read_lock(); - cgroup = task_cgroup(task, cpuacct_cgrp_id); - rcu_read_unlock(); + if (!cgroup_subsys_on_dfl(cpuacct_cgrp_subsys)) { + rcu_read_lock(); + cgroup = task_cgroup(task, cpuacct_cgrp_id); + rcu_read_unlock(); + } else { + cgroup = task->cgroups->dfl_cgrp; + } #else cgroup = NULL; #endif
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
--------------------------------
Two tracepoints are added so that tools such as eBPF, ftrace, and perf can easily monitor memstall data and perform analysis.
The output of these tracepoints is:
kcompactd0-58 [001] .... 902.709565: psi_memstall_enter: kcompactd
kswapd0-132 [003] .... 902.709569: psi_memstall_leave: balance_pgdat
kcompactd0-58 [001] .... 902.775230: psi_memstall_leave: kcompactd
kswapd0-132 [003] .... 1337.754598: psi_memstall_enter: balance_pgdat
kswapd0-132 [003] .... 1337.756076: psi_memstall_leave: balance_pgdat
kcompactd0-58 [003] .... 1337.756213: psi_memstall_enter: kcompactd
kcompactd0-58 [003] .... 1337.893188: psi_memstall_leave: kcompactd
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/events/sched.h | 27 +++++++++++++++++++++++++++ kernel/sched/psi.c | 6 ++++++ 2 files changed, 33 insertions(+)
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 028f49662ac3..eb5ec1fb66b4 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -705,6 +705,33 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change));
+DECLARE_EVENT_CLASS(psi_memstall_template, + + TP_PROTO(unsigned long function), + + TP_ARGS(function), + + TP_STRUCT__entry( + __field(unsigned long, function) + ), + + TP_fast_assign( + __entry->function = function; + ), + + TP_printk("%ps", (void *)__entry->function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_enter, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + +DEFINE_EVENT(psi_memstall_template, psi_memstall_leave, + TP_PROTO(unsigned long function), + TP_ARGS(function) +); + #endif /* _TRACE_SCHED_H */
/* This part must be outside protection */ diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index c84c6f7b59fd..7a5a898ceb6f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -142,6 +142,8 @@ #include <linux/psi.h> #include "sched.h"
+#include <trace/events/sched.h> + static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled); @@ -904,6 +906,8 @@ void psi_memstall_enter(unsigned long *flags) *flags = current->in_memstall; if (*flags) return; + + trace_psi_memstall_enter(_RET_IP_); /* * in_memstall setting & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we can @@ -933,6 +937,8 @@ void psi_memstall_leave(unsigned long *flags)
if (*flags) return; + + trace_psi_memstall_leave(_RET_IP_); /* * in_memstall clearing & accounting needs to be atomic wrt * changes to the task's scheduling state, otherwise we could
From: Chen Wandun chenwandun@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
--------------------------------
The cgroup structures are all allocated by the core kernel code at run time. They are also accessed only by the cgroup core code, so changes made to the cgroup structure should not affect third-party kernel modules. However, a number of important kernel data structures do contain a pointer to a cgroup structure, so the kABI signature has to be maintained.
Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: zhangjialin 00591957 zhangjialin11@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/cgroup-defs.h | 7 +++++- include/linux/psi_types.h | 44 +++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 55b9a3924cd7..35d8ce603815 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -487,7 +487,12 @@ struct cgroup { /* used to schedule release agent */ struct work_struct release_agent_work;
- /* used to track pressure stalls */ + /* used to track pressure stalls. */ + + /* + * It is accessed only the cgroup core code and so changes made to + * the cgroup structure should not affect third-party kernel modules. + */ struct psi_group psi;
/* used to store eBPF programs */ diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 17d74f62c181..b048bfc6bb21 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -10,6 +10,28 @@ #ifdef CONFIG_PSI
/* Tracked task states */ +#ifdef __GENKSYMS__ +/* + * This definition is used to keep kabi unchanged, + * and **should not changed** + */ +enum psi_task_count { + NR_IOWAIT, + NR_MEMSTALL, + NR_RUNNING, + /* + * This can't have values other than 0 or 1 and could be + * implemented as a bit flag. But for now we still have room + * in the first cacheline of psi_group_cpu, and this way we + * don't have to special case any state tracking for it. + */ + NR_ONCPU, + NR_PSI_TASK_COUNTS = 4, +}; +#else +/* + * All modification to psi_task_count should apply to here. + */ enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, @@ -23,6 +45,7 @@ enum psi_task_count { NR_ONCPU, NR_PSI_TASK_COUNTS = 4, }; +#endif
/* Task state bitmasks */ #define TSK_IOWAIT (1 << NR_IOWAIT) @@ -44,6 +67,25 @@ enum psi_res { * SOME: Stalled tasks & working tasks * FULL: Stalled tasks & no working tasks */ +#ifdef __GENKSYMS__ +/* + * This definition is used to keep kabi unchanged, + * and **should not changed** + */ +enum psi_states { + PSI_IO_SOME, + PSI_IO_FULL, + PSI_MEM_SOME, + PSI_MEM_FULL, + PSI_CPU_SOME, + /* Only per-CPU, to weigh the CPU in the global average: */ + PSI_NONIDLE, + NR_PSI_STATES = 6, +}; +#else +/* + * All modification to psi_states should apply to here. + */ enum psi_states { PSI_IO_SOME, PSI_IO_FULL, @@ -54,6 +96,8 @@ enum psi_states { PSI_NONIDLE, NR_PSI_STATES = 6, }; +#endif +
enum psi_aggregators { PSI_AVGS = 0,
From: Pavel Begunkov asml.silence@gmail.com
mainline inclusion from mainline-v5.12-rc1 commit 0cf41e5e9bafc185490624c3e321c915885a91f3 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Direct IO does not operate on the current working set of pages managed by the kernel, so it should not be accounted as memory stall to PSI infrastructure.
The block layer and iomap direct IO use bio_iov_iter_get_pages() to build bios, and they are its only users, so to avoid PSI tracking for them clear out the BIO_WORKINGSET flag there. Do the same for dio_bio_submit(), because fs/direct-io.c constructs bios by hand, directly calling bio_add_page().
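The caller-side pattern applied in dio_bio_submit() is tiny (a sketch of the hunk below); the bio is built by hand, so the flag is cleared explicitly before submission:

	bio->bi_private = dio;
	/* don't account direct I/O as memory stall */
	bio_clear_flag(bio, BIO_WORKINGSET);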
Reported-by: Christoph Hellwig hch@infradead.org Suggested-by: Christoph Hellwig hch@infradead.org Suggested-by: Johannes Weiner hannes@cmpxchg.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Pavel Begunkov asml.silence@gmail.com Reviewed-by: Ming Lei ming.lei@redhat.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- block/bio.c | 6 ++++++ fs/direct-io.c | 2 ++ 2 files changed, 8 insertions(+)
diff --git a/block/bio.c b/block/bio.c index cd0ca4166164..28191e7035f4 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1111,6 +1111,9 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. + * + * It's intended for direct IO, so doesn't do PSI tracking, the caller is + * responsible for setting BIO_WORKINGSET if necessary. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1135,6 +1138,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (is_bvec) bio_set_flag(bio, BIO_NO_PAGE_REF); + + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET); return bio->bi_vcnt ? 0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); diff --git a/fs/direct-io.c b/fs/direct-io.c index c64d4eb38995..9dafbb07dd6a 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -426,6 +426,8 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) unsigned long flags;
bio->bi_private = dio; + /* don't account direct I/O as memory stall */ + bio_clear_flag(bio, BIO_WORKINGSET);
spin_lock_irqsave(&dio->bio_lock, flags); dio->refcount++;
From: Chengming Zhou zhouchengming@bytedance.com
mainline inclusion from mainline-v5.13-rc1 commit e7fcd762282332f765af2035a9568fb126fa3c01 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The FULL state doesn't exist for the CPU resource at the system level, but it does exist at the cgroup level: it means all non-idle tasks in a cgroup are delayed on the CPU resource, which is being used by others outside of the cgroup or throttled by the cgroup's cpu.max configuration.
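Expressed as the state test (matching the hunk below), CPU FULL at the cgroup level means there are runnable tasks but none of them currently owns the CPU:

	case PSI_CPU_SOME:
		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
	case PSI_CPU_FULL:
		/* runnable tasks exist, but none is on the CPU */
		return tasks[NR_RUNNING] && !tasks[NR_ONCPU];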
Co-developed-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210303034659.91735-2-zhouchengming@bytedance.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/psi_types.h | 3 ++- kernel/sched/psi.c | 14 +++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index b048bfc6bb21..7b0e36fdfb00 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -92,9 +92,10 @@ enum psi_states { PSI_MEM_SOME, PSI_MEM_FULL, PSI_CPU_SOME, + PSI_CPU_FULL, /* Only per-CPU, to weigh the CPU in the global average: */ PSI_NONIDLE, - NR_PSI_STATES = 6, + NR_PSI_STATES = 7, }; #endif
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 7a5a898ceb6f..2cc518b136ad 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,7 +34,10 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * (Naturally, the FULL state doesn't exist for the CPU resource.) + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level, means all non-idle tasks + * in a cgroup are delayed on the CPU resource which used by others outside + * of the cgroup or throttled by the cgroup cpu.max configuration. * * SOME = nr_delayed_tasks != 0 * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 @@ -232,6 +235,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; case PSI_CPU_SOME: return tasks[NR_RUNNING] > tasks[NR_ONCPU]; + case PSI_CPU_FULL: + return tasks[NR_RUNNING] && !tasks[NR_ONCPU]; case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -685,8 +690,11 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, } }
- if (groupc->state_mask & (1 << PSI_CPU_SOME)) + if (groupc->state_mask & (1 << PSI_CPU_SOME)) { groupc->times[PSI_CPU_SOME] += delta; + if (groupc->state_mask & (1 << PSI_CPU_FULL)) + groupc->times[PSI_CPU_FULL] += delta; + }
if (groupc->state_mask & (1 << PSI_NONIDLE)) groupc->times[PSI_NONIDLE] += delta; @@ -1044,7 +1052,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock);
- for (full = 0; full < 2 - (res == PSI_CPU); full++) { + for (full = 0; full < 2; full++) { unsigned long avg[3]; u64 total; int w;
From: Chengming Zhou zhouchengming@bytedance.com
mainline inclusion from mainline-v5.13-rc1 commit 7fae6c8171d20ac55402930ee8ae760cf85dff7b category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Move the reclaim detection from the timer tick to the task state tracking machinery using the recently added ONCPU state. We also check for task psi_flags changes in the psi_task_switch() optimization so that the parents are updated properly.
In terms of performance and cost, this ONCPU task state tracking is not cheaper than the previous timer tick in aggregate. But the code is simpler and shorter this way, so it's a maintainability win. Johannes also did some testing with perf bench; the performance and cost changes should be acceptable for real workloads.
Thanks to Johannes Weiner for pointing out the psi_task_switch() optimization things and the clearer changelog.
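The core of the new detection (see the psi_group_change() hunk below): instead of sampling reclaim from the timer tick, mark the group MEM FULL whenever the task currently on the CPU is itself in a memstall.

	/*
	 * Since we care about lost potential, a memstall is FULL when the
	 * CPU is actively reclaiming and nothing productive could run even
	 * if it were runnable.
	 */
	if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
		state_mask |= (1 << PSI_MEM_FULL);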
Co-developed-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210303034659.91735-3-zhouchengming@bytedance.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/psi.h | 1 - kernel/sched/core.c | 1 - kernel/sched/psi.c | 65 ++++++++++++++++---------------------------- kernel/sched/stats.h | 9 ------ 4 files changed, 24 insertions(+), 52 deletions(-)
diff --git a/include/linux/psi.h b/include/linux/psi.h index 1b9c0e5955bc..d290f0493c33 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -22,7 +22,6 @@ void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep);
-void psi_memstall_tick(struct task_struct *task, int cpu); void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56be8d1c7f69..0fd88dc7660f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4048,7 +4048,6 @@ void scheduler_tick(void) update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); - psi_task_tick(rq);
rq_unlock(rq, &rf);
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 2cc518b136ad..f27518bdc6c9 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -651,8 +651,7 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(&group->poll_wait); }
-static void record_times(struct psi_group_cpu *groupc, int cpu, - bool memstall_tick) +static void record_times(struct psi_group_cpu *groupc, int cpu) { u32 delta; u64 now; @@ -671,23 +670,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu, groupc->times[PSI_MEM_SOME] += delta; if (groupc->state_mask & (1 << PSI_MEM_FULL)) groupc->times[PSI_MEM_FULL] += delta; - else if (memstall_tick) { - u32 sample; - /* - * Since we care about lost potential, a - * memstall is FULL when there are no other - * working tasks, but also when the CPU is - * actively reclaiming and nothing productive - * could run even if it were runnable. - * - * When the timer tick sees a reclaiming CPU, - * regardless of runnable tasks, sample a FULL - * tick (or less if it hasn't been a full tick - * since the last state change). - */ - sample = min(delta, (u32)jiffies_to_nsecs(1)); - groupc->times[PSI_MEM_FULL] += sample; - } }
if (groupc->state_mask & (1 << PSI_CPU_SOME)) { @@ -721,7 +703,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu, false); + record_times(groupc, cpu);
for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -746,6 +728,18 @@ static void psi_group_change(struct psi_group *group, int cpu, if (test_state(groupc->tasks, s)) state_mask |= (1 << s); } + + /* + * Since we care about lost potential, a memstall is FULL + * when there are no other working tasks, but also when + * the CPU is actively reclaiming and nothing productive + * could run even if it were runnable. So when the current + * task in a cgroup is in_memstall, the corresponding groupc + * on that cpu is in PSI_MEM_FULL state. + */ + if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall) + state_mask |= (1 << PSI_MEM_FULL); + groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq); @@ -845,17 +839,21 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, void *iter;
if (next->pid) { + bool identical_state; + psi_flags_change(next, 0, TSK_ONCPU); /* - * When moving state between tasks, the group that - * contains them both does not change: we can stop - * updating the tree once we reach the first common - * ancestor. Iterate @next's ancestors until we - * encounter @prev's state. + * When switching between tasks that have an identical + * runtime state, the cgroup that contains both tasks + * does not change: we can stop updating the tree once + * we reach the first common ancestor. Iterate @next's + * ancestors only until we encounter @prev's ONCPU. */ + identical_state = prev->psi_flags == next->psi_flags; iter = NULL; while ((group = iterate_groups(next, &iter))) { - if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { + if (identical_state && + per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) { common = group; break; } @@ -881,21 +879,6 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } }
-void psi_memstall_tick(struct task_struct *task, int cpu) -{ - struct psi_group *group; - void *iter = NULL; - - while ((group = iterate_groups(task, &iter))) { - struct psi_group_cpu *groupc; - - groupc = per_cpu_ptr(group->pcpu, cpu); - write_seqcount_begin(&groupc->seq); - record_times(groupc, cpu, true); - write_seqcount_end(&groupc->seq); - } -} - /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 06cf8202c178..c676af28347b 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -164,14 +164,6 @@ static inline void psi_sched_switch(struct task_struct *prev, psi_task_switch(prev, next, sleep); }
-static inline void psi_task_tick(struct rq *rq) -{ - if (static_branch_likely(&psi_disabled)) - return; - - if (unlikely(rq->curr->in_memstall)) - psi_memstall_tick(rq->curr, cpu_of(rq)); -} #else /* CONFIG_PSI */ static inline void psi_enqueue(struct task_struct *p, bool wakeup) {} static inline void psi_dequeue(struct task_struct *p, bool sleep) {} @@ -179,7 +171,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} -static inline void psi_task_tick(struct rq *rq) {} #endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.13-rc1 commit fddc8bab531e217806b84906681324377d741c6c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Move the unlikely branches out of line. This eliminates undesirable jumps during wakeup and sleeps for workloads that aren't under any sort of resource pressure.
Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lkml.kernel.org/r/20210303034659.91735-4-zhouchengming@bytedance.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/psi.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index f27518bdc6c9..51566f01cd4a 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -226,17 +226,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state) { switch (state) { case PSI_IO_SOME: - return tasks[NR_IOWAIT]; + return unlikely(tasks[NR_IOWAIT]); case PSI_IO_FULL: - return tasks[NR_IOWAIT] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]); case PSI_MEM_SOME: - return tasks[NR_MEMSTALL]; + return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]; + return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); case PSI_CPU_SOME: - return tasks[NR_RUNNING] > tasks[NR_ONCPU]; + return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: - return tasks[NR_RUNNING] && !tasks[NR_ONCPU]; + return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]); case PSI_NONIDLE: return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] || tasks[NR_RUNNING]; @@ -737,7 +737,7 @@ static void psi_group_change(struct psi_group *group, int cpu, * task in a cgroup is in_memstall, the corresponding groupc * on that cpu is in PSI_MEM_FULL state. */ - if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall) + if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)) state_mask |= (1 << PSI_MEM_FULL);
groupc->state_mask = state_mask;
From: Chengming Zhou zhouchengming@bytedance.com
mainline inclusion from mainline-v5.13-rc1 commit 4117cebf1a9fcbf35b9aabf0e37b6c5eea296798 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Commit 36b238d57172 ("psi: Optimize switching tasks inside shared cgroups") only updates cgroups whose state actually changes during a task switch in the task preempt case, not in the task sleep case.
We actually don't need to clear and set the TSK_ONCPU state for the common cgroups of the next and prev tasks in the sleep case; skipping that saves many psi_group_change() calls, especially when most activity comes from one leaf cgroup.
sleep before:
psi_dequeue()
  while ((group = iterate_groups(prev)))  # all ancestors
    psi_group_change(prev, .clear=TSK_RUNNING|TSK_ONCPU)
psi_task_switch()
  while ((group = iterate_groups(next)))  # all ancestors
    psi_group_change(next, .set=TSK_ONCPU)
sleep after:
psi_dequeue()
  nop
psi_task_switch()
  while ((group = iterate_groups(next)))  # until (prev & next)
    psi_group_change(next, .set=TSK_ONCPU)
  while ((group = iterate_groups(prev)))  # all ancestors
    psi_group_change(prev, .clear=common?TSK_RUNNING:TSK_RUNNING|TSK_ONCPU)
When a voluntary sleep switches to another task, we remove one call of psi_group_change() for every common cgroup ancestor of the two tasks.
Co-developed-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Muchun Song songmuchun@bytedance.com Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210303034659.91735-5-zhouchengming@bytedance.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/psi.c | 35 +++++++++++++++++++++++++---------- kernel/sched/stats.h | 28 ++++++++++++---------------- 2 files changed, 37 insertions(+), 26 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 51566f01cd4a..1fb5cfd6a172 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -862,20 +862,35 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } }
- /* - * If this is a voluntary sleep, dequeue will have taken care - * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We - * only need to deal with it during preemption. - */ - if (sleep) - return; - if (prev->pid) { - psi_flags_change(prev, TSK_ONCPU, 0); + int clear = TSK_ONCPU, set = 0; + + /* + * When we're going to sleep, psi_dequeue() lets us handle + * TSK_RUNNING and TSK_IOWAIT here, where we can combine it + * with TSK_ONCPU and save walking common ancestors twice. + */ + if (sleep) { + clear |= TSK_RUNNING; + if (prev->in_iowait) + set |= TSK_IOWAIT; + } + + psi_flags_change(prev, clear, set);
iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, TSK_ONCPU, 0, true); + psi_group_change(group, cpu, clear, set, true); + + /* + * TSK_ONCPU is handled up to the common ancestor. If we're tasked + * with dequeuing too, finish that for the rest of the hierarchy. + */ + if (sleep) { + clear &= ~TSK_ONCPU; + for (; group; group = iterate_groups(prev, &iter)) + psi_group_change(group, cpu, clear, set, true); + } } }
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c676af28347b..5025ade880ea 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -104,28 +104,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
static inline void psi_dequeue(struct task_struct *p, bool sleep) { - int clear = TSK_RUNNING, set = 0; + int clear = TSK_RUNNING;
if (static_branch_likely(&psi_disabled)) return;
- if (!sleep) { - if (p->in_memstall) - clear |= TSK_MEMSTALL; - } else { - /* - * When a task sleeps, schedule() dequeues it before - * switching to the next one. Merge the clearing of - * TSK_RUNNING and TSK_ONCPU to save an unnecessary - * psi_task_change() call in psi_sched_switch(). - */ - clear |= TSK_ONCPU; + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles + * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU. + * Do nothing here. + */ + if (sleep) + return;
- if (p->in_iowait) - set |= TSK_IOWAIT; - } + if (p->in_memstall) + clear |= TSK_MEMSTALL;
- psi_task_change(p, clear, set); + psi_task_change(p, clear, 0); }
static inline void psi_ttwu_dequeue(struct task_struct *p)
From: Shakeel Butt shakeelb@google.com
mainline inclusion from mainline-v5.13-rc1 commit df77430639c9cf73559bac0f25084518bf9a812d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We noticed that the cost of psi increases with the increase in the levels of the cgroups. Particularly the cost of cpu_clock() sticks out as the kernel calls it multiple times as it traverses up the cgroup tree. This patch reduces the calls to cpu_clock().
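Conceptually, the change hoists the clock read out of the per-ancestor loop and passes the timestamp down (a sketch matching the psi_task_change() hunk below):

	u64 now = cpu_clock(cpu);	/* read the clock once per event */

	while ((group = iterate_groups(task, &iter)))
		psi_group_change(group, cpu, clear, set, now, wake_clock);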
Performed perf bench on Intel Broadwell with 3 levels of cgroup.
Before the patch:
$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.747 [sec]
# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes
Total time: 3.516 [sec]
3.516689 usecs/op 284358 ops/sec
After the patch:
$ perf bench sched all
# Running sched/messaging benchmark...
# 20 sender and receiver processes per group
# 10 groups == 400 processes run
Total time: 0.640 [sec]
# Running sched/pipe benchmark...
# Executed 1000000 pipe operations between two processes
Total time: 3.329 [sec]
3.329820 usecs/op 300316 ops/sec
Signed-off-by: Shakeel Butt shakeelb@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20210321205156.4186483-1-shakeelb@google.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/psi.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1fb5cfd6a172..de46e44f5e01 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -651,12 +651,10 @@ static void poll_timer_fn(struct timer_list *t) wake_up_interruptible(&group->poll_wait); }
-static void record_times(struct psi_group_cpu *groupc, int cpu) +static void record_times(struct psi_group_cpu *groupc, u64 now) { u32 delta; - u64 now;
- now = cpu_clock(cpu); delta = now - groupc->state_start; groupc->state_start = now;
@@ -683,7 +681,7 @@ static void record_times(struct psi_group_cpu *groupc, int cpu) }
static void psi_group_change(struct psi_group *group, int cpu, - unsigned int clear, unsigned int set, + unsigned int clear, unsigned int set, u64 now, bool wake_clock) { struct psi_group_cpu *groupc; @@ -703,7 +701,7 @@ static void psi_group_change(struct psi_group *group, int cpu, */ write_seqcount_begin(&groupc->seq);
- record_times(groupc, cpu); + record_times(groupc, now);
for (t = 0, m = clear; m; m &= ~(1 << t), t++) { if (!(m & (1 << t))) @@ -810,12 +808,14 @@ void psi_task_change(struct task_struct *task, int clear, int set) struct psi_group *group; bool wake_clock = true; void *iter = NULL; + u64 now;
if (!task->pid) return;
psi_flags_change(task, clear, set);
+ now = cpu_clock(cpu); /* * Periodic aggregation shuts off if there is a period of no * task changes, so we wake it back up if necessary. However, @@ -828,7 +828,7 @@ void psi_task_change(struct task_struct *task, int clear, int set) wake_clock = false;
while ((group = iterate_groups(task, &iter))) - psi_group_change(group, cpu, clear, set, wake_clock); + psi_group_change(group, cpu, clear, set, now, wake_clock); }
void psi_task_switch(struct task_struct *prev, struct task_struct *next, @@ -837,6 +837,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, struct psi_group *group, *common = NULL; int cpu = task_cpu(prev); void *iter; + u64 now = cpu_clock(cpu);
if (next->pid) { bool identical_state; @@ -858,7 +859,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, break; }
- psi_group_change(group, cpu, 0, TSK_ONCPU, true); + psi_group_change(group, cpu, 0, TSK_ONCPU, now, true); } }
@@ -880,7 +881,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
iter = NULL; while ((group = iterate_groups(prev, &iter)) && group != common) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true);
/* * TSK_ONCPU is handled up to the common ancestor. If we're tasked @@ -889,7 +890,7 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, if (sleep) { clear &= ~TSK_ONCPU; for (; group; group = iterate_groups(prev, &iter)) - psi_group_change(group, cpu, clear, set, true); + psi_group_change(group, cpu, clear, set, now, true); } } }
From: Johannes Weiner hannes@cmpxchg.org
mainline inclusion from mainline-v5.13-rc1 commit d583d360a620e6229422b3455d0be082b8255f5e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") introduced a race condition that corrupts internal psi state. This manifests as kernel warnings, sometimes followed by bogusly high IO pressure:
psi: task underflow! cpu=1 t=2 tasks=[0 0 0 0] clear=c set=0 (schedule() decreasing RUNNING and ONCPU, both of which are 0)
psi: inconsistent task state! task=2412744:systemd cpu=17 psi_flags=e clear=3 set=0 (cgroup_move_task() clearing MEMSTALL and IOWAIT, but task is MEMSTALL | RUNNING | ONCPU)
What the offending commit does is batch the two psi callbacks in schedule() to reduce the number of cgroup tree updates. When prev is deactivated and removed from the runqueue, nothing is done in psi at first; when the task switch completes, TSK_RUNNING and TSK_IOWAIT are updated along with TSK_ONCPU.
However, the deactivation and the task switch inside schedule() aren't atomic: pick_next_task() may drop the rq lock for load balancing. When this happens, cgroup_move_task() can run after the task has been physically dequeued, but the psi updates are still pending. Since it looks at the task's scheduler state, it doesn't move everything to the new cgroup that the task switch that follows is about to clear from it. cgroup_move_task() will leak the TSK_RUNNING count in the old cgroup, and psi_sched_switch() will underflow it in the new cgroup.
A similar thing can happen for iowait. TSK_IOWAIT is usually set when a p->in_iowait task is dequeued, but again this update is deferred to the switch. cgroup_move_task() can see an unqueued p->in_iowait task and move a non-existent TSK_IOWAIT. This results in the inconsistent task state warning, as well as a counter underflow that will result in permanent IO ghost pressure being reported.
Fix this bug by making cgroup_move_task() use task->psi_flags instead of looking at the potentially mismatching scheduler state.
[ We used the scheduler state historically in order to not rely on task->psi_flags for anything but debugging. But that ship has sailed anyway, and this is simpler and more robust.
We previously already batched TSK_ONCPU clearing with the TSK_RUNNING update inside the deactivation call from schedule(). But that ordering was safe and didn't result in TSK_ONCPU corruption: unlike most places in the scheduler, cgroup_move_task() only checked task_current() and handled TSK_ONCPU if the task was still queued. ]
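The resulting fix is small (see the hunk below): derive the flags to move from the task's own psi bookkeeping rather than from its scheduler state.

	rq = task_rq_lock(task, &rf);

	/* Don't rely on the scheduling state. Use psi_flags instead. */
	task_flags = task->psi_flags;
	if (task_flags)
		psi_task_change(task, task_flags, 0);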
Fixes: 4117cebf1a9f ("psi: Optimize task switch inside shared cgroups") Signed-off-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210503174917.38579-1-hannes@cmpxchg.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/psi.c | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-)
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index de46e44f5e01..9ecb558a1888 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -997,7 +997,7 @@ void psi_cgroup_free(struct cgroup *cgroup) */ void cgroup_move_task(struct task_struct *task, struct css_set *to) { - unsigned int task_flags = 0; + unsigned int task_flags; struct rq_flags rf; struct rq *rq;
@@ -1012,15 +1012,31 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
rq = task_rq_lock(task, &rf);
- if (task_on_rq_queued(task)) { - task_flags = TSK_RUNNING; - if (task_current(rq, task)) - task_flags |= TSK_ONCPU; - } else if (task->in_iowait) - task_flags = TSK_IOWAIT; - - if (task->in_memstall) - task_flags |= TSK_MEMSTALL; + /* + * We may race with schedule() dropping the rq lock between + * deactivating prev and switching to next. Because the psi + * updates from the deactivation are deferred to the switch + * callback to save cgroup tree updates, the task's scheduling + * state here is not coherent with its psi state: + * + * schedule() cgroup_move_task() + * rq_lock() + * deactivate_task() + * p->on_rq = 0 + * psi_dequeue() // defers TSK_RUNNING & TSK_IOWAIT updates + * pick_next_task() + * rq_unlock() + * rq_lock() + * psi_task_change() // old cgroup + * task->cgroups = to + * psi_task_change() // new cgroup + * rq_unlock() + * rq_lock() + * psi_sched_switch() // does deferred updates in new cgroup + * + * Don't rely on the scheduling state. Use psi_flags instead. + */ + task_flags = task->psi_flags;
if (task_flags) psi_task_change(task, task_flags, 0);
From: Brian Chen brianchen118@gmail.com
mainline inclusion from mainline-v5.17-rc1 commit cb0e52b7748737b2cf6481fdd9b920ce7e1ebbdf category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We've noticed cases where tasks in a cgroup are stalled on memory but there is little memory FULL pressure since tasks stay on the runqueue in reclaim.
A simple example involves a single threaded program that keeps leaking and touching large amounts of memory. It runs in a cgroup with swap enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though there is significant CPU pressure and memory SOME, there is barely any memory FULL since the task enters reclaim and stays on the runqueue. However, this memory-bound task is effectively stalled on memory and we expect memory FULL to match memory SOME in this scenario.
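A minimal sketch of such a workload (hypothetical reproducer, not part of the patch; the cgroup with memory.high=10M, swap enabled and cpu.max at 5% is assumed to be set up separately):

#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Keep leaking and touching memory so the task is continuously
 * pushed into reclaim once memory.high is exceeded.
 */
int main(void)
{
        const size_t chunk = 1 << 20;   /* 1 MiB per iteration */

        for (;;) {
                char *p = malloc(chunk);

                if (!p)
                        continue;
                memset(p, 0xaa, chunk); /* touch every page, never freed */
                usleep(1000);
        }
        return 0;
}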
The code is confused about memstall && running, thinking there is a stalled task and a productive task when there's only one task: a reclaimer that's counted as both. To fix this, we redefine the condition for PSI_MEM_FULL to check that all running tasks are in an active memstall instead of checking that there are no running tasks.
case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
This will capture reclaimers. It will also capture tasks that called psi_memstall_enter() and are about to sleep, but this should be negligible noise.
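As a standalone illustration of the new condition in the single-reclaimer scenario above (counter names mirror psi's per-cpu tasks[] array; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        unsigned int nr_memstall = 1;           /* tasks stalled on memory */
        unsigned int nr_running = 1;            /* tasks on the runqueue */
        unsigned int nr_memstall_running = 1;   /* running tasks that are reclaiming */

        bool full_old = nr_memstall && !nr_running;
        bool full_new = nr_memstall && nr_running == nr_memstall_running;

        /* old: 0 (no FULL time accounted), new: 1 (FULL time accounted) */
        printf("old=%d new=%d\n", full_old, full_new);
        return 0;
}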
Signed-off-by: Brian Chen brianchen118@gmail.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/psi_types.h | 13 ++++++++++- kernel/sched/psi.c | 45 ++++++++++++++++++++++++--------------- kernel/sched/stats.h | 5 ++++- 3 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 7b0e36fdfb00..c17aeb774e23 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -43,7 +43,17 @@ enum psi_task_count { * don't have to special case any state tracking for it. */ NR_ONCPU, - NR_PSI_TASK_COUNTS = 4, + /* + * For IO and CPU stalls the presence of running/oncpu tasks + * in the domain means a partial rather than a full stall. + * For memory it's not so simple because of page reclaimers: + * they are running/oncpu while representing a stall. To tell + * whether a domain has productivity left or not, we need to + * distinguish between regular running (i.e. productive) + * threads and memstall ones. + */ + NR_MEMSTALL_RUNNING, + NR_PSI_TASK_COUNTS = 5, }; #endif
@@ -52,6 +62,7 @@ enum psi_task_count { #define TSK_MEMSTALL (1 << NR_MEMSTALL) #define TSK_RUNNING (1 << NR_RUNNING) #define TSK_ONCPU (1 << NR_ONCPU) +#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
/* Resources that workloads could be stalled on */ enum psi_res { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 9ecb558a1888..26b2628c43ef 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -34,13 +34,19 @@ * delayed on that resource such that nobody is advancing and the CPU * goes idle. This leaves both workload and CPU unproductive. * - * Naturally, the FULL state doesn't exist for the CPU resource at the - * system level, but exist at the cgroup level, means all non-idle tasks - * in a cgroup are delayed on the CPU resource which used by others outside - * of the cgroup or throttled by the cgroup cpu.max configuration. - * * SOME = nr_delayed_tasks != 0 - * FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0 + * FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0 + * + * What it means for a task to be productive is defined differently + * for each resource. For IO, productive means a running task. For + * memory, productive means a running task that isn't a reclaimer. For + * CPU, productive means an oncpu task. + * + * Naturally, the FULL state doesn't exist for the CPU resource at the + * system level, but exist at the cgroup level. At the cgroup level, + * FULL means all non-idle tasks in the cgroup are delayed on the CPU + * resource which is being used by others outside of the cgroup or + * throttled by the cgroup cpu.max configuration. * * The percentage of wallclock time spent in those compound stall * states gives pressure numbers between 0 and 100 for each resource, @@ -81,13 +87,13 @@ * * threads = min(nr_nonidle_tasks, nr_cpus) * SOME = min(nr_delayed_tasks / threads, 1) - * FULL = (threads - min(nr_running_tasks, threads)) / threads + * FULL = (threads - min(nr_productive_tasks, threads)) / threads * * For the 257 number crunchers on 256 CPUs, this yields: * * threads = min(257, 256) * SOME = min(1 / 256, 1) = 0.4% - * FULL = (256 - min(257, 256)) / 256 = 0% + * FULL = (256 - min(256, 256)) / 256 = 0% * * For the 1 out of 4 memory-delayed tasks, this yields: * @@ -112,7 +118,7 @@ * For each runqueue, we track: * * tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0) - * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu]) + * tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu]) * tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0) * * and then periodically aggregate: @@ -232,7 +238,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state) case PSI_MEM_SOME: return unlikely(tasks[NR_MEMSTALL]); case PSI_MEM_FULL: - return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]); + return unlikely(tasks[NR_MEMSTALL] && + tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]); case PSI_CPU_SOME: return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]); case PSI_CPU_FULL: @@ -709,10 +716,11 @@ static void psi_group_change(struct psi_group *group, int cpu, if (groupc->tasks[t]) { groupc->tasks[t]--; } else if (!psi_bug) { - printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n", + printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n", cpu, t, groupc->tasks[0], groupc->tasks[1], groupc->tasks[2], - groupc->tasks[3], clear, set); + groupc->tasks[3], groupc->tasks[4], + clear, set); psi_bug = 1; } } @@ -867,12 +875,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, int clear = TSK_ONCPU, set = 0;
/* - * When we're going to sleep, psi_dequeue() lets us handle - * TSK_RUNNING and TSK_IOWAIT here, where we can combine it - * with TSK_ONCPU and save walking common ancestors twice. + * When we're going to sleep, psi_dequeue() lets us + * handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and + * TSK_IOWAIT here, where we can combine it with + * TSK_ONCPU and save walking common ancestors twice. */ if (sleep) { clear |= TSK_RUNNING; + if (prev->in_memstall) + clear |= TSK_MEMSTALL_RUNNING; if (prev->in_iowait) set |= TSK_IOWAIT; } @@ -923,7 +934,7 @@ void psi_memstall_enter(unsigned long *flags) rq = this_rq_lock_irq(&rf);
current->in_memstall = 1; - psi_task_change(current, 0, TSK_MEMSTALL); + psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf); } @@ -954,7 +965,7 @@ void psi_memstall_leave(unsigned long *flags) rq = this_rq_lock_irq(&rf);
current->in_memstall = 0; - psi_task_change(current, TSK_MEMSTALL, 0); + psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf); } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5025ade880ea..b8b4e5b2694e 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -89,6 +89,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) if (static_branch_likely(&psi_disabled)) return;
+ if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + if (!wakeup || p->sched_psi_wake_requeue) { if (p->in_memstall) set |= TSK_MEMSTALL; @@ -119,7 +122,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) return;
if (p->in_memstall) - clear |= TSK_MEMSTALL; + clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
psi_task_change(p, clear, 0); }
From: Chengming Zhou zhouchengming@bytedance.com
mainline inclusion from mainline-v5.18-rc4 commit 890d550d7dbac7a31ecaa78732aa22be282bb6b8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I562O9 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Martin found it confusing when looking at the /proc/pressure/cpu output, and found no hint about the CPU "full" line in the psi documentation.
% cat /proc/pressure/cpu some avg10=0.92 avg60=0.91 avg300=0.73 total=933490489 full avg10=0.22 avg60=0.23 avg300=0.16 total=358783277
The PSI_CPU_FULL state was introduced by commit e7fcd7622823 ("psi: Add PSI_CPU_FULL state"), mainly for the cgroup level, but it is also counted at the system level as a side effect.
Naturally, the FULL state doesn't exist for the CPU resource at the system level. These "full" numbers can come from CPU idle scheduling latency. For example, t1 is the time when a task wakes up on an idle CPU and t2 is the time when the CPU picks it and switches to it; the delta (t2 - t1) is accounted as CPU_FULL time.
Another case in which all processes can be stalled is when all cgroups have been throttled at the same time, which is unlikely to happen.
Anyway, the CPU_FULL metric is meaningless and confusing at the system level. So this patch reports zeroes for CPU full at the system level and updates the psi documentation accordingly.
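From the interface side the file format is unchanged; only the system-level cpu "full" numbers become zero, while cgroup-level cpu.pressure files keep reporting meaningful "full" values. A minimal userspace sketch to observe this (illustration only):

#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/pressure/cpu", "r");

        if (!f)
                return 1;
        /* After this patch the "full" line here reads all zeroes. */
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}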
Fixes: e7fcd7622823 ("psi: Add PSI_CPU_FULL state") Reported-by: Martin Steigerwald Martin.Steigerwald@proact.de Suggested-by: Johannes Weiner hannes@cmpxchg.org Signed-off-by: Chengming Zhou zhouchengming@bytedance.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Acked-by: Johannes Weiner hannes@cmpxchg.org Link: https://lore.kernel.org/r/20220408121914.82855-1-zhouchengming@bytedance.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/accounting/psi.rst | 9 ++++----- kernel/sched/psi.c | 15 +++++++++------ 2 files changed, 13 insertions(+), 11 deletions(-)
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst index 860fe651d645..5e40b3f437f9 100644 --- a/Documentation/accounting/psi.rst +++ b/Documentation/accounting/psi.rst @@ -37,11 +37,7 @@ Pressure interface Pressure information for each resource is exported through the respective file in /proc/pressure/ -- cpu, memory, and io.
-The format for CPU is as such:: - - some avg10=0.00 avg60=0.00 avg300=0.00 total=0 - -and for memory and IO:: +The format is as such::
some avg10=0.00 avg60=0.00 avg300=0.00 total=0 full avg10=0.00 avg60=0.00 avg300=0.00 total=0 @@ -58,6 +54,9 @@ situation from a state where some tasks are stalled but the CPU is still doing productive work. As such, time spent in this subset of the stall state is tracked separately and exported in the "full" averages.
+CPU full is undefined at the system level, but has been reported +since 5.13, so it is set to zero for backward compatibility. + The ratios (in %) are tracked as recent trends over ten, sixty, and three hundred second windows, which gives insight into short term events as well as medium and long term trends. The total absolute stall time diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 26b2628c43ef..25f7d46ad7bd 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1079,14 +1079,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) { - unsigned long avg[3]; - u64 total; + unsigned long avg[3] = { 0, }; + u64 total = 0; int w;
- for (w = 0; w < 3; w++) - avg[w] = group->avg[res * 2 + full][w]; - total = div_u64(group->total[PSI_AVGS][res * 2 + full], - NSEC_PER_USEC); + /* CPU FULL is undefined at the system level */ + if (!(group == &psi_system && res == PSI_CPU && full)) { + for (w = 0; w < 3; w++) + avg[w] = group->avg[res * 2 + full][w]; + total = div_u64(group->total[PSI_AVGS][res * 2 + full], + NSEC_PER_USEC); + }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", full ? "full" : "some",
From: Paolo Bonzini pbonzini@redhat.com
mainline inclusion from mainline-v5.18 commit 9f46c187e2e680ecd9de7983e4d081c3391acc76 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I59I19 CVE: CVE-2022-1789
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
With shadow paging enabled, the INVPCID instruction results in a call to kvm_mmu_invpcid_gva. If INVPCID is executed with CR0.PG=0, the invlpg callback is not set and the result is a NULL pointer dereference. Fix it trivially by checking for mmu->invlpg before every call.
There are other possibilities:
- check for CR0.PG, because KVM (like all Intel processors after P5) flushes guest TLB on CR0.PG changes so that INVPCID/INVLPG are a nop with paging disabled
- check for EFER.LMA, because KVM syncs and flushes when switching MMU contexts outside of 64-bit mode
All of these are tricky, so go for the simple solution. This is CVE-2022-1789.
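The chosen fix is the plain guard-the-optional-callback pattern; a heavily simplified standalone sketch of the idiom (the types here are illustrative, not the real KVM structures):

#include <stddef.h>

struct mmu_model {
        /* optional; NULL with shadow paging when CR0.PG=0 */
        void (*invlpg)(unsigned long gva, unsigned long root);
};

void invalidate_gva(struct mmu_model *mmu, unsigned long gva,
                    unsigned long root)
{
        if (mmu->invlpg)
                mmu->invlpg(gva, root);
}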
Reported-by: Yongkang Jia kangel@zju.edu.cn Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Yipeng Zou zouyipeng@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/mmu/mmu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) mode change 100644 => 100755 arch/x86/kvm/mmu/mmu.c
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c old mode 100644 new mode 100755 index 20d29ae8ed70..9506cfcb86be --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5176,14 +5176,16 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) uint i;
if (pcid == kvm_get_active_pcid(vcpu)) { - mmu->invlpg(vcpu, gva, mmu->root_hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->root_hpa); tlb_flush = true; }
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) { if (VALID_PAGE(mmu->prev_roots[i].hpa) && pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) { - mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); + if (mmu->invlpg) + mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa); tlb_flush = true; } }
From: Martin Willi martin@strongswan.org
mainline inclusion from mainline-v5.13-rc1 commit 22b6034323fd736f260e00b9ea85c634abeb3446 category: bugfix bugzilla: 186880 https://gitee.com/openeuler/kernel/issues/I5A7W4
--------------------------------
If a generic XDP program changes the destination MAC address from/to multicast/broadcast, skb->pkt_type is updated to properly handle the packet when it is passed up the stack. When changing the MAC from/to the NIC's own MAC, however, PACKET_HOST/OTHERHOST is not updated, making the behavior different from that of native XDP.
Remember the PACKET_HOST/OTHERHOST state before calling the program in generic XDP, and update pkt_type accordingly if the destination MAC address has changed. As eth_type_trans() assumes a default pkt_type of PACKET_HOST, restore that before calling it.
The use case for this is an XDP program that wants to push received packets up the stack by rewriting the MAC to the NIC's MAC, for example cluster nodes sharing MAC addresses.
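As an illustration of that use case, a minimal XDP program sketch (hypothetical, not part of this patch) that rewrites the destination MAC to an assumed local MAC; with this fix, generic XDP then also restores pkt_type to PACKET_HOST so the stack accepts the frame:

// SPDX-License-Identifier: GPL-2.0
/* Compile with clang -O2 -target bpf and attach in generic (skb) mode. */
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>

/* Assumed local/shared NIC MAC; a real deployment would configure
 * this, e.g. via a BPF map.
 */
static const unsigned char local_mac[ETH_ALEN] = {
        0x02, 0x00, 0x00, 0x00, 0x00, 0x01
};

SEC("xdp")
int rewrite_dmac(struct xdp_md *ctx)
{
        void *data = (void *)(long)ctx->data;
        void *data_end = (void *)(long)ctx->data_end;
        struct ethhdr *eth = data;
        int i;

        if ((void *)(eth + 1) > data_end)
                return XDP_PASS;

        /* Redirect the frame to the local stack by rewriting h_dest. */
        for (i = 0; i < ETH_ALEN; i++)
                eth->h_dest[i] = local_mac[i];

        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";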
Fixes: 297249569932 ("net: fix generic XDP to handle if eth header was mangled") Signed-off-by: Martin Willi martin@strongswan.org Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: Toke Høiland-Jørgensen toke@redhat.com Link: https://lore.kernel.org/bpf/20210419141559.8611-1-martin@strongswan.org
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com Conflict: net/core/dev.c Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/core/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/core/dev.c b/net/core/dev.c index f20f0d5e5280..406ed8c7f22d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4621,9 +4621,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct netdev_rx_queue *rxqueue; void *orig_data, *orig_data_end; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int hlen, off; u32 mac_len;
@@ -4670,6 +4670,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto;
@@ -4700,8 +4701,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); }