From: Shakeel Butt shakeelb@google.com
[ Upstream commit e876ecc67db80dfdb8e237f71e5b43bb88ae549c ]
We are testing network memory accounting in our setup and noticed inconsistent network memory usage and often unrelated cgroups network usage correlates with testing workload. On further inspection, it seems like mem_cgroup_sk_alloc() and cgroup_sk_alloc() are broken in irq context specially for cgroup v1.
mem_cgroup_sk_alloc() and cgroup_sk_alloc() can be called in irq context and kind of assumes that this can only happen from sk_clone_lock() and the source sock object has already associated cgroup. However in cgroup v1, where network memory accounting is opt-in, the source sock can be unassociated with any cgroup and the new cloned sock can get associated with unrelated interrupted cgroup.
Cgroup v2 can also suffer if the source sock object was created by process in the root cgroup or if sk_alloc() is called in irq context. The fix is to just do nothing in interrupt.
WARNING: Please note that about half of the TCP sockets are allocated from the IRQ context, so, memory used by such sockets will not be accouted by the memcg.
The stack trace of mem_cgroup_sk_alloc() from IRQ-context:
CPU: 70 PID: 12720 Comm: ssh Tainted: 5.6.0-smp-DEV #1 Hardware name: ... Call Trace: <IRQ> dump_stack+0x57/0x75 mem_cgroup_sk_alloc+0xe9/0xf0 sk_clone_lock+0x2a7/0x420 inet_csk_clone_lock+0x1b/0x110 tcp_create_openreq_child+0x23/0x3b0 tcp_v6_syn_recv_sock+0x88/0x730 tcp_check_req+0x429/0x560 tcp_v6_rcv+0x72d/0xa40 ip6_protocol_deliver_rcu+0xc9/0x400 ip6_input+0x44/0xd0 ? ip6_protocol_deliver_rcu+0x400/0x400 ip6_rcv_finish+0x71/0x80 ipv6_rcv+0x5b/0xe0 ? ip6_sublist_rcv+0x2e0/0x2e0 process_backlog+0x108/0x1e0 net_rx_action+0x26b/0x460 __do_softirq+0x104/0x2a6 do_softirq_own_stack+0x2a/0x40 </IRQ> do_softirq.part.19+0x40/0x50 __local_bh_enable_ip+0x51/0x60 ip6_finish_output2+0x23d/0x520 ? ip6table_mangle_hook+0x55/0x160 __ip6_finish_output+0xa1/0x100 ip6_finish_output+0x30/0xd0 ip6_output+0x73/0x120 ? __ip6_finish_output+0x100/0x100 ip6_xmit+0x2e3/0x600 ? ipv6_anycast_cleanup+0x50/0x50 ? inet6_csk_route_socket+0x136/0x1e0 ? skb_free_head+0x1e/0x30 inet6_csk_xmit+0x95/0xf0 __tcp_transmit_skb+0x5b4/0xb20 __tcp_send_ack.part.60+0xa3/0x110 tcp_send_ack+0x1d/0x20 tcp_rcv_state_process+0xe64/0xe80 ? tcp_v6_connect+0x5d1/0x5f0 tcp_v6_do_rcv+0x1b1/0x3f0 ? tcp_v6_do_rcv+0x1b1/0x3f0 __release_sock+0x7f/0xd0 release_sock+0x30/0xa0 __inet_stream_connect+0x1c3/0x3b0 ? prepare_to_wait+0xb0/0xb0 inet_stream_connect+0x3b/0x60 __sys_connect+0x101/0x120 ? __sys_getsockopt+0x11b/0x140 __x64_sys_connect+0x1a/0x20 do_syscall_64+0x51/0x200 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The stack trace of mem_cgroup_sk_alloc() from IRQ-context: Fixes: 2d7580738345 ("mm: memcontrol: consolidate cgroup socket tracking") Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") Signed-off-by: Shakeel Butt shakeelb@google.com Reviewed-by: Roman Gushchin guro@fb.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/cgroup/cgroup.c | 4 ++++ mm/memcontrol.c | 4 ++++ 2 files changed, 8 insertions(+)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 78ef274b036e..94f4d6a73d70 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5928,6 +5928,10 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) return; }
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */ + if (in_interrupt()) + return; + rcu_read_lock();
while (true) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5d62c1fada01..f90672e7b4e6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6328,6 +6328,10 @@ void mem_cgroup_sk_alloc(struct sock *sk) return; }
+ /* Do not associate the sock with unrelated interrupted task's memcg. */ + if (in_interrupt()) + return; + rcu_read_lock(); memcg = mem_cgroup_from_task(current); if (memcg == root_mem_cgroup)
From: Shakeel Butt shakeelb@google.com
[ Upstream commit d752a4986532cb6305dfd5290a614cde8072769d ]
If a TCP socket is allocated in IRQ context or cloned from unassociated (i.e. not associated to a memcg) in IRQ context then it will remain unassociated for its whole life. Almost half of the TCPs created on the system are created in IRQ context, so, memory used by such sockets will not be accounted by the memcg.
This issue is more widespread in cgroup v1 where network memory accounting is opt-in but it can happen in cgroup v2 if the source socket for the cloning was created in root memcg.
To fix the issue, just do the association of the sockets at the accept() time in the process context and then force charge the memory buffer already used and reserved by the socket.
Signed-off-by: Shakeel Butt shakeelb@google.com Reviewed-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 14 -------------- net/core/sock.c | 5 ++++- net/ipv4/inet_connection_sock.c | 20 ++++++++++++++++++++ 3 files changed, 24 insertions(+), 15 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f90672e7b4e6..7c8401faa676 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6314,20 +6314,6 @@ void mem_cgroup_sk_alloc(struct sock *sk) if (!mem_cgroup_sockets_enabled) return;
- /* - * Socket cloning can throw us here with sk_memcg already - * filled. It won't however, necessarily happen from - * process context. So the test for root memcg given - * the current task's memcg won't help us in this case. - * - * Respecting the original socket's memcg is a better - * decision in this case. - */ - if (sk->sk_memcg) { - css_get(&sk->sk_memcg->css); - return; - } - /* Do not associate the sock with unrelated interrupted task's memcg. */ if (in_interrupt()) return; diff --git a/net/core/sock.c b/net/core/sock.c index 910d0e0ad5c4..c1ecb35b8b83 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1711,7 +1711,10 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE); - mem_cgroup_sk_alloc(newsk); + + /* sk->sk_memcg will be populated at accept() time */ + newsk->sk_memcg = NULL; + cgroup_sk_alloc(&newsk->sk_cgrp_data);
rcu_read_lock(); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b7bb29f46b73..12455c060dc6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -486,6 +486,26 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) } spin_unlock_bh(&queue->fastopenq.lock); } + + if (mem_cgroup_sockets_enabled) { + int amt; + + /* atomically get the memory usage, set and charge the + * sk->sk_memcg. + */ + lock_sock(newsk); + + /* The sk has not been accepted yet, no need to look at + * sk->sk_wmem_queued. + */ + amt = sk_mem_pages(newsk->sk_forward_alloc + + atomic_read(&sk->sk_rmem_alloc)); + mem_cgroup_sk_alloc(newsk); + if (newsk->sk_memcg && amt) + mem_cgroup_charge_skmem(newsk->sk_memcg, amt); + + release_sock(newsk); + } out: release_sock(sk); if (req)
From: Eric Dumazet edumazet@google.com
commit 06669ea346e476a5339033d77ef175566a40efbb upstream.
Locking newsk while still holding the listener lock triggered a lockdep splat [1]
We can simply move the memcg code after we release the listener lock, as this can also help if multiple threads are sharing a common listener.
Also fix a typo while reading socket sk_rmem_alloc.
[1] WARNING: possible recursive locking detected 5.6.0-rc3-syzkaller #0 Not tainted -------------------------------------------- syz-executor598/9524 is trying to acquire lock: ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b8b90 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492
but task is already holding lock: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445
other info that might help us debug this: Possible unsafe locking scenario:
CPU0 ---- lock(sk_lock-AF_INET6); lock(sk_lock-AF_INET6);
*** DEADLOCK ***
May be due to missing lock nesting notation
1 lock held by syz-executor598/9524: #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: lock_sock include/net/sock.h:1541 [inline] #0: ffff88808b5b9590 (sk_lock-AF_INET6){+.+.}, at: inet_csk_accept+0x8d/0xd30 net/ipv4/inet_connection_sock.c:445
stack backtrace: CPU: 0 PID: 9524 Comm: syz-executor598 Not tainted 5.6.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x188/0x20d lib/dump_stack.c:118 print_deadlock_bug kernel/locking/lockdep.c:2370 [inline] check_deadlock kernel/locking/lockdep.c:2411 [inline] validate_chain kernel/locking/lockdep.c:2954 [inline] __lock_acquire.cold+0x114/0x288 kernel/locking/lockdep.c:3954 lock_acquire+0x197/0x420 kernel/locking/lockdep.c:4484 lock_sock_nested+0xc5/0x110 net/core/sock.c:2947 lock_sock include/net/sock.h:1541 [inline] inet_csk_accept+0x69f/0xd30 net/ipv4/inet_connection_sock.c:492 inet_accept+0xe9/0x7c0 net/ipv4/af_inet.c:734 __sys_accept4_file+0x3ac/0x5b0 net/socket.c:1758 __sys_accept4+0x53/0x90 net/socket.c:1809 __do_sys_accept4 net/socket.c:1821 [inline] __se_sys_accept4 net/socket.c:1818 [inline] __x64_sys_accept4+0x93/0xf0 net/socket.c:1818 do_syscall_64+0xf6/0x790 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x4445c9 Code: e8 0c 0d 03 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb 08 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffc35b37608 EFLAGS: 00000246 ORIG_RAX: 0000000000000120 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00000000004445c9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000306777 R09: 0000000000306777 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00000000004053d0 R14: 0000000000000000 R15: 0000000000000000
Fixes: d752a4986532 ("net: memcg: late association of sock to memcg") Signed-off-by: Eric Dumazet edumazet@google.com Cc: Shakeel Butt shakeelb@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/inet_connection_sock.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 12455c060dc6..67e5f05ca757 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -487,27 +487,27 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) spin_unlock_bh(&queue->fastopenq.lock); }
- if (mem_cgroup_sockets_enabled) { +out: + release_sock(sk); + if (newsk && mem_cgroup_sockets_enabled) { int amt;
/* atomically get the memory usage, set and charge the - * sk->sk_memcg. + * newsk->sk_memcg. */ lock_sock(newsk);
- /* The sk has not been accepted yet, no need to look at - * sk->sk_wmem_queued. + /* The socket has not been accepted yet, no need to look at + * newsk->sk_wmem_queued. */ amt = sk_mem_pages(newsk->sk_forward_alloc + - atomic_read(&sk->sk_rmem_alloc)); + atomic_read(&newsk->sk_rmem_alloc)); mem_cgroup_sk_alloc(newsk); if (newsk->sk_memcg && amt) mem_cgroup_charge_skmem(newsk->sk_memcg, amt);
release_sock(newsk); } -out: - release_sock(sk); if (req) reqsk_put(req); return newsk;
From: Cong Wang xiyou.wangcong@gmail.com
[ Upstream commit ad0f75e5f57ccbceec13274e1e242f2b5a6397ed ]
When we clone a socket in sk_clone_lock(), its sk_cgrp_data is copied, so the cgroup refcnt must be taken too. And, unlike the sk_alloc() path, sock_update_netprioidx() is not called here. Therefore, it is safe and necessary to grab the cgroup refcnt even when cgroup_sk_alloc is disabled.
sk_clone_lock() is in BH context anyway, the in_interrupt() would terminate this function if called there. And for sk_alloc() skcd->val is always zero. So it's safe to factor out the code to make it more readable.
The global variable 'cgroup_sk_alloc_disabled' is used to determine whether to take these reference counts. It is impossible to make the reference counting correct unless we save this bit of information in skcd->val. So, add a new bit there to record whether the socket has already taken the reference counts. This obviously relies on kmalloc() to align cgroup pointers to at least 4 bytes, ARCH_KMALLOC_MINALIGN is certainly larger than that.
This bug seems to be introduced since the beginning, commit d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets") tried to fix it but not compeletely. It seems not easy to trigger until the recent commit 090e28b229af ("netprio_cgroup: Fix unlimited memory leak of v2 cgroups") was merged.
Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Cameron Berkenpas cam@neo-zeon.de Reported-by: Peter Geis pgwipeout@gmail.com Reported-by: Lu Fengqi lufq.fnst@cn.fujitsu.com Reported-by: Daniƫl Sonck dsonck92@gmail.com Reported-by: Zhang Qiang qiang.zhang@windriver.com Tested-by: Cameron Berkenpas cam@neo-zeon.de Tested-by: Peter Geis pgwipeout@gmail.com Tested-by: Thomas Lamprecht t.lamprecht@proxmox.com Cc: Daniel Borkmann daniel@iogearbox.net Cc: Zefan Li lizefan@huawei.com Cc: Tejun Heo tj@kernel.org Cc: Roman Gushchin guro@fb.com Signed-off-by: Cong Wang xiyou.wangcong@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/cgroup-defs.h | 6 ++++-- include/linux/cgroup.h | 4 +++- kernel/cgroup/cgroup.c | 29 ++++++++++++++++++----------- net/core/sock.c | 2 +- 4 files changed, 26 insertions(+), 15 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index a01ebb630abc..56b073368374 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -755,7 +755,8 @@ struct sock_cgroup_data { union { #ifdef __LITTLE_ENDIAN struct { - u8 is_data; + u8 is_data : 1; + u8 no_refcnt : 1; u8 padding; u16 prioidx; u32 classid; @@ -765,7 +766,8 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 is_data; + u8 no_refcnt : 1; + u8 is_data : 1; } __packed; #endif u64 val; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b4854b48a4f3..85d3d608d7e4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -809,6 +809,7 @@ extern spinlock_t cgroup_sk_update_lock;
void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); +void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd);
static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) @@ -822,7 +823,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) */ v = READ_ONCE(skcd->val);
- if (v & 1) + if (v & 3) return &cgrp_dfl_root.cgrp;
return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; @@ -834,6 +835,7 @@ static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) #else /* CONFIG_CGROUP_DATA */
static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} +static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
#endif /* CONFIG_CGROUP_DATA */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 94f4d6a73d70..69a0b985af8e 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5914,17 +5914,8 @@ void cgroup_sk_alloc_disable(void)
void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { - if (cgroup_sk_alloc_disabled) - return; - - /* Socket clone path */ - if (skcd->val) { - /* - * We might be cloning a socket which is left in an empty - * cgroup and the cgroup might have already been rmdir'd. - * Don't use cgroup_get_live(). - */ - cgroup_get(sock_cgroup_ptr(skcd)); + if (cgroup_sk_alloc_disabled) { + skcd->no_refcnt = 1; return; }
@@ -5948,8 +5939,24 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) rcu_read_unlock(); }
+void cgroup_sk_clone(struct sock_cgroup_data *skcd) +{ + /* Socket clone path */ + if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ + cgroup_get(sock_cgroup_ptr(skcd)); + } +} + void cgroup_sk_free(struct sock_cgroup_data *skcd) { + if (skcd->no_refcnt) + return; + cgroup_put(sock_cgroup_ptr(skcd)); }
diff --git a/net/core/sock.c b/net/core/sock.c index c1ecb35b8b83..fa2f9a6cf84e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1715,7 +1715,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL;
- cgroup_sk_alloc(&newsk->sk_cgrp_data); + cgroup_sk_clone(&newsk->sk_cgrp_data);
rcu_read_lock(); filter = rcu_dereference(sk->sk_filter);
From: Chen Zhou chenzhou10@huawei.com
hulk inclusion category: bugfix bugzilla: NA CVE: NA
-------------------------------------------------
Commit 354e249da4b0 ("cgroup: fix cgroup_sk_alloc() for sk_clone_lock()") modifies struct sock_cgroup_data, which made the KABI broken.
Use is_data to save the val of is_data and no_refcnt, bit 0 of is_data : is_data, bit 1 of is_data: no_refcnt.
Signed-off-by: Chen Zhou chenzhou10@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/cgroup-defs.h | 11 +++++++---- kernel/cgroup/cgroup.c | 4 ++-- 2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 56b073368374..cad1a82ca7d7 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -753,10 +753,14 @@ static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) {} */ struct sock_cgroup_data { union { + /* + * Field is_data is the combination of is_data and no_refcnt, + * bit 0: is_data, + * bit 1: no_refcnt. + */ #ifdef __LITTLE_ENDIAN struct { - u8 is_data : 1; - u8 no_refcnt : 1; + u8 is_data; u8 padding; u16 prioidx; u32 classid; @@ -766,8 +770,7 @@ struct sock_cgroup_data { u32 classid; u16 prioidx; u8 padding; - u8 no_refcnt : 1; - u8 is_data : 1; + u8 is_data; } __packed; #endif u64 val; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 69a0b985af8e..258331394bfe 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5915,7 +5915,7 @@ void cgroup_sk_alloc_disable(void) void cgroup_sk_alloc(struct sock_cgroup_data *skcd) { if (cgroup_sk_alloc_disabled) { - skcd->no_refcnt = 1; + skcd->is_data |= 2; return; }
@@ -5954,7 +5954,7 @@ void cgroup_sk_clone(struct sock_cgroup_data *skcd)
void cgroup_sk_free(struct sock_cgroup_data *skcd) { - if (skcd->no_refcnt) + if (skcd->is_data & 2) return;
cgroup_put(sock_cgroup_ptr(skcd));