Patch set for CVE-2022-48689.
Eric Dumazet (1):
  tcp: TX zerocopy should not sense pfmemalloc status

Paolo Abeni (1):
  tcp: factor out tcp_build_frag()

Pavel Begunkov (1):
  net: introduce __skb_fill_page_desc_noacc

Yunsheng Lin (1):
  net: skbuff: update comment about pfmemalloc propagating

 include/linux/skbuff.h |  41 +++++++++++---
 include/net/tcp.h      |   3 ++
 net/core/datagram.c    |   2 +-
 net/ipv4/tcp.c         | 119 +++++++++++++++++++++++------------
 4 files changed, 105 insertions(+), 60 deletions(-)
From: Paolo Abeni <pabeni@redhat.com>
mainline inclusion
from mainline-v5.11-rc1
commit b796d04bd014fd24e60ab4a6c604b258ac947825
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9LK5X
CVE: CVE-2022-48689
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Will be needed by the next patch, as MPTCP needs to handle the error/memory-allocation-needed path directly.
No functional changes intended.
Additionally let MPTCP code access the tcp_remove_empty_skb() helper.
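For orientation only (not part of the patch), a stripped-down caller of the new helper would look roughly like the reworked do_tcp_sendpages() loop further below; the function name example_sendpages is made up, and the push/wait-for-memory handling of the real code is elided:

static ssize_t example_sendpages(struct sock *sk, struct page *page,
				 int offset, size_t size, int flags)
{
	ssize_t copied = 0;
	int size_goal;

	/* The real code keeps the returned mss for tcp_push(); elided here. */
	tcp_send_mss(sk, &size_goal, flags);

	while (size > 0) {
		size_t copy = size;
		struct sk_buff *skb;

		/* NULL means "no memory right now"; the caller decides
		 * whether to wait, push what it has, or bail out.
		 */
		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
		if (!skb)
			break;

		copied += copy;
		offset += copy;
		size -= copy;
	}

	/* An entailed-but-empty tail skb can now be dropped from outside tcp.c. */
	if (!copied)
		tcp_remove_empty_skb(sk, tcp_write_queue_tail(sk));

	return copied;
}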
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Conflicts:
	net/ipv4/tcp.c
[The version has backported commit 5b7b4afead62.]
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
---
 include/net/tcp.h |   3 ++
 net/ipv4/tcp.c    | 119 ++++++++++++++++++++++++--------------
 2 files changed, 70 insertions(+), 52 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index bb7422b87ceb..e033ec6ff07f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -319,6 +319,7 @@ void tcp_shutdown(struct sock *sk, int how);
 int tcp_v4_early_demux(struct sk_buff *skb);
 int tcp_v4_rcv(struct sk_buff *skb);
 
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb);
 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
 int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
@@ -326,6 +327,8 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
 		 int flags);
 int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
 			size_t size, int flags);
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+			       struct page *page, int offset, size_t *size);
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			 size_t size, int flags);
 int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ec1d1b967a0c..5940823adaa5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -962,7 +962,7 @@ int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
  * importantly be able to generate EPOLLOUT for Edge Trigger epoll()
  * users.
  */
-static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
+void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 {
 	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
 		tcp_unlink_write_queue(skb, sk);
@@ -972,6 +972,68 @@ static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 	}
 }
 
+struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
+			       struct page *page, int offset, size_t *size)
+{
+	struct sk_buff *skb = tcp_write_queue_tail(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	bool can_coalesce;
+	int copy, i;
+
+	if (!skb || (copy = size_goal - skb->len) <= 0 ||
+	    !tcp_skb_can_collapse_to(skb)) {
+new_segment:
+		if (!sk_stream_memory_free(sk))
+			return NULL;
+
+		skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
+					  tcp_rtx_and_write_queues_empty(sk));
+		if (!skb)
+			return NULL;
+
+#ifdef CONFIG_TLS_DEVICE
+		skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+		skb_entail(sk, skb);
+		copy = size_goal;
+	}
+
+	if (copy > *size)
+		copy = *size;
+
+	i = skb_shinfo(skb)->nr_frags;
+	can_coalesce = skb_can_coalesce(skb, i, page, offset);
+	if (!can_coalesce && i >= sysctl_max_skb_frags) {
+		tcp_mark_push(tp, skb);
+		goto new_segment;
+	}
+	if (!sk_wmem_schedule(sk, copy))
+		return NULL;
+
+	if (can_coalesce) {
+		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+	} else {
+		get_page(page);
+		skb_fill_page_desc(skb, i, page, offset, copy);
+	}
+
+	if (!(flags & MSG_NO_SHARED_FRAGS))
+		skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+	skb->len += copy;
+	skb->data_len += copy;
+	skb->truesize += copy;
+	sk_wmem_queued_add(sk, copy);
+	sk_mem_charge(sk, copy);
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
+	TCP_SKB_CB(skb)->end_seq += copy;
+	tcp_skb_pcount_set(skb, 0);
+
+	*size = copy;
+	return skb;
+}
+
 ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 			 size_t size, int flags)
 {
@@ -1007,60 +1069,13 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
 		goto out_err;
 
 	while (size > 0) {
-		struct sk_buff *skb = tcp_write_queue_tail(sk);
-		int copy, i;
-		bool can_coalesce;
-
-		if (!skb || (copy = size_goal - skb->len) <= 0 ||
-		    !tcp_skb_can_collapse_to(skb)) {
-new_segment:
-			if (!sk_stream_memory_free(sk))
-				goto wait_for_space;
-
-			skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
-					tcp_rtx_and_write_queues_empty(sk));
-			if (!skb)
-				goto wait_for_space;
-
-#ifdef CONFIG_TLS_DEVICE
-			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
-#endif
-			skb_entail(sk, skb);
-			copy = size_goal;
-		}
+		struct sk_buff *skb;
+		size_t copy = size;
 
-		if (copy > size)
-			copy = size;
-
-		i = skb_shinfo(skb)->nr_frags;
-		can_coalesce = skb_can_coalesce(skb, i, page, offset);
-		if (!can_coalesce && i >= sysctl_max_skb_frags) {
-			tcp_mark_push(tp, skb);
-			goto new_segment;
-		}
-		if (!sk_wmem_schedule(sk, copy))
+		skb = tcp_build_frag(sk, size_goal, flags, page, offset, &copy);
+		if (!skb)
 			goto wait_for_space;
 
-		if (can_coalesce) {
-			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
-		} else {
-			get_page(page);
-			skb_fill_page_desc(skb, i, page, offset, copy);
-		}
-
-		if (!(flags & MSG_NO_SHARED_FRAGS))
-			skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
-
-		skb->len += copy;
-		skb->data_len += copy;
-		skb->truesize += copy;
-		sk_wmem_queued_add(sk, copy);
-		sk_mem_charge(sk, copy);
-		skb->ip_summed = CHECKSUM_PARTIAL;
-		WRITE_ONCE(tp->write_seq, tp->write_seq + copy);
-		TCP_SKB_CB(skb)->end_seq += copy;
-		tcp_skb_pcount_set(skb, 0);
-
 		if (!copied)
 			TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH;
 
From: Pavel Begunkov <asml.silence@gmail.com>
mainline inclusion
from mainline-v6.0-rc1
commit 84ce071e38a6e25ea3ea91188e5482ac1f17b3af
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9LK5X
CVE: CVE-2022-48689
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Managed pages contain pinned userspace pages and are controlled by upper layers; there is no need to track skb->pfmemalloc for them. Introduce a helper for filling frags while ignoring page tracking; it will be needed later.
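As a sketch of the intended use (a later patch in this series adds essentially this wrapper as skb_fill_page_desc_noacc()), a caller holding pinned user pages fills the frag and bumps nr_frags itself, skipping the pfmemalloc accounting that __skb_fill_page_desc() performs; the function name fill_pinned_user_frag is illustrative only:

static void fill_pinned_user_frag(struct sk_buff *skb, int i,
				  struct page *page, int off, int size)
{
	struct skb_shared_info *shinfo = skb_shinfo(skb);

	/* No skb->pfmemalloc update: the page is pinned user memory,
	 * not something allocated from the emergency reserves.
	 */
	__skb_fill_page_desc_noacc(shinfo, i, page, off, size);
	shinfo->nr_frags = i + 1;
}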
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
---
 include/linux/skbuff.h | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c3c89b6cc42f..2c7748721089 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2173,6 +2173,22 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 	return skb_headlen(skb) + __skb_pagelen(skb);
 }
 
+static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
+					      int i, struct page *page,
+					      int off, int size)
+{
+	skb_frag_t *frag = &shinfo->frags[i];
+
+	/*
+	 * Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
+	 */
+	frag->bv_page = page;
+	frag->bv_offset = off;
+	skb_frag_size_set(frag, size);
+}
+
 /**
  * __skb_fill_page_desc - initialise a paged fragment in an skb
  * @skb: buffer containing fragment to be initialised
@@ -2189,17 +2205,7 @@ static inline unsigned int skb_pagelen(const struct sk_buff *skb)
 static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 					struct page *page, int off, int size)
 {
-	skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-	/*
-	 * Propagate page pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page but rely
-	 * on page_is_pfmemalloc doing the right thing(tm).
-	 */
-	frag->bv_page = page;
-	frag->bv_offset = off;
-	skb_frag_size_set(frag, size);
-
+	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
From: Yunsheng Lin <linyunsheng@huawei.com>
mainline inclusion
from mainline-v6.5-rc1
commit 8b33485128ad932f807f4535e0b440733d8b5808
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9LK5X
CVE: CVE-2022-48689
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
__skb_fill_page_desc_noacc() does not do any pfmemalloc propagating, yet it still carried the comment about it; commit 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") may have moved that comment there by accident. Move it back to __skb_fill_page_desc(), which is the function that is supposed to do the pfmemalloc propagating.
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
CC: Pavel Begunkov <asml.silence@gmail.com>
Reviewed-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/20230515050107.46397-1-linyunsheng@huawei.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Conflicts:
	include/linux/skbuff.h
[The version does not include b51f4113ebb0.]
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
---
 include/linux/skbuff.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 2c7748721089..b356e1127e15 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2179,11 +2179,6 @@ static inline void __skb_fill_page_desc_noacc(struct skb_shared_info *shinfo,
 {
 	skb_frag_t *frag = &shinfo->frags[i];
 
-	/*
-	 * Propagate page pfmemalloc to the skb if we can. The problem is
-	 * that not all callers have unique ownership of the page but rely
-	 * on page_is_pfmemalloc doing the right thing(tm).
-	 */
 	frag->bv_page = page;
 	frag->bv_offset = off;
 	skb_frag_size_set(frag, size);
@@ -2206,6 +2201,11 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i,
 					struct page *page, int off, int size)
 {
 	__skb_fill_page_desc_noacc(skb_shinfo(skb), i, page, off, size);
+
+	/* Propagate page pfmemalloc to the skb if we can. The problem is
+	 * that not all callers have unique ownership of the page but rely
+	 * on page_is_pfmemalloc doing the right thing(tm).
+	 */
 	page = compound_head(page);
 	if (page_is_pfmemalloc(page))
 		skb->pfmemalloc = true;
From: Eric Dumazet <edumazet@google.com>
mainline inclusion
from mainline-v6.0-rc5
commit 3261400639463a853ba2b3be8bd009c2a8089775
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9LK5X
CVE: CVE-2022-48689
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We got a recent syzbot report [1] showing a possible misuse of pfmemalloc page status in TCP zerocopy paths.
Indeed, for pages coming from user space or other layers, using page_is_pfmemalloc() is moot, and could give false positives.
There have been attempts to make page_is_pfmemalloc() more robust, but not using it in the first place in this context is probably better, removing cpu cycles.
Note to stable teams:
You need to backport 84ce071e38a6 ("net: introduce __skb_fill_page_desc_noacc") as a prereq.
Race is more probable after commit c07aea3ef4d4 ("mm: add a signature in struct page") because page_is_pfmemalloc() is now using low order bit from page->lru.next, which can change more often than page->index.
Low order bit should never be set for lru.next (when used as an anchor in LRU list), so KCSAN report is mostly a false positive.
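To make the racing accesses concrete, the two stacks in the report below touch the same word of struct page, roughly as follows (simplified illustration, not kernel source):

/*
 *   CPU 0: lru_add_fn()                CPU 1: tcp_build_frag()
 *     list_add(&page->lru, ...)          skb_fill_page_desc()
 *       writes page->lru.next              __skb_fill_page_desc()
 *                                            page_is_pfmemalloc(page)
 *                                              reads a low-order bit of
 *                                              page->lru.next
 */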
Backporting to older kernel versions seems not necessary.
[1] BUG: KCSAN: data-race in lru_add_fn / tcp_build_frag
write to 0xffffea0004a1d2c8 of 8 bytes by task 18600 on cpu 0:
 __list_add include/linux/list.h:73 [inline]
 list_add include/linux/list.h:88 [inline]
 lruvec_add_folio include/linux/mm_inline.h:105 [inline]
 lru_add_fn+0x440/0x520 mm/swap.c:228
 folio_batch_move_lru+0x1e1/0x2a0 mm/swap.c:246
 folio_batch_add_and_move mm/swap.c:263 [inline]
 folio_add_lru+0xf1/0x140 mm/swap.c:490
 filemap_add_folio+0xf8/0x150 mm/filemap.c:948
 __filemap_get_folio+0x510/0x6d0 mm/filemap.c:1981
 pagecache_get_page+0x26/0x190 mm/folio-compat.c:104
 grab_cache_page_write_begin+0x2a/0x30 mm/folio-compat.c:116
 ext4_da_write_begin+0x2dd/0x5f0 fs/ext4/inode.c:2988
 generic_perform_write+0x1d4/0x3f0 mm/filemap.c:3738
 ext4_buffered_write_iter+0x235/0x3e0 fs/ext4/file.c:270
 ext4_file_write_iter+0x2e3/0x1210
 call_write_iter include/linux/fs.h:2187 [inline]
 new_sync_write fs/read_write.c:491 [inline]
 vfs_write+0x468/0x760 fs/read_write.c:578
 ksys_write+0xe8/0x1a0 fs/read_write.c:631
 __do_sys_write fs/read_write.c:643 [inline]
 __se_sys_write fs/read_write.c:640 [inline]
 __x64_sys_write+0x3e/0x50 fs/read_write.c:640
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
read to 0xffffea0004a1d2c8 of 8 bytes by task 18611 on cpu 1:
 page_is_pfmemalloc include/linux/mm.h:1740 [inline]
 __skb_fill_page_desc include/linux/skbuff.h:2422 [inline]
 skb_fill_page_desc include/linux/skbuff.h:2443 [inline]
 tcp_build_frag+0x613/0xb20 net/ipv4/tcp.c:1018
 do_tcp_sendpages+0x3e8/0xaf0 net/ipv4/tcp.c:1075
 tcp_sendpage_locked net/ipv4/tcp.c:1140 [inline]
 tcp_sendpage+0x89/0xb0 net/ipv4/tcp.c:1150
 inet_sendpage+0x7f/0xc0 net/ipv4/af_inet.c:833
 kernel_sendpage+0x184/0x300 net/socket.c:3561
 sock_sendpage+0x5a/0x70 net/socket.c:1054
 pipe_to_sendpage+0x128/0x160 fs/splice.c:361
 splice_from_pipe_feed fs/splice.c:415 [inline]
 __splice_from_pipe+0x222/0x4d0 fs/splice.c:559
 splice_from_pipe fs/splice.c:594 [inline]
 generic_splice_sendpage+0x89/0xc0 fs/splice.c:743
 do_splice_from fs/splice.c:764 [inline]
 direct_splice_actor+0x80/0xa0 fs/splice.c:931
 splice_direct_to_actor+0x305/0x620 fs/splice.c:886
 do_splice_direct+0xfb/0x180 fs/splice.c:974
 do_sendfile+0x3bf/0x910 fs/read_write.c:1249
 __do_sys_sendfile64 fs/read_write.c:1317 [inline]
 __se_sys_sendfile64 fs/read_write.c:1303 [inline]
 __x64_sys_sendfile64+0x10c/0x150 fs/read_write.c:1303
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x2b/0x70 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x63/0xcd
value changed: 0x0000000000000000 -> 0xffffea0004a1d288
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 18611 Comm: syz-executor.4 Not tainted 6.0.0-rc2-syzkaller-00248-ge022620b5d05-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/22/2022
Fixes: c07aea3ef4d4 ("mm: add a signature in struct page")
Reported-by: syzbot <syzkaller@googlegroups.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Ziyang Xuan <william.xuanziyang@huawei.com>
---
 include/linux/skbuff.h | 21 +++++++++++++++++++++
 net/core/datagram.c    |  2 +-
 net/ipv4/tcp.c         |  2 +-
 3 files changed, 23 insertions(+), 2 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index b356e1127e15..ce3dfed6b915 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2232,6 +2232,27 @@ static inline void skb_fill_page_desc(struct sk_buff *skb, int i,
 	skb_shinfo(skb)->nr_frags = i + 1;
 }
 
+/**
+ * skb_fill_page_desc_noacc - initialise a paged fragment in an skb
+ * @skb: buffer containing fragment to be initialised
+ * @i: paged fragment index to initialise
+ * @page: the page to use for this fragment
+ * @off: the offset to the data with @page
+ * @size: the length of the data
+ *
+ * Variant of skb_fill_page_desc() which does not deal with
+ * pfmemalloc, if page is not owned by us.
+ */
+static inline void skb_fill_page_desc_noacc(struct sk_buff *skb, int i,
+					    struct page *page, int off,
+					    int size)
+{
+	struct skb_shared_info *shinfo = skb_shinfo(skb);
+
+	__skb_fill_page_desc_noacc(shinfo, i, page, off, size);
+	shinfo->nr_frags = i + 1;
+}
+
 void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off,
 		     int size, unsigned int truesize);
 
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 9e77695d1bdc..b5f4c1b93779 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb,
 				page_ref_sub(last_head, refs);
 				refs = 0;
 			}
-			skb_fill_page_desc(skb, frag++, head, start, size);
+			skb_fill_page_desc_noacc(skb, frag++, head, start, size);
 		}
 		if (refs)
 			page_ref_sub(last_head, refs);
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 5940823adaa5..830d6b2039f5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1014,7 +1014,7 @@ struct sk_buff *tcp_build_frag(struct sock *sk, int size_goal, int flags,
 		skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
 	} else {
 		get_page(page);
-		skb_fill_page_desc(skb, i, page, offset, copy);
+		skb_fill_page_desc_noacc(skb, i, page, offset, copy);
 	}
 
 	if (!(flags & MSG_NO_SHARED_FRAGS))
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/7174 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/E...