From: Wang Yufen <wangyufen@huawei.com>

mainline inclusion
from mainline-v5.13-rc1
commit 37f0e514db660f03f8982b8f4fbbd4b2740abe7d
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently we purge the ingress_skb queue only when the psock refcnt goes down to 0, so locking the queue has not been necessary. But to allow it to be called during ->close as well, we have to lock the queue here.
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-2-xiyou.wangcong@gmail.com
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 4ee4fe436847..f80428cc7264 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -638,7 +638,7 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 static void sk_psock_zap_ingress(struct sk_psock *psock)
 {
-	__skb_queue_purge(&psock->ingress_skb);
+	skb_queue_purge(&psock->ingress_skb);
 	__sk_psock_purge_ingress_msg(psock);
 }
From: Cong Wang <cong.wang@bytedance.com>

mainline inclusion
from mainline-v5.13-rc1
commit b01fd6e802b6d0a635176f943315670b679d8d7b
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently we rely on lock_sock to protect ingress_msg; it is too heavyweight for this. We can actually just use a spinlock to protect this list, like we protect other skb queues.

__tcp_bpf_recvmsg() is still special because of peeking; it still has to use lock_sock.
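As a rough illustration of the locking pattern this patch introduces, here is a minimal userspace C analogue of the new ingress_msg helpers, with a pthread mutex standing in for spin_lock_bh(&psock->ingress_lock). All names are illustrative, not the kernel's:

#include <pthread.h>
#include <stddef.h>

struct msg {
	struct msg *next;
};

struct psock {
	pthread_mutex_t ingress_lock;	/* stands in for psock->ingress_lock */
	struct msg *head, *tail;	/* stands in for psock->ingress_msg */
};

/* Like sk_psock_queue_msg(): enqueue under the queue lock only. */
static void psock_queue_msg(struct psock *p, struct msg *m)
{
	pthread_mutex_lock(&p->ingress_lock);
	m->next = NULL;
	if (p->tail)
		p->tail->next = m;
	else
		p->head = m;
	p->tail = m;
	pthread_mutex_unlock(&p->ingress_lock);
}

/* Like sk_psock_dequeue_msg(): pop the head; the caller owns the result. */
static struct msg *psock_dequeue_msg(struct psock *p)
{
	struct msg *m;

	pthread_mutex_lock(&p->ingress_lock);
	m = p->head;
	if (m && !(p->head = m->next))
		p->tail = NULL;
	pthread_mutex_unlock(&p->ingress_lock);
	return m;
}

Peeking is the odd case because the caller keeps using the message after the lock is dropped, which is why __tcp_bpf_recvmsg() still relies on lock_sock to keep a peeked message alive.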
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com

Conflicts:
	net/core/skmsg.c

Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skmsg.h | 46 +++++++++++++++++++++++++++++++++++++++++++
 net/core/skmsg.c      |  3 +++
 net/ipv4/tcp_bpf.c    | 18 ++++++-----------
 3 files changed, 55 insertions(+), 12 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 23d4e86a4395..2324cd5208e0 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -99,6 +99,7 @@ struct sk_psock {
 	struct sk_psock_parser		parser;
 	struct sk_buff_head		ingress_skb;
 	struct list_head		ingress_msg;
+	spinlock_t			ingress_lock;
 	unsigned long			state;
 	struct list_head		link;
 	spinlock_t			link_lock;
@@ -298,7 +299,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
+	spin_lock_bh(&psock->ingress_lock);
 	list_add_tail(&msg->list, &psock->ingress_msg);
+	spin_unlock_bh(&psock->ingress_lock);
+}
+
+static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	if (msg)
+		list_del(&msg->list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_peek_msg(struct sk_psock *psock)
+{
+	struct sk_msg *msg;
+
+	spin_lock_bh(&psock->ingress_lock);
+	msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return msg;
+}
+
+static inline struct sk_msg *sk_psock_next_msg(struct sk_psock *psock,
+					       struct sk_msg *msg)
+{
+	struct sk_msg *ret;
+
+	spin_lock_bh(&psock->ingress_lock);
+	if (list_is_last(&msg->list, &psock->ingress_msg))
+		ret = NULL;
+	else
+		ret = list_next_entry(msg, list);
+	spin_unlock_bh(&psock->ingress_lock);
+	return ret;
 }
 
 static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
@@ -306,6 +345,13 @@ static inline bool sk_psock_queue_empty(const struct sk_psock *psock)
 	return psock ? list_empty(&psock->ingress_msg) : true;
 }
 
+static inline void kfree_sk_msg(struct sk_msg *msg)
+{
+	if (msg->skb)
+		consume_skb(msg->skb);
+	kfree(msg);
+}
+
 static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 {
 	struct sock *sk = psock->sk;
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index f80428cc7264..34c356988176 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -598,6 +598,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
 	INIT_LIST_HEAD(&psock->ingress_msg);
+	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
 
 	sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED);
@@ -639,7 +640,9 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 static void sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	skb_queue_purge(&psock->ingress_skb);
+	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
+	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 6b745ce4108c..b7731dbb270f 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -18,9 +18,7 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 	struct sk_msg *msg_rx;
 	int i, copied = 0;
 
-	msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-					  struct sk_msg, list);
-
+	msg_rx = sk_psock_peek_msg(psock);
 	while (copied != len) {
 		struct scatterlist *sge;
 
@@ -68,22 +66,18 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 		} while (i != msg_rx->sg.end);
 
 		if (unlikely(peek)) {
-			if (msg_rx == list_last_entry(&psock->ingress_msg,
-						      struct sk_msg, list))
+			msg_rx = sk_psock_next_msg(psock, msg_rx);
+			if (!msg_rx)
 				break;
-			msg_rx = list_next_entry(msg_rx, list);
 			continue;
 		}
 
 		msg_rx->sg.start = i;
 		if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) {
-			list_del(&msg_rx->list);
-			if (msg_rx->skb)
-				consume_skb(msg_rx->skb);
-			kfree(msg_rx);
+			msg_rx = sk_psock_dequeue_msg(psock);
+			kfree_sk_msg(msg_rx);
 		}
-		msg_rx = list_first_entry_or_null(&psock->ingress_msg,
-						  struct sk_msg, list);
+		msg_rx = sk_psock_peek_msg(psock);
 	}
 
 	return copied;
From: Cong Wang <cong.wang@bytedance.com>

mainline inclusion
from mainline-v5.13-rc1
commit 0739cd28f2645e814586c7536ba5da9825cb8029
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We only have skb_send_sock_locked(), which requires the caller to hold the socket lock. Introduce a variant, skb_send_sock(), which locks the socket on its own so callers do not need to lock it any more. This also saves us from adding a ->sendmsg_locked for each protocol.
To reuse the code, pass function pointers to __skb_send_sock() and build skb_send_sock() and skb_send_sock_locked() on top.
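The reuse pattern is plain function-pointer parameterization. A hedged userspace sketch of the shape (illustrative names, not the kernel API):

#include <stddef.h>

typedef int (*send_fn)(void *sock, const char *buf, size_t len);

/* Common send loop; the locking policy lives entirely in 'send'. */
static int send_all_common(void *sock, const char *buf, size_t len, send_fn send)
{
	size_t off = 0;

	while (off < len) {
		int n = send(sock, buf + off, len - off);

		if (n <= 0)
			return -1;	/* propagate errors, like __skb_send_sock() */
		off += (size_t)n;
	}
	return (int)off;
}

/* One callback assumes the caller already holds the socket lock ... */
static int send_locked(void *sock, const char *buf, size_t len)
{
	(void)sock; (void)buf;
	return (int)len;	/* pretend the whole chunk was sent */
}

/* ... the other takes and drops the lock itself (elided in this sketch). */
static int send_unlocked(void *sock, const char *buf, size_t len)
{
	(void)sock; (void)buf;
	return (int)len;
}

int send_all_locked(void *sock, const char *buf, size_t len)
{
	return send_all_common(sock, buf, len, send_locked);
}

int send_all(void *sock, const char *buf, size_t len)
{
	return send_all_common(sock, buf, len, send_unlocked);
}

The kernel version additionally wraps the indirect calls in INDIRECT_CALL_2() so the cost of the function pointers is avoided for the two known targets.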
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Reviewed-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skbuff.h |  1 +
 net/core/skbuff.c      | 55 ++++++++++++++++++++++++++++++------
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index e5f61bdd42a8..68efccc15a87 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3609,6 +3609,7 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 		    unsigned int flags);
 int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 			 int len);
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len);
 void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to);
 unsigned int skb_zerocopy_headlen(const struct sk_buff *from);
 int skb_zerocopy(struct sk_buff *to, struct sk_buff *from,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 16c74a81b7bf..08d7b78a61cd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -2478,9 +2478,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
 }
 EXPORT_SYMBOL_GPL(skb_splice_bits);
 
-/* Send skb data on a socket. Socket must be locked. */
-int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
-			 int len)
+static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendmsg(sock, msg, vec, num, size);
+}
+
+static int sendpage_unlocked(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags)
+{
+	struct socket *sock = sk->sk_socket;
+
+	if (!sock)
+		return -EINVAL;
+	return kernel_sendpage(sock, page, offset, size, flags);
+}
+
+typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg,
+			    struct kvec *vec, size_t num, size_t size);
+typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset,
+			     size_t size, int flags);
+static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset,
+			   int len, sendmsg_func sendmsg, sendpage_func sendpage)
 {
 	unsigned int orig_len = len;
 	struct sk_buff *head = skb;
@@ -2500,7 +2523,8 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 		memset(&msg, 0, sizeof(msg));
 		msg.msg_flags = MSG_DONTWAIT;
 
-		ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen);
+		ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked,
+				      sendmsg_unlocked, sk, &msg, &kv, 1, slen);
 		if (ret <= 0)
 			goto error;
 
@@ -2531,9 +2555,11 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 		slen = min_t(size_t, len, skb_frag_size(frag) - offset);
 
 		while (slen) {
-			ret = kernel_sendpage_locked(sk, skb_frag_page(frag),
-						     skb_frag_off(frag) + offset,
-						     slen, MSG_DONTWAIT);
+			ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked,
+					      sendpage_unlocked, sk,
+					      skb_frag_page(frag),
+					      skb_frag_off(frag) + offset,
+					      slen, MSG_DONTWAIT);
 			if (ret <= 0)
 				goto error;
 
@@ -2565,8 +2591,23 @@ int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
 error:
 	return orig_len == len ? ret : orig_len - len;
 }
+
+/* Send skb data on a socket. Socket must be locked. */
+int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset,
+			 int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked,
+			       kernel_sendpage_locked);
+}
 EXPORT_SYMBOL_GPL(skb_send_sock_locked);
 
+/* Send skb data on a socket. Socket must be unlocked. */
+int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len)
+{
+	return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked,
+			       sendpage_unlocked);
+}
+
 /**
  *	skb_store_bits - store bits from kernel buffer to skb
  *	@skb: destination buffer
From: Cong Wang <cong.wang@bytedance.com>

mainline inclusion
from mainline-v5.13-rc1
commit 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We do not have to lock the sock to avoid losing sk_socket; instead we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning the socket makes no sense.

We do purge these queues when the psock refcnt reaches zero, but here we want to purge them explicitly in sock_map_close(). There are also some nasty races between testing the SK_PSOCK_TX_ENABLED bit and queuing/canceling the psock work; we can expand psock->ingress_lock a bit to protect them too.

As noticed by John, we still have to lock psock->work, because the same work item could be running concurrently on different CPUs.
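A minimal userspace sketch of the point above, assuming two instances of the same work body can run on different CPUs: the body serializes on a dedicated mutex instead of the socket lock (illustrative names):

#include <pthread.h>

static pthread_mutex_t work_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Analogue of sk_psock_backlog(): the same item may be scheduled again
 * while a previous run is still executing, so the whole body runs under
 * work_mutex rather than lock_sock(). */
static void *backlog_work(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&work_mutex);
	/* drain the ingress queue here */
	pthread_mutex_unlock(&work_mutex);
	return NULL;
}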
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skmsg.h |  3 ++-
 net/core/skmsg.c      | 50 +++++++++++++++++++++++++++++--------------
 net/core/sock_map.c   |  1 +
 3 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 2324cd5208e0..31ba9d43ffd1 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -108,6 +108,7 @@ struct sk_psock {
 	void (*saved_close)(struct sock *sk, long timeout);
 	void (*saved_write_space)(struct sock *sk);
 	struct proto			*sk_proto;
+	struct mutex			work_mutex;
 	struct sk_psock_work_state	work_state;
 	struct work_struct		work;
 	union {
@@ -361,6 +362,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err)
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node);
+void sk_psock_stop(struct sk_psock *psock, bool wait);
 
 int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock);
 void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock);
@@ -451,7 +453,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk)
 	return psock;
 }
 
-void sk_psock_stop(struct sock *sk, struct sk_psock *psock);
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
 
 static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 34c356988176..7af259e92a0c 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -504,7 +504,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	if (!ingress) {
 		if (!sock_writeable(psock->sk))
 			return -EAGAIN;
-		return skb_send_sock_locked(psock->sk, skb, off, len);
+		return skb_send_sock(psock->sk, skb, off, len);
 	}
 	return sk_psock_skb_ingress(psock, skb);
 }
@@ -518,8 +518,7 @@ static void sk_psock_backlog(struct work_struct *work)
 	u32 len, off;
 	int ret;
 
-	/* Lock sock to avoid losing sk_socket during loop. */
-	lock_sock(psock->sk);
+	mutex_lock(&psock->work_mutex);
 	if (state->skb) {
 		skb = state->skb;
 		len = state->len;
@@ -535,7 +534,7 @@ static void sk_psock_backlog(struct work_struct *work)
 		ingress = tcp_skb_bpf_ingress(skb);
 		do {
 			ret = -EIO;
-			if (likely(psock->sk->sk_socket))
+			if (!sock_flag(psock->sk, SOCK_DEAD))
 				ret = sk_psock_handle_skb(psock, skb, off,
 							  len, ingress);
 			if (ret <= 0) {
@@ -559,7 +558,7 @@ static void sk_psock_backlog(struct work_struct *work)
 			kfree_skb(skb);
 	}
 end:
-	release_sock(psock->sk);
+	mutex_unlock(&psock->work_mutex);
 }
 
 struct sk_psock *sk_psock_init(struct sock *sk, int node)
@@ -597,6 +596,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node)
 	spin_lock_init(&psock->link_lock);
 
 	INIT_WORK(&psock->work, sk_psock_backlog);
+	mutex_init(&psock->work_mutex);
 	INIT_LIST_HEAD(&psock->ingress_msg);
 	spin_lock_init(&psock->ingress_lock);
 	skb_queue_head_init(&psock->ingress_skb);
@@ -637,12 +637,10 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 	}
 }
 
-static void sk_psock_zap_ingress(struct sk_psock *psock)
+static void __sk_psock_zap_ingress(struct sk_psock *psock)
 {
 	skb_queue_purge(&psock->ingress_skb);
-	spin_lock_bh(&psock->ingress_lock);
 	__sk_psock_purge_ingress_msg(psock);
-	spin_unlock_bh(&psock->ingress_lock);
 }
 
 static void sk_psock_link_destroy(struct sk_psock *psock)
@@ -655,6 +653,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock)
 	}
 }
 
+void sk_psock_stop(struct sk_psock *psock, bool wait)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
+	sk_psock_cork_free(psock);
+	__sk_psock_zap_ingress(psock);
+	spin_unlock_bh(&psock->ingress_lock);
+
+	if (wait)
+		cancel_work_sync(&psock->work);
+}
+
 static void sk_psock_destroy_deferred(struct work_struct *gc)
 {
 	struct sk_psock *psock = container_of(gc, struct sk_psock, gc);
@@ -666,12 +676,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc)
 		strp_done(&psock->parser.strp);
 
 	cancel_work_sync(&psock->work);
+	mutex_destroy(&psock->work_mutex);
 
 	psock_progs_drop(&psock->progs);
 
 	sk_psock_link_destroy(psock);
 	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
 
 	if (psock->sk_redir)
 		sock_put(psock->sk_redir);
@@ -689,8 +699,7 @@ static void sk_psock_destroy(struct rcu_head *rcu)
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	sk_psock_cork_free(psock);
-	sk_psock_zap_ingress(psock);
+	sk_psock_stop(psock, false);
 
 	write_lock_bh(&sk->sk_callback_lock);
 	sk_psock_restore_proto(sk, psock);
@@ -700,7 +709,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 	else if (psock->progs.skb_verdict)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
-	sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
 
 	call_rcu(&psock->rcu, sk_psock_destroy);
 }
@@ -786,14 +794,20 @@ static int sk_psock_skb_redirect(struct sk_buff *skb)
 	 * error that caused the pipe to break. We can't send a packet on
 	 * a socket that is in this state so we drop the skb.
 	 */
-	if (!psock_other || sock_flag(sk_other, SOCK_DEAD) ||
-	    !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+	if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) {
+		kfree_skb(skb);
+		return -EIO;
+	}
+	spin_lock_bh(&psock_other->ingress_lock);
+	if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) {
+		spin_unlock_bh(&psock_other->ingress_lock);
 		kfree_skb(skb);
 		return -EIO;
 	}
 
 	skb_queue_tail(&psock_other->ingress_skb, skb);
 	schedule_work(&psock_other->work);
+	spin_unlock_bh(&psock_other->ingress_lock);
 	return 0;
 }
 
@@ -859,8 +873,12 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			err = sk_psock_skb_ingress_self(psock, skb);
 		}
 		if (err < 0) {
-			skb_queue_tail(&psock->ingress_skb, skb);
-			schedule_work(&psock->work);
+			spin_lock_bh(&psock->ingress_lock);
+			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+				skb_queue_tail(&psock->ingress_skb, skb);
+				schedule_work(&psock->work);
+			}
+			spin_unlock_bh(&psock->ingress_lock);
 		}
 		break;
 	case __SK_REDIRECT:
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 4534cd6be0c8..8525c11dbc3a 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1559,6 +1559,7 @@ void sock_map_close(struct sock *sk, long timeout)
 	saved_close = psock->saved_close;
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
+	sk_psock_stop(psock, true);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }
From: Cong Wang <cong.wang@bytedance.com>

mainline inclusion
from mainline-v5.13-rc1
commit aadb2bb83ff789de63b48b4edeab7329423a50d3
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The last refcnt of the psock can be gone right after sock_map_remove_links(), so sk_psock_stop() could trigger a UAF. The reason sk_psock_stop() was placed there is to avoid an RCU read critical section; more importantly, some callees of sock_map_remove_links() are supposed to be called with the RCU read lock held, so we cannot simply get rid of it here. Therefore, the only choice we have is to grab an additional refcnt with sk_psock_get() and put it back after sk_psock_stop().
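In outline the fix is the classic get/put bracket around the window in which the last reference could otherwise disappear. A userspace sketch with illustrative names:

#include <stdatomic.h>
#include <stdlib.h>

struct psock {
	atomic_int refcnt;
};

static struct psock *psock_get(struct psock *p)
{
	atomic_fetch_add(&p->refcnt, 1);	/* like sk_psock_get() */
	return p;
}

static void psock_put(struct psock *p)
{
	if (atomic_fetch_sub(&p->refcnt, 1) == 1)
		free(p);			/* last reference gone */
}

static void map_close(struct psock *p)
{
	psock_get(p);	/* pin across remove_links + stop */
	/* remove links; another thread may drop its reference here */
	/* stop the psock: still safe, we hold our own reference */
	psock_put(p);	/* may be the final put */
}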
Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Reported-by: syzbot+7b6548ae483d6f4c64ae@syzkaller.appspotmail.com
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210408030556.45134-1-xiyou.wangcong@gmail.com
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/sock_map.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index 8525c11dbc3a..aea06310b267 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -1549,7 +1549,7 @@ void sock_map_close(struct sock *sk, long timeout)
 
 	lock_sock(sk);
 	rcu_read_lock();
-	psock = sk_psock(sk);
+	psock = sk_psock_get(sk);
 	if (unlikely(!psock)) {
 		rcu_read_unlock();
 		release_sock(sk);
@@ -1560,6 +1560,7 @@ void sock_map_close(struct sock *sk, long timeout)
 	sock_map_remove_links(sk, psock);
 	rcu_read_unlock();
 	sk_psock_stop(psock, true);
+	sk_psock_put(sk, psock);
 	release_sock(sk);
 	saved_close(sk, timeout);
 }
From: Cong Wang <cong.wang@bytedance.com>

mainline inclusion
from mainline-v5.14-rc1
commit 0cf6672b23c8aa9d9274798dd63cbf6ede77ef90
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If the dest psock does not set SK_PSOCK_TX_ENABLED, the skb can't be queued anywhere, so it must be dropped.

This one was found during code review.
Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Link: https://lore.kernel.org/bpf/20210615021342.7416-6-xiyou.wangcong@gmail.com
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 7af259e92a0c..e7e364892182 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -877,8 +877,13 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb,
 			if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
 				skb_queue_tail(&psock->ingress_skb, skb);
 				schedule_work(&psock->work);
+				err = 0;
 			}
 			spin_unlock_bh(&psock->ingress_lock);
+			if (err < 0) {
+				tcp_skb_bpf_redirect_clear(skb);
+				goto out_free;
+			}
 		}
 		break;
 	case __SK_REDIRECT:
From: John Fastabend <john.fastabend@gmail.com>

mainline inclusion
from mainline-v5.13-rc1
commit 9635720b7c88592214562cb72605bdab6708006c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If the backlog handler is running during a tear-down operation, we may enqueue data on the ingress msg queue while tear-down is trying to free it:

sk_psock_backlog()
  sk_psock_handle_skb()
    sk_psock_skb_ingress()
      sk_psock_skb_ingress_enqueue()
        sk_psock_queue_msg(psock, msg)
                                        spin_lock(ingress_lock)
                                        sk_psock_zap_ingress()
                                          __sk_psock_purge_ingress_msg()
                                            -- free ingress_msg list --
                                        spin_unlock(ingress_lock)
        spin_lock(ingress_lock)
        list_add_tail(msg, ingress_msg) <- entry on list with no one
                                           left to free it
        spin_unlock(ingress_lock)

To fix this, we only enqueue from the backlog if the SK_PSOCK_TX_ENABLED bit is set. The tear-down logic clears the bit while holding ingress_lock, so we won't enqueue the msg in the last step.
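The handshake can be sketched in a few lines of userspace C: the producer's test of the flag and the teardown's clear-and-purge happen under the same lock, so an enqueue can never slip in after the purge (illustrative names):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t ingress_lock = PTHREAD_MUTEX_INITIALIZER;
static bool tx_enabled = true;	/* stands in for SK_PSOCK_TX_ENABLED */

static bool queue_msg(void *msg)
{
	bool queued = false;

	(void)msg;
	pthread_mutex_lock(&ingress_lock);
	if (tx_enabled) {
		/* list_add_tail(&msg->list, &psock->ingress_msg); */
		queued = true;
	}
	/* else: drop msg, teardown already ran */
	pthread_mutex_unlock(&ingress_lock);
	return queued;
}

static void teardown(void)
{
	pthread_mutex_lock(&ingress_lock);
	tx_enabled = false;	/* clear the bit ... */
	/* ... and purge the list under the same lock */
	pthread_mutex_unlock(&ingress_lock);
}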
Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210727160500.1713554-4-john.fastabend@gmail.co...
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skmsg.h | 54 ++++++++++++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 31ba9d43ffd1..94ab91ada722 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -297,11 +297,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
 	return rcu_dereference_sk_user_data(sk);
 }
 
+static inline void sk_psock_set_state(struct sk_psock *psock,
+				      enum sk_psock_state_bits bit)
+{
+	set_bit(bit, &psock->state);
+}
+
+static inline void sk_psock_clear_state(struct sk_psock *psock,
+					enum sk_psock_state_bits bit)
+{
+	clear_bit(bit, &psock->state);
+}
+
+static inline bool sk_psock_test_state(const struct sk_psock *psock,
+				       enum sk_psock_state_bits bit)
+{
+	return test_bit(bit, &psock->state);
+}
+
+static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
+{
+	sk_drops_add(sk, skb);
+	kfree_skb(skb);
+}
+
+static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
+{
+	if (msg->skb)
+		sock_drop(psock->sk, msg->skb);
+	kfree(msg);
+}
+
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
 	spin_lock_bh(&psock->ingress_lock);
-	list_add_tail(&msg->list, &psock->ingress_msg);
+	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
+		list_add_tail(&msg->list, &psock->ingress_msg);
+	else
+		drop_sk_msg(psock, msg);
 	spin_unlock_bh(&psock->ingress_lock);
 }
 
@@ -423,24 +457,6 @@ static inline void sk_psock_restore_proto(struct sock *sk,
 	}
 }
 
-static inline void sk_psock_set_state(struct sk_psock *psock,
-				      enum sk_psock_state_bits bit)
-{
-	set_bit(bit, &psock->state);
-}
-
-static inline void sk_psock_clear_state(struct sk_psock *psock,
-					enum sk_psock_state_bits bit)
-{
-	clear_bit(bit, &psock->state);
-}
-
-static inline bool sk_psock_test_state(const struct sk_psock *psock,
-				       enum sk_psock_state_bits bit)
-{
-	return test_bit(bit, &psock->state);
-}
-
 static inline struct sk_psock *sk_psock_get(struct sock *sk)
 {
 	struct sk_psock *psock;
From: John Fastabend <john.fastabend@gmail.com>

mainline inclusion
from mainline-v5.14-rc4
commit 476d98018f32e68e7c5d4e8456940cf2b6d66f10
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
It's possible that if a socket is closed while the receive thread is under memory pressure, the thread may have cached an skb. We need to ensure these skbs are freed along with the normal ingress_skb queue.

Before 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()"), tear-down and backlog processing both held the sock lock for the common case of socket close or unhash, so the two could not run in parallel and the kfree alone would have been enough in those kernels.

But the latest kernels include commit 799aa7f98d53, and this requires a bit more work. Without ingress_lock guarding reads and writes of state->skb, the tear-down could run before the state update, leaking memory, or worse: the backlog could read the state interleaved with the tear-down, and we might end up freeing state->skb from the tear-down side while the backlog side already took a reference to it. To resolve such races we wrap the accesses in ingress_lock on both sides, serializing the tear-down and backlog cases. In both cases this only happens after an -EAGAIN error, so having an extra lock in place is likely fine; the normal path skips the locks.

Note that we check state->skb before grabbing the lock. This works because state->skb can only be set while holding the mutex we already hold, which rules out a race on adding state->skb after the check. And if the tear-down path is running, that is also fine: if it removes state->skb we simply see skb == NULL and the subsequent goto is skipped. This slight complication avoids locking in the normal case.
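That "check outside, lock only on the slow path" shape looks roughly like this; it is sound only because state->skb goes from NULL to non-NULL exclusively under work_mutex, which the caller already holds (hypothetical stand-in types):

struct sk_buff_stub;			/* opaque stand-in for struct sk_buff */

struct work_state {
	struct sk_buff_stub *skb;	/* only ever set under work_mutex */
};

/* Call with work_mutex held, as sk_psock_backlog() does. */
static struct sk_buff_stub *take_cached_skb(struct work_state *state,
					    void (*lock)(void),
					    void (*unlock)(void))
{
	struct sk_buff_stub *skb = NULL;

	if (state->skb) {	/* unlocked fast-path check: usually NULL */
		lock();		/* ingress_lock: teardown may be purging */
		skb = state->skb;
		state->skb = NULL;
		unlock();
	}
	return skb;		/* NULL if teardown already freed and cleared it */
}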
With this fix we no longer see the following warning splat from the TCP side on socket close when we hit the above case with redirect to ingress self:

[224913.935822] WARNING: CPU: 3 PID: 32100 at net/core/stream.c:208 sk_stream_kill_queues+0x212/0x220
[224913.935841] Modules linked in: fuse overlay bpf_preload x86_pkg_temp_thermal intel_uncore wmi_bmof squashfs sch_fq_codel efivarfs ip_tables x_tables uas xhci_pci ixgbe mdio xfrm_algo xhci_hcd wmi
[224913.935897] CPU: 3 PID: 32100 Comm: fgs-bench Tainted: G I 5.14.0-rc1alu+ #181
[224913.935908] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019
[224913.935914] RIP: 0010:sk_stream_kill_queues+0x212/0x220
[224913.935923] Code: 8b 83 20 02 00 00 85 c0 75 20 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 df e8 2b 11 fe ff eb c3 0f 0b e9 7c ff ff ff 0f 0b eb ce <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 90 0f 1f 44 00 00 41 57 41
[224913.935932] RSP: 0018:ffff88816271fd38 EFLAGS: 00010206
[224913.935941] RAX: 0000000000000ae8 RBX: ffff88815acd5240 RCX: dffffc0000000000
[224913.935948] RDX: 0000000000000003 RSI: 0000000000000ae8 RDI: ffff88815acd5460
[224913.935954] RBP: ffff88815acd5460 R08: ffffffff955c0ae8 R09: fffffbfff2e6f543
[224913.935961] R10: ffffffff9737aa17 R11: fffffbfff2e6f542 R12: ffff88815acd5390
[224913.935967] R13: ffff88815acd5480 R14: ffffffff98d0c080 R15: ffffffff96267500
[224913.935974] FS:  00007f86e6bd1700(0000) GS:ffff888451cc0000(0000) knlGS:0000000000000000
[224913.935981] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[224913.935988] CR2: 000000c0008eb000 CR3: 00000001020e0005 CR4: 00000000003706e0
[224913.935994] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[224913.936000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[224913.936007] Call Trace:
[224913.936016]  inet_csk_destroy_sock+0xba/0x1f0
[224913.936033]  __tcp_close+0x620/0x790
[224913.936047]  tcp_close+0x20/0x80
[224913.936056]  inet_release+0x8f/0xf0
[224913.936070]  __sock_release+0x72/0x120
[224913.936083]  sock_close+0x14/0x20
Fixes: a136678c0bdbb ("bpf: sk_msg, zap ingress queue on psock down")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210727160500.1713554-3-john.fastabend@gmail.co...
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c | 42 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 35 insertions(+), 7 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index e7e364892182..fca67391ccaf 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -509,23 +509,42 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
 	return sk_psock_skb_ingress(psock, skb);
 }
 
+static void sk_psock_skb_state(struct sk_psock *psock,
+			       struct sk_psock_work_state *state,
+			       struct sk_buff *skb,
+			       int len, int off)
+{
+	spin_lock_bh(&psock->ingress_lock);
+	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
+		state->skb = skb;
+		state->len = len;
+		state->off = off;
+	} else {
+		sock_drop(psock->sk, skb);
+	}
+	spin_unlock_bh(&psock->ingress_lock);
+}
+
 static void sk_psock_backlog(struct work_struct *work)
 {
 	struct sk_psock *psock = container_of(work, struct sk_psock, work);
 	struct sk_psock_work_state *state = &psock->work_state;
-	struct sk_buff *skb;
+	struct sk_buff *skb = NULL;
 	bool ingress;
 	u32 len, off;
 	int ret;
 
 	mutex_lock(&psock->work_mutex);
-	if (state->skb) {
+	if (unlikely(state->skb)) {
+		spin_lock_bh(&psock->ingress_lock);
 		skb = state->skb;
 		len = state->len;
 		off = state->off;
 		state->skb = NULL;
-		goto start;
+		spin_unlock_bh(&psock->ingress_lock);
 	}
+	if (skb)
+		goto start;
 
 	while ((skb = skb_dequeue(&psock->ingress_skb))) {
 		len = skb->len;
@@ -539,9 +558,8 @@ static void sk_psock_backlog(struct work_struct *work)
 							  len, ingress);
 			if (ret <= 0) {
 				if (ret == -EAGAIN) {
-					state->skb = skb;
-					state->len = len;
-					state->off = off;
+					sk_psock_skb_state(psock, state, skb,
+							   len, off);
 					goto end;
 				}
 				/* Hard errors break pipe and stop xmit. */
@@ -639,7 +657,17 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 static void __sk_psock_zap_ingress(struct sk_psock *psock)
 {
-	skb_queue_purge(&psock->ingress_skb);
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) {
+		tcp_skb_bpf_redirect_clear(skb);
+		sock_drop(psock->sk, skb);
+	}
+	kfree_skb(psock->work_state.skb);
+	/* We null the skb here to ensure that calls to sk_psock_backlog
+	 * do not pick up the free'd skb.
+	 */
+	psock->work_state.skb = NULL;
 	__sk_psock_purge_ingress_msg(psock);
 }
From: John Fastabend <john.fastabend@gmail.com>

mainline inclusion
from mainline-v5.14-rc4
commit 343597d558e79fe704ba8846b5b2ed24056b89c2
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We don't want strparser to run and pass skbs into skmsg handlers when the psock is null. We just sk_drop them in that case, which, when removing a live socket from a map, means extra drops that we do not need to incur. Move the zap below the strparser close to avoid this condition.

This way we first stop the stream parser, preventing it from processing further packets, and then delete the psock.
Fixes: a136678c0bdbb ("bpf: sk_msg, zap ingress queue on psock down")
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Jakub Sitnicki <jakub@cloudflare.com>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Link: https://lore.kernel.org/bpf/20210727160500.1713554-2-john.fastabend@gmail.co...
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index fca67391ccaf..019501d771bf 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -727,8 +727,6 @@ static void sk_psock_destroy(struct rcu_head *rcu)
 
 void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 {
-	sk_psock_stop(psock, false);
-
 	write_lock_bh(&sk->sk_callback_lock);
 	sk_psock_restore_proto(sk, psock);
 	rcu_assign_sk_user_data(sk, NULL);
@@ -738,6 +736,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
 		sk_psock_stop_verdict(sk, psock);
 	write_unlock_bh(&sk->sk_callback_lock);
 
+	sk_psock_stop(psock, false);
 	call_rcu(&psock->rcu, sk_psock_destroy);
 }
 EXPORT_SYMBOL_GPL(sk_psock_drop);
From: Wang Yufen <wangyufen@huawei.com>

mainline inclusion
from mainline-v5.13-rc1
commit 938d3480b92fa5e454b7734294f12a7b75126f09
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
If tcp_bpf_sendmsg is running during a tear-down operation, we may enqueue data on the ingress msg queue while tear-down is trying to free it:

sk1 (redirect sk2)                    sk2
-------------------                   ---------------
tcp_bpf_sendmsg()
 tcp_bpf_send_verdict()
  tcp_bpf_sendmsg_redir()
   bpf_tcp_ingress()
                                      sock_map_close()
                                       lock_sock()
    lock_sock() ... blocking
                                       sk_psock_stop
                                        sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
                                       release_sock(sk);
    lock_sock()
    sk_mem_charge()
    get_page()
    sk_psock_queue_msg()
     sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED);
      drop_sk_msg()
    release_sock()

At the point drop_sk_msg() runs, the msg has memory charged to the socket by sk_mem_charge() and holds sg pages that need to be put. To fix this we use sk_msg_free() and then kfree() the msg.
This issue can cause the following info:

WARNING: CPU: 0 PID: 9202 at net/core/stream.c:205 sk_stream_kill_queues+0xc8/0xe0
Call Trace:
 <IRQ>
 inet_csk_destroy_sock+0x55/0x110
 tcp_rcv_state_process+0xe5f/0xe90
 ? sk_filter_trim_cap+0x10d/0x230
 ? tcp_v4_do_rcv+0x161/0x250
 tcp_v4_do_rcv+0x161/0x250
 tcp_v4_rcv+0xc3a/0xce0
 ip_protocol_deliver_rcu+0x3d/0x230
 ip_local_deliver_finish+0x54/0x60
 ip_local_deliver+0xfd/0x110
 ? ip_protocol_deliver_rcu+0x230/0x230
 ip_rcv+0xd6/0x100
 ? ip_local_deliver+0x110/0x110
 __netif_receive_skb_one_core+0x85/0xa0
 process_backlog+0xa4/0x160
 __napi_poll+0x29/0x1b0
 net_rx_action+0x287/0x300
 __do_softirq+0xff/0x2fc
 do_softirq+0x79/0x90
 </IRQ>

WARNING: CPU: 0 PID: 531 at net/ipv4/af_inet.c:154 inet_sock_destruct+0x175/0x1b0
Call Trace:
 <TASK>
 __sk_destruct+0x24/0x1f0
 sk_psock_destroy+0x19b/0x1c0
 process_one_work+0x1b3/0x3c0
 ? process_one_work+0x3c0/0x3c0
 worker_thread+0x30/0x350
 ? process_one_work+0x3c0/0x3c0
 kthread+0xe6/0x110
 ? kthread_complete_and_exit+0x20/0x20
 ret_from_fork+0x22/0x30
 </TASK>
Fixes: 9635720b7c88 ("bpf, sockmap: Fix memleak on ingress msg enqueue")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-2-wangyufen@huawei.com
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/skmsg.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index 94ab91ada722..3994a69b85d2 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -321,21 +321,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
 	kfree_skb(skb);
 }
 
-static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
-{
-	if (msg->skb)
-		sock_drop(psock->sk, msg->skb);
-	kfree(msg);
-}
-
 static inline void sk_psock_queue_msg(struct sk_psock *psock,
 				      struct sk_msg *msg)
 {
 	spin_lock_bh(&psock->ingress_lock);
 	if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
 		list_add_tail(&msg->list, &psock->ingress_msg);
-	else
-		drop_sk_msg(psock, msg);
+	else {
+		sk_msg_free(psock->sk, msg);
+		kfree(msg);
+	}
 	spin_unlock_bh(&psock->ingress_lock);
 }
From: Wang Yufen <wangyufen@huawei.com>

stable inclusion
from stable-v5.10.110
commit bec34a91eba3483e1830c02bdd36f8f968642047
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
[ Upstream commit 9c34e38c4a870eb30b13f42f5b44f42e9d19ccb8 ]
If tcp_bpf_sendmsg() is running while the sk msg is full, sk_msg_alloc() returns -ENOMEM and tcp_bpf_sendmsg() goes to wait_for_memory. If sk_msg_alloc() managed to allocate partial memory, i.e. msg_tx->sg.size is greater than osize after the call, a memleak occurs. To fix this we use sk_msg_trim() to release the allocated memory, then go to wait_for_memory.

Other call paths of sk_msg_alloc() have a similar issue, such as tls_sw_sendmsg(), so handle the sk_msg_trim logic inside sk_msg_alloc(), as Cong Wang suggested.
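A userspace sketch of the rollback idea (illustrative; the real sk_msg_alloc() bookkeeping is scatterlist based): record the size before growing, and on failure restore the recorded size so a partial allocation is not left charged:

#include <stdlib.h>

struct buf {
	char *data;
	size_t size;	/* the "charged" size, like msg->sg.size */
};

#define CHUNK 4096

static int buf_grow(struct buf *b, size_t want)
{
	size_t osize = b->size;		/* like: u32 osize = msg->sg.size; */

	while (b->size < want) {
		char *p = realloc(b->data, b->size + CHUNK);

		if (!p) {
			b->size = osize;	/* like: sk_msg_trim(sk, msg, osize);
						 * rolls the accounting back */
			return -1;		/* -ENOMEM; caller may wait and retry */
		}
		b->data = p;
		b->size += CHUNK;
	}
	return 0;
}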
This issue can cause the following info:

WARNING: CPU: 3 PID: 7950 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
 <TASK>
 inet_csk_destroy_sock+0x55/0x110
 __tcp_close+0x279/0x470
 tcp_close+0x1f/0x60
 inet_release+0x3f/0x80
 __sock_release+0x3d/0xb0
 sock_close+0x11/0x20
 __fput+0x92/0x250
 task_work_run+0x6a/0xa0
 do_exit+0x33b/0xb60
 do_group_exit+0x2f/0xa0
 get_signal+0xb6/0x950
 arch_do_signal_or_restart+0xac/0x2a0
 exit_to_user_mode_prepare+0xa9/0x200
 syscall_exit_to_user_mode+0x12/0x30
 do_syscall_64+0x46/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
 </TASK>

WARNING: CPU: 3 PID: 2094 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
 <TASK>
 __sk_destruct+0x24/0x1f0
 sk_psock_destroy+0x19b/0x1c0
 process_one_work+0x1b3/0x3c0
 kthread+0xe6/0x110
 ret_from_fork+0x22/0x30
 </TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-3-wangyufen@huawei.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 019501d771bf..75214e2a2333 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -27,6 +27,7 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
 		 int elem_first_coalesce)
 {
 	struct page_frag *pfrag = sk_page_frag(sk);
+	u32 osize = msg->sg.size;
 	int ret = 0;
 
 	len -= msg->sg.size;
@@ -35,13 +36,17 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
 		u32 orig_offset;
 		int use, i;
 
-		if (!sk_page_frag_refill(sk, pfrag))
-			return -ENOMEM;
+		if (!sk_page_frag_refill(sk, pfrag)) {
+			ret = -ENOMEM;
+			goto msg_trim;
+		}
 
 		orig_offset = pfrag->offset;
 		use = min_t(int, len, pfrag->size - orig_offset);
-		if (!sk_wmem_schedule(sk, use))
-			return -ENOMEM;
+		if (!sk_wmem_schedule(sk, use)) {
+			ret = -ENOMEM;
+			goto msg_trim;
+		}
 
 		i = msg->sg.end;
 		sk_msg_iter_var_prev(i);
@@ -71,6 +76,10 @@ int sk_msg_alloc(struct sock *sk, struct sk_msg *msg, int len,
 	}
 
 	return ret;
+
+msg_trim:
+	sk_msg_trim(sk, msg, osize);
+	return ret;
 }
 EXPORT_SYMBOL_GPL(sk_msg_alloc);
From: Wang Yufen <wangyufen@huawei.com>

stable inclusion
from stable-v5.10.110
commit 7b812a369e6416ab06d83cdd39d8e3f752781dd0
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
[ Upstream commit 84472b436e760ba439e1969a9e3c5ae7c86de39d ]
In tcp_bpf_send_verdict(), if msg has more data after tcp_bpf_sendmsg_redir():
tcp_bpf_send_verdict()
 tosend = msg->sg.size          //msg->sg.size = 22220
 case __SK_REDIRECT:
  sk_msg_return()               //uncharged msg->sg.size (22220) to
                                //sk->sk_forward_alloc
  tcp_bpf_sendmsg_redir()       //after this, msg->sg.size = 11000
 goto more_data;
 tosend = msg->sg.size          //msg->sg.size = 11000
 case __SK_REDIRECT:
  sk_msg_return()               //uncharged msg->sg.size (11000) to
                                //sk->sk_forward_alloc

The remaining msg->sg.size (11000) thus gets uncharged twice. To fix this we charge the remaining msg->sg.size back before the goto more_data.
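A worked version of the accounting, using the sizes quoted above (plain C, purely illustrative of the ledger):

#include <assert.h>

int main(void)
{
	long uncharged = 0;	/* net bytes returned to sk->sk_forward_alloc */
	long size = 22220;	/* msg->sg.size at the first pass */

	uncharged += size;	/* pass 1: sk_msg_return() uncharges 22220 */
	size -= 11220;		/* tcp_bpf_sendmsg_redir() consumed 11220 */

	uncharged -= size;	/* the fix: sk_mem_charge(sk, 11000) before the goto */

	uncharged += size;	/* pass 2: sk_msg_return() uncharges 11000 */
	assert(uncharged == 22220);	/* balanced; without the fix it would be 33220 */
	return 0;
}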
This issue can cause the following info:

WARNING: CPU: 0 PID: 9860 at net/core/stream.c:208 sk_stream_kill_queues+0xd4/0x1a0
Call Trace:
 <TASK>
 inet_csk_destroy_sock+0x55/0x110
 __tcp_close+0x279/0x470
 tcp_close+0x1f/0x60
 inet_release+0x3f/0x80
 __sock_release+0x3d/0xb0
 sock_close+0x11/0x20
 __fput+0x92/0x250
 task_work_run+0x6a/0xa0
 do_exit+0x33b/0xb60
 do_group_exit+0x2f/0xa0
 get_signal+0xb6/0x950
 arch_do_signal_or_restart+0xac/0x2a0
 ? vfs_write+0x237/0x290
 exit_to_user_mode_prepare+0xa9/0x200
 syscall_exit_to_user_mode+0x12/0x30
 do_syscall_64+0x46/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
 </TASK>

WARNING: CPU: 0 PID: 2136 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
 <TASK>
 __sk_destruct+0x24/0x1f0
 sk_psock_destroy+0x19b/0x1c0
 process_one_work+0x1b3/0x3c0
 worker_thread+0x30/0x350
 ? process_one_work+0x3c0/0x3c0
 kthread+0xe6/0x110
 ? kthread_complete_and_exit+0x20/0x20
 ret_from_fork+0x22/0x30
 </TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-4-wangyufen@huawei.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/ipv4/tcp_bpf.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index b7731dbb270f..ef3a8046dbbd 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -365,7 +365,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 			cork = true;
 			psock->cork = NULL;
 		}
-		sk_msg_return(sk, msg, tosend);
+		sk_msg_return(sk, msg, msg->sg.size);
 		release_sock(sk);
 
 		ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags);
@@ -405,8 +405,11 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
 		}
 		if (msg &&
 		    msg->sg.data[msg->sg.start].page_link &&
-		    msg->sg.data[msg->sg.start].length)
+		    msg->sg.data[msg->sg.start].length) {
+			if (eval == __SK_REDIRECT)
+				sk_mem_charge(sk, msg->sg.size);
 			goto more_data;
+		}
 	}
 	return ret;
 }
From: Wang Yufen <wangyufen@huawei.com>

stable inclusion
from stable-v5.10.110
commit cd84ea3920aef936c559b63099ef0013ce6b2325
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
[ Upstream commit 2486ab434b2c2a14e9237296db00b1e1b7ae3273 ]
If tcp_bpf_sendmsg is running during a tear down operation, psock may be freed.
tcp_bpf_sendmsg()
 tcp_bpf_send_verdict()
  sk_msg_return()
  tcp_bpf_sendmsg_redir()
   unlikely(!psock)
    sk_msg_free()

The mem of the msg has already been uncharged in tcp_bpf_send_verdict() by sk_msg_return(), and would be uncharged again by sk_msg_free(). When psock is null we can simply return an error code instead; this triggers sk_msg_free_nocharge() in the error path of __SK_REDIRECT and has the side effect of throwing an error up to user space. This is a slight change in behavior from the user's side, but it looks the same as an error returned when the redirect on the socket failed.

This issue can cause the following info:

WARNING: CPU: 0 PID: 2136 at net/ipv4/af_inet.c:155 inet_sock_destruct+0x13c/0x260
Call Trace:
 <TASK>
 __sk_destruct+0x24/0x1f0
 sk_psock_destroy+0x19b/0x1c0
 process_one_work+0x1b3/0x3c0
 worker_thread+0x30/0x350
 ? process_one_work+0x3c0/0x3c0
 kthread+0xe6/0x110
 ? kthread_complete_and_exit+0x20/0x20
 ret_from_fork+0x22/0x30
 </TASK>
Fixes: 604326b41a6f ("bpf, sockmap: convert to generic sk_msg interface")
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: John Fastabend <john.fastabend@gmail.com>
Link: https://lore.kernel.org/bpf/20220304081145.2037182-5-wangyufen@huawei.com
Signed-off-by: Sasha Levin <sashal@kernel.org>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/ipv4/tcp_bpf.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index ef3a8046dbbd..9eaace83231c 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -212,10 +212,9 @@ int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg,
 	struct sk_psock *psock = sk_psock_get(sk);
 	int ret;
 
-	if (unlikely(!psock)) {
-		sk_msg_free(sk, msg);
-		return 0;
-	}
+	if (unlikely(!psock))
+		return -EPIPE;
+
 	ret = ingress ? bpf_tcp_ingress(sk, psock, msg, bytes, flags) :
 			tcp_bpf_push_locked(sk, msg, bytes, flags, false);
 	sk_psock_put(sk, psock);
From: Wang Yufen <wangyufen@huawei.com>

hulk inclusion
category: feature
bugzilla: 186640, https://gitee.com/openeuler/kernel/issues/I545NW
--------------------------------
Consider a TCP socket in a sockmap: if the packet transmit rate is very high and the receive rate very low, a large number of packets pile up in the ingress queue on the receiving side. As a result, memory is exhausted and the system OOMs.

To fix this, we add to sk_rmem_alloc when an sk_msg is queued on the ingress queue, subtract from it when an sk_msg is dequeued, and check sk_rmem_alloc at the beginning of bpf_tcp_ingress().
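The accounting itself is an atomic byte counter used for backpressure. A hedged userspace sketch, with a fixed budget standing in for sk->sk_rcvbuf and hypothetical helper names:

#include <errno.h>
#include <stdatomic.h>

#define RCVBUF_BUDGET (1L << 20)	/* stands in for sk->sk_rcvbuf */

static atomic_long rmem_alloc;		/* stands in for sk->sk_rmem_alloc */

static int ingress_enqueue(long bytes)
{
	if (atomic_load(&rmem_alloc) >= RCVBUF_BUDGET)
		return -EAGAIN;	/* receiver is behind: push back on the sender */
	atomic_fetch_add(&rmem_alloc, bytes);
	/* queue the msg here */
	return 0;
}

static void ingress_dequeue(long bytes)
{
	atomic_fetch_sub(&rmem_alloc, bytes);	/* released as the receiver consumes */
}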
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Reviewed-by: Liu Jian <liujian56@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Reviewed-by: Yue Haibing <yuehaibing@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 net/core/skmsg.c   |  1 +
 net/ipv4/tcp_bpf.c | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 75214e2a2333..02d6d254d94a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -659,6 +659,7 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
 
 	list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) {
 		list_del(&msg->list);
+		atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
 		sk_msg_free(psock->sk, msg);
 		kfree(msg);
 	}
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index 9eaace83231c..afeaf35194de 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -43,8 +43,10 @@ int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock,
 		if (likely(!peek)) {
 			sge->offset += copy;
 			sge->length -= copy;
-			if (!msg_rx->skb)
+			if (!msg_rx->skb) {
+				atomic_sub(copy, &sk->sk_rmem_alloc);
 				sk_mem_uncharge(sk, copy);
+			}
 			msg_rx->sg.size -= copy;
 
 			if (!sge->length) {
@@ -98,6 +100,11 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 		return -ENOMEM;
 
 	lock_sock(sk);
+	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
+		kfree(tmp);
+		release_sock(sk);
+		return -EAGAIN;
+	}
 	tmp->sg.start = msg->sg.start;
 	i = msg->sg.start;
 	do {
@@ -127,6 +134,7 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock,
 	if (!ret) {
 		msg->sg.start = i;
 		sk_psock_queue_msg(psock, tmp);
+		atomic_add(tmp->sg.size, &sk->sk_rmem_alloc);
 		sk_psock_data_ready(sk, psock);
 	} else {
 		sk_msg_free(sk, tmp);
From: linyujun <linyujun809@huawei.com>

mainline inclusion
from mainline-v5.18-rc1
commit 9be4c88bb7924f68f88cfd47d925c2d046f51a73
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I55VUL
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The following KASAN warning is detected by QEMU.
==================================================================
BUG: KASAN: stack-out-of-bounds in unwind_frame+0x508/0x870
Read of size 4 at addr c36bba90 by task cat/163

CPU: 1 PID: 163 Comm: cat Not tainted 5.10.0-rc1 #40
Hardware name: ARM-Versatile Express
[<c0113fac>] (unwind_backtrace) from [<c010e71c>] (show_stack+0x10/0x14)
[<c010e71c>] (show_stack) from [<c0b805b4>] (dump_stack+0x98/0xb0)
[<c0b805b4>] (dump_stack) from [<c0b7d658>] (print_address_description.constprop.0+0x58/0x4bc)
[<c0b7d658>] (print_address_description.constprop.0) from [<c031435c>] (kasan_report+0x154/0x170)
[<c031435c>] (kasan_report) from [<c0113c44>] (unwind_frame+0x508/0x870)
[<c0113c44>] (unwind_frame) from [<c010e298>] (__save_stack_trace+0x110/0x134)
[<c010e298>] (__save_stack_trace) from [<c01ce0d8>] (stack_trace_save+0x8c/0xb4)
[<c01ce0d8>] (stack_trace_save) from [<c0313520>] (kasan_set_track+0x38/0x60)
[<c0313520>] (kasan_set_track) from [<c0314cb8>] (kasan_set_free_info+0x20/0x2c)
[<c0314cb8>] (kasan_set_free_info) from [<c0313474>] (__kasan_slab_free+0xec/0x120)
[<c0313474>] (__kasan_slab_free) from [<c0311e20>] (kmem_cache_free+0x7c/0x334)
[<c0311e20>] (kmem_cache_free) from [<c01c35dc>] (rcu_core+0x390/0xccc)
[<c01c35dc>] (rcu_core) from [<c01013a8>] (__do_softirq+0x180/0x518)
[<c01013a8>] (__do_softirq) from [<c0135214>] (irq_exit+0x9c/0xe0)
[<c0135214>] (irq_exit) from [<c01a40e4>] (__handle_domain_irq+0xb0/0x110)
[<c01a40e4>] (__handle_domain_irq) from [<c0691248>] (gic_handle_irq+0xa0/0xb8)
[<c0691248>] (gic_handle_irq) from [<c0100b0c>] (__irq_svc+0x6c/0x94)
Exception stack(0xc36bb928 to 0xc36bb970)
b920:                   c36bb9c0 00000000 c0126919 c0101228 c36bb9c0 b76d7730
b940: c36b8000 c36bb9a0 c3335b00 c01ce0d8 00000003 c36bba3c c36bb940 c36bb978
b960: c010e298 c011373c 60000013 ffffffff
[<c0100b0c>] (__irq_svc) from [<c011373c>] (unwind_frame+0x0/0x870)
[<c011373c>] (unwind_frame) from [<00000000>] (0x0)

The buggy address belongs to the page:
page:(ptrval) refcount:0 mapcount:0 mapping:00000000 index:0x0 pfn:0x636bb
flags: 0x0()
raw: 00000000 00000000 ef867764 00000000 00000000 00000000 ffffffff 00000000
page dumped because: kasan: bad access detected

addr c36bba90 is located in stack of task cat/163 at offset 48 in frame:
 stack_trace_save+0x0/0xb4

this frame has 1 object:
 [32, 48) 'trace'

Memory state around the buggy address:
 c36bb980: f1 f1 f1 f1 00 04 f2 f2 00 00 f3 f3 00 00 00 00
 c36bba00: 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 f1
>c36bba80: 00 00 f3 f3 00 00 00 00 00 00 00 00 00 00 00 00
                 ^
 c36bbb00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 c36bbb80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
==================================================================
The same issue exists on x86 and has been resolved by commit f7d27c35ddff ("x86/mm, kasan: Silence KASAN warnings in get_wchan()"). The same solution can be applied to the arm architecture.
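The mechanism behind READ_ONCE_NOCHECK() can be approximated in plain C: route the racy read through an accessor the sanitizer is told not to instrument. The sketch below compiles with GCC and Clang; the attribute is inert when ASan/KASAN is off:

#include <stdint.h>

/* Uninstrumented single read, loosely analogous to READ_ONCE_NOCHECK(). */
__attribute__((no_sanitize_address))
static uintptr_t read_word_nocheck(const volatile uintptr_t *p)
{
	return *p;	/* may touch another task's live stack; don't sanitize */
}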
Signed-off-by: Lin Yujun <linyujun809@huawei.com>
Reported-by: He Ying <heying24@huawei.com>
Signed-off-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
Signed-off-by: Lin Yujun <linyujun809@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm/kernel/stacktrace.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c
index db798eac7431..824774999825 100644
--- a/arch/arm/kernel/stacktrace.c
+++ b/arch/arm/kernel/stacktrace.c
@@ -53,17 +53,17 @@ int notrace unwind_frame(struct stackframe *frame)
 		return -EINVAL;
 
 	frame->sp = frame->fp;
-	frame->fp = *(unsigned long *)(fp);
-	frame->pc = *(unsigned long *)(fp + 4);
+	frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp));
+	frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 4));
 #else
 	/* check current frame pointer is within bounds */
 	if (fp < low + 12 || fp > high - 4)
 		return -EINVAL;
 
 	/* restore the registers from the stack frame */
-	frame->fp = *(unsigned long *)(fp - 12);
-	frame->sp = *(unsigned long *)(fp - 8);
-	frame->pc = *(unsigned long *)(fp - 4);
+	frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 12));
+	frame->sp = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 8));
+	frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp - 4));
 #endif
 
 	return 0;
From: Peng Liu <liupeng256@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I56F4Z
CVE: NA
--------------------------------
When EFI is enabled, memblock is only initialized after the early parameters have been parsed, and memblock_setclr_flag() must not be used before memblock has been initialized.

However, "arm64: Request resources for reserved memory via memmap" could call memblock_setclr_flag() while memblock is still empty. To fix this, defer that work until after memblock initialization.
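The fix follows a record-now, apply-later shape. A hedged sketch (names loosely mirror the patch; types simplified):

#include <stdio.h>

#define MAX_RES_REGIONS 32

struct region {
	unsigned long long base, size;
};

static struct region pending[MAX_RES_REGIONS];
static int pending_cnt;

/* Early parameter parsing: only record the request. */
static void record_region(unsigned long long base, unsigned long long size)
{
	if (pending_cnt >= MAX_RES_REGIONS) {
		fprintf(stderr, "too many memmap regions\n");
		return;
	}
	pending[pending_cnt].base = base;
	pending[pending_cnt].size = size;
	pending_cnt++;
}

/* Later, once the allocator is up: apply every recorded request. */
static void apply_regions(void)
{
	int i;

	/* in the patch: memblock_reserve() + memblock_mark_memmap() */
	for (i = 0; i < pending_cnt; i++)
		printf("reserve %#llx + %#llx\n",
		       pending[i].base, pending[i].size);
}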
Fixes: 374db2be8805 ("arm64: Request resources for reserved memory via memmap")
Signed-off-by: Peng Liu <liupeng256@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/arm64/mm/init.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index 0c43739bc6c5..7cee9bd13017 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -269,6 +269,36 @@ static void __init fdt_enforce_memory_region(void)
 		memblock_add(usable_rgns[1].base, usable_rgns[1].size);
 }
 
+#define MAX_RES_REGIONS 32
+
+static struct memblock_region mbk_memmap_regions[MAX_RES_REGIONS] __initdata_memblock;
+static int mbk_memmap_cnt __initdata;
+
+static void __init setup_mbk_memmap_regions(phys_addr_t base, phys_addr_t size)
+{
+	if (mbk_memmap_cnt >= MAX_RES_REGIONS) {
+		pr_err("Too many memmap specified, exceed %d\n", MAX_RES_REGIONS);
+		return;
+	}
+
+	mbk_memmap_regions[mbk_memmap_cnt].base = base;
+	mbk_memmap_regions[mbk_memmap_cnt].size = size;
+	mbk_memmap_cnt++;
+}
+
+static void __init reserve_memmap_regions(void)
+{
+	phys_addr_t base, size;
+	int i;
+
+	for (i = 0; i < mbk_memmap_cnt; i++) {
+		base = mbk_memmap_regions[i].base;
+		size = mbk_memmap_regions[i].size;
+		memblock_reserve(base, size);
+		memblock_mark_memmap(base, size);
+	}
+}
+
 static int need_remove_real_memblock __initdata;
 
 static int __init parse_memmap_one(char *p)
@@ -305,8 +335,7 @@ static int __init parse_memmap_one(char *p)
 		memblock_add(start_at, mem_size);
 	} else if (*p == '$') {
 		start_at = memparse(p + 1, &p);
-		memblock_reserve(start_at, mem_size);
-		memblock_mark_memmap(start_at, mem_size);
+		setup_mbk_memmap_regions(start_at, mem_size);
 	} else if (*p == '!') {
 		start_at = memparse(p + 1, &p);
 		setup_reserve_pmem(start_at, mem_size);
@@ -504,6 +533,8 @@ void __init bootmem_init(void)
 
 	reserve_quick_kexec();
 
+	reserve_memmap_regions();
+
 	reserve_pmem();
 
 	reserve_pin_memory_res();
From: Peng Liu liupeng256@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56F4Z CVE: NA
--------------------------------
For arm64, memmap=exactmap has to remove all memblock regions and then add back the user-assigned ranges. EFI initializes memblock too late, so removing all memblock regions after EFI init would cause a panic. Hence, exactmap can only be used in DT boot mode; add this limitation to Documentation/admin-guide/kernel-parameters.txt.
Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f648dbd7d902..2702a1369c58 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2820,6 +2820,8 @@
memmap=exactmap [KNL,X86,ARM64] Enable setting of an exact E820 and ARM64 memory map, as specified by the user. + For ARM64, this setting is limited to dt boot mode as + exact mapping must be done after initializing memblock. Such memmap=exactmap lines can be constructed based on BIOS output or other requirements. See the memmap=nn@ss option description.
From: Ricardo Koller ricarkol@google.com
mainline inclusion from mainline-v5.15-rc1 commit b9a51949cebcd57bfb9385d9da62ace52564898c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I56LDF
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
vgic_get_irq(intid) is used all over the vgic code in order to get a reference to a struct irq. It warns whenever intid is not a valid number (like when it's a reserved IRQ number). The issue is that this warning can be triggered from userspace (e.g., KVM_IRQ_LINE for intid 1020).
Drop the WARN call from vgic_get_irq.
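For illustration, a minimal userspace sketch of the trigger described above (VM and vGIC setup elided; this is an assumed reproducer shape, not code from the patch):

  #include <linux/kvm.h>
  #include <sys/ioctl.h>

  /* Inject the reserved INTID 1020 as an SPI. The kernel rejects it with
   * -EINVAL, but before this patch it also fired the WARN in vgic_get_irq().
   */
  static void inject_reserved_intid(int vm_fd)
  {
          struct kvm_irq_level irq = {
                  .irq   = (KVM_ARM_IRQ_TYPE_SPI << KVM_ARM_IRQ_TYPE_SHIFT) | 1020,
                  .level = 1,
          };

          ioctl(vm_fd, KVM_IRQ_LINE, &irq);
  }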
Signed-off-by: Ricardo Koller ricarkol@google.com Reviewed-by: Oliver Upton oupton@google.com Signed-off-by: Marc Zyngier maz@kernel.org Link: https://lore.kernel.org/r/20210818213205.598471-1-ricarkol@google.com Signed-off-by: Lin Yujun linyujun809@huawei.com Reviewed-by: Zhang Jianhua chris.zjh@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kvm/vgic/vgic.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 15b666200f0b..afb077b1cda6 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid >= VGIC_MIN_LPI) return vgic_get_lpi(kvm, intid);
- WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; }
From: MichelleJin shjy180909@gmail.com
mainline inclusion from mainline-v5.16-rc1 commit f04ed7d277e842af9934b71b529341d1ba31a9c1 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I55ZC1 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
When rhashtable_init() fails, it returns -EINVAL. However, since its return value is not checked, a failure can lead to the use of uninitialized pointers. So, handle the errors returned by rhashtable_init().
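As a hedged illustration of the pattern the patch applies at each call site (the struct, params, and unwind helper below are placeholders, not the patched code):

  #include <linux/rhashtable.h>

  static int example_net_init(struct example_pernet *pernet)
  {
          int err;

          /* rhashtable_init() can fail (e.g. -EINVAL for bad parameters),
           * so propagate the error and unwind any earlier setup.
           */
          err = rhashtable_init(&pernet->table, &example_rht_params);
          if (err) {
                  example_undo_earlier_setup(pernet);     /* placeholder */
                  return err;
          }
          return 0;
  }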
Signed-off-by: MichelleJin shjy180909@gmail.com Reviewed-by: David Ahern dsahern@kernel.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/ila/ila_xlat.c | 6 +++++- net/ipv6/seg6.c | 8 ++++++-- net/ipv6/seg6_hmac.c | 4 +--- 3 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index a1ac0e3d8c60..47447f0241df 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -610,7 +610,11 @@ int ila_xlat_init_net(struct net *net) if (err) return err;
- rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + err = rhashtable_init(&ilan->xlat.rhash_table, &rht_params); + if (err) { + free_bucket_spinlocks(ilan->xlat.locks); + return err; + }
return 0; } diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index d2f8138e5a73..0b341be7f6aa 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -377,7 +377,11 @@ static int __net_init seg6_net_init(struct net *net) net->ipv6.seg6_data = sdata;
#ifdef CONFIG_IPV6_SEG6_HMAC - seg6_hmac_net_init(net); + if (seg6_hmac_net_init(net)) { + kfree(sdata); + kfree(rcu_dereference_raw(sdata->tun_src)); + return -ENOMEM; + }; #endif
return 0; @@ -391,7 +395,7 @@ static void __net_exit seg6_net_exit(struct net *net) seg6_hmac_net_exit(net); #endif
- kfree(sdata->tun_src); + kfree(rcu_dereference_raw(sdata->tun_src)); kfree(sdata); }
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 687d95dce085..29bc4e7c3046 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -405,9 +405,7 @@ int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net);
- rhashtable_init(&sdata->hmac_infos, &rht_params); - - return 0; + return rhashtable_init(&sdata->hmac_infos, &rht_params); } EXPORT_SYMBOL(seg6_hmac_net_init);
From: MichelleJin shjy180909@gmail.com
mainline inclusion from mainline-v5.16-rc1 commit 23b08260481ca552180130bbef0f3a60df4c092e category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I55ZC1 CVE: NA backport: openEuler-22.03-LTS
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
sdata->tun_src must be freed before sdata, because tun_src is reached through sdata and freeing sdata first turns kfree(rcu_dereference_raw(sdata->tun_src)) into a use-after-free. So, swap the order of the kfree(sdata) and kfree(rcu_dereference_raw(sdata->tun_src)) calls.
Fixes: f04ed7d277e8 ("net: ipv6: check return value of rhashtable_init")
Signed-off-by: MichelleJin shjy180909@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/seg6.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 0b341be7f6aa..bff6bc75cfb7 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -378,8 +378,8 @@ static int __net_init seg6_net_init(struct net *net)
#ifdef CONFIG_IPV6_SEG6_HMAC if (seg6_hmac_net_init(net)) { - kfree(sdata); kfree(rcu_dereference_raw(sdata->tun_src)); + kfree(sdata); return -ENOMEM; }; #endif
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 186690, https://gitee.com/openeuler/kernel/issues/I56U8P
--------------------------------
The following script causes an ENOSPC error while creating a ubi volume:

  ID="0x20,0xa5,0x00,0x16"              # 2G-size 128KB-PEB
  modprobe nandsim id_bytes=$ID
  modprobe ubi
  ubiattach -m0
  ubimkvol /dev/ubi0 -N test -s 400MiB  # ENOSPC returned
Commit c4d51010ef6c32 ("ubi: fastmap: Fix high cpu usage of ubi_bgt by making sure wl_pool not empty") fills free PEBs into the fm pool only after reserving 'beb_rsvd_pebs', which may leave too few free PEBs to fill the fm pool (especially when the UBI device has just been initialized). Then ubimkvol can get ENOSPC right after the ubi device is attached. Fix it by filling the fm pool without reserving 'beb_rsvd_pebs'.
Fixes: c4d51010ef6c32 ("ubi: fastmap: Fix high cpu ... not empty") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/ubi/fastmap-wl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c index 21ea5ca8270b..053ab52668e8 100644 --- a/drivers/mtd/ubi/fastmap-wl.c +++ b/drivers/mtd/ubi/fastmap-wl.c @@ -100,22 +100,28 @@ struct ubi_wl_entry *ubi_wl_get_fm_peb(struct ubi_device *ubi, int anchor) /* * has_enough_free_count - whether ubi has enough free pebs to fill fm pools * @ubi: UBI device description object + * @is_wl_pool: whether UBI is filling wear leveling pool * * This helper function checks whether there are enough free pebs (deducted * by fastmap pebs) to fill fm_pool and fm_wl_pool, above rule works after * there is at least one of free pebs is filled into fm_wl_pool. + * For wear leveling pool, UBI should also reserve free pebs for bad pebs + * handling, because there maybe no enough free pebs for user volumes after + * producing new bad pebs. */ -static bool has_enough_free_count(struct ubi_device *ubi) +static bool has_enough_free_count(struct ubi_device *ubi, bool is_wl_pool) { int fm_used = 0; // fastmap non anchor pebs. + int beb_rsvd_pebs;
if (!ubi->free.rb_node) return false;
+ beb_rsvd_pebs = is_wl_pool ? ubi->beb_rsvd_pebs : 0; if (ubi->fm_wl_pool.size > 0 && !(ubi->ro_mode || ubi->fm_disabled)) fm_used = ubi->fm_size / ubi->leb_size - 1;
- return ubi->free_count - ubi->beb_rsvd_pebs > fm_used; + return ubi->free_count - beb_rsvd_pebs > fm_used; }
/** @@ -151,7 +157,7 @@ void ubi_refill_pools(struct ubi_device *ubi) for (;;) { enough = 0; if (pool->size < pool->max_size) { - if (!has_enough_free_count(ubi)) + if (!has_enough_free_count(ubi, false)) break;
e = wl_get_wle(ubi); @@ -164,7 +170,7 @@ void ubi_refill_pools(struct ubi_device *ubi) enough++;
if (wl_pool->size < wl_pool->max_size) { - if (!has_enough_free_count(ubi)) + if (!has_enough_free_count(ubi, true)) break;
e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF);
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 186693, https://gitee.com/openeuler/kernel/issues/I56UC5
--------------------------------
There is a use-after-free problem for 'eba_tbl' in ubi_create_volume()'s error handling path:
ubi_eba_replace_table(vol, eba_tbl)
  vol->eba_tbl = tbl
out_mapping:
  ubi_eba_destroy_table(eba_tbl)   // Free 'eba_tbl'
out_unlock:
  put_device(&vol->dev)
    vol_release
      kfree(tbl->entries)          // UAF
Fix it by removing the redundant release of 'eba_tbl'. A reproducer is available at [Link].
Fixes: 493cfaeaa0c9b ("mtd: utilize new cdev_device_add helper function") Link: https://bugzilla.kernel.org/show_bug.cgi?id=215965 Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/mtd/ubi/vmt.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c index 04d0e217ea1c..90e5f8d2fe0a 100644 --- a/drivers/mtd/ubi/vmt.c +++ b/drivers/mtd/ubi/vmt.c @@ -309,7 +309,6 @@ int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req) ubi->volumes[vol_id] = NULL; ubi->vol_count -= 1; spin_unlock(&ubi->volumes_lock); - ubi_eba_destroy_table(eba_tbl); out_acc: spin_lock(&ubi->volumes_lock); ubi->rsvd_pebs -= vol->reserved_pebs;
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZJT CVE: NA
--------------------------------
Previously, when we detected that the cpu was overloaded, we throttled offline tasks in exit_to_user_mode_loop() before returning to user mode. Some architectures (e.g., arm64) do not support the QOS scheduler, because on those platforms a task does not return to userspace via exit_to_user_mode_loop(). In order to solve this problem and support the qos scheduler on all architectures, when offline tasks need to be throttled, we set the TIF_NOTIFY_RESUME flag on an offline task when it is picked, and throttle it in tracehook_notify_resume().
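A condensed sketch of the resulting flow (simplified from the diff below; not new code):

  /* Scheduler side: when an offline task is picked on an overloaded cpu,
   * flag it instead of relying on the generic-entry work loop.
   */
  static void qos_schedule_throttle(struct task_struct *p)
  {
          if (this_cpu_read(qos_cpu_overload) && is_offline_task(p))
                  set_notify_resume(p);   /* sets TIF_NOTIFY_RESUME on p */
  }

  /* Task side: every architecture already handles TIF_NOTIFY_RESUME on
   * its way back to user mode, so the throttle point becomes
   * arch-independent.
   */
  static inline void tracehook_notify_resume(struct pt_regs *regs)
  {
          /* ... existing resume work ... */
          sched_qos_offline_wait();       /* throttle the offline task here */
  }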
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: zheng zucheng zhengzucheng@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/tracehook.h | 4 ++++ kernel/entry/common.c | 7 +------ kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++------- 3 files changed, 37 insertions(+), 13 deletions(-)
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index b480e1a07ed8..5913deb26219 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -196,6 +196,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs)
mem_cgroup_handle_over_high(); blkcg_maybe_throttle_current(); +#ifdef CONFIG_QOS_SCHED + sched_qos_offline_wait(); +#endif + }
#endif /* <linux/tracehook.h> */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 18a29ca01bfe..cea3957ebdbc 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -160,10 +160,6 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_SIGPENDING) arch_do_signal(regs);
-#ifdef CONFIG_QOS_SCHED - sched_qos_offline_wait(); -#endif - if (ti_work & _TIF_NOTIFY_RESUME) { tracehook_notify_resume(regs); rseq_handle_notify_resume(NULL, regs); @@ -198,8 +194,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Flush pending rcuog wakeup before the last need_resched() check */ rcu_nocb_flush_deferred_wakeup();
- if (unlikely((ti_work & EXIT_TO_USER_MODE_WORK) || - sched_qos_cpu_overload())) + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work);
arch_exit_to_user_mode_prepare(regs, ti_work); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f577d581166b..a98a04044c8f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -26,6 +26,7 @@ #endif #ifdef CONFIG_QOS_SCHED #include <linux/delay.h> +#include <linux/tracehook.h> #endif
/* @@ -7178,6 +7179,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ }
#ifdef CONFIG_QOS_SCHED + +static inline bool is_offline_task(struct task_struct *p) +{ + return task_group(p)->qos_level == QOS_LEVEL_OFFLINE; +} + static void start_qos_hrtimer(int cpu); static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { @@ -7342,15 +7349,11 @@ void sched_qos_offline_wait(void) rcu_read_lock(); qos_level = task_group(current)->qos_level; rcu_read_unlock(); - if (qos_level != -1 || signal_pending(current)) + if (qos_level != -1 || fatal_signal_pending(current)) break; - msleep_interruptible(sysctl_offline_wait_interval); - } -}
-int sched_qos_cpu_overload(void) -{ - return __this_cpu_read(qos_cpu_overload); + schedule_timeout_killable(msecs_to_jiffies(sysctl_offline_wait_interval)); + } }
static enum hrtimer_restart qos_overload_timer_handler(struct hrtimer *timer) @@ -7383,6 +7386,23 @@ void init_qos_hrtimer(int cpu) hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); hrtimer->function = qos_overload_timer_handler; } + +/* + * To avoid Priority inversion issues, when this cpu is qos_cpu_overload, + * we should schedule offline tasks to run so that they can leave kernel + * critical sections, and throttle them before returning to user mode. + */ +static void qos_schedule_throttle(struct task_struct *p) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (unlikely(this_cpu_read(qos_cpu_overload))) { + if (is_offline_task(p)) + set_notify_resume(p); + } +} + #endif
#ifdef CONFIG_QOS_SCHED_SMT_EXPELLER @@ -7690,9 +7710,14 @@ done: __maybe_unused;
update_misfit_status(p, rq);
+#ifdef CONFIG_QOS_SCHED + qos_schedule_throttle(p); +#endif + #ifdef CONFIG_QOS_SCHED_SMT_EXPELLER qos_smt_expel(this_cpu, p); #endif + return p;
idle:
From: Zhang Qiao zhangqiao22@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4VZJT CVE: NA
--------------------------------
1. The qos throttle reused tg_{throttle,unthrottle}_{up,down}, which write some cfs-bandwidth fields and could therefore corrupt that data. So add dedicated qos_tg_{throttle,unthrottle}_{up,down} helpers for the qos throttle.
2. Callers of walk_tg_tree_from() must hold the RCU read lock; currently this call site holds none, so add it now.
Signed-off-by: Zhang Qiao zhangqiao22@huawei.com Reviewed-by: Chen Hui judy.chenhui@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/sched/fair.c | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a98a04044c8f..5fe13efce378 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5460,6 +5460,10 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
lockdep_assert_held(&rq->lock);
+#ifdef CONFIG_QOS_SCHED + unthrottle_qos_cfs_rqs(cpu_of(rq)); +#endif + rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; @@ -5482,10 +5486,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) unthrottle_cfs_rq(cfs_rq); } rcu_read_unlock(); - -#ifdef CONFIG_QOS_SCHED - unthrottle_qos_cfs_rqs(cpu_of(rq)); -#endif }
#else /* CONFIG_CFS_BANDWIDTH */ @@ -7186,6 +7186,27 @@ static inline bool is_offline_task(struct task_struct *p) }
static void start_qos_hrtimer(int cpu); + +static int qos_tg_unthrottle_up(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count--; + + return 0; +} + +static int qos_tg_throttle_down(struct task_group *tg, void *data) +{ + struct rq *rq = data; + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + + cfs_rq->throttle_count++; + + return 0; +} + static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -7197,7 +7218,7 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq)
/* freeze hierarchy runnable averages while throttled */ rcu_read_lock(); - walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq); + walk_tg_tree_from(cfs_rq->tg, qos_tg_throttle_down, tg_nop, (void *)rq); rcu_read_unlock();
task_delta = cfs_rq->h_nr_running; @@ -7228,7 +7249,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) start_qos_hrtimer(cpu_of(rq));
cfs_rq->throttled = 1; - cfs_rq->throttled_clock = rq_clock(rq);
list_add(&cfs_rq->qos_throttled_list, &per_cpu(qos_throttled_cfs_rq, cpu_of(rq))); @@ -7237,7 +7257,6 @@ static void throttle_qos_cfs_rq(struct cfs_rq *cfs_rq) static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); - struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; int enqueue = 1; unsigned int prev_nr = cfs_rq->h_nr_running; @@ -7247,13 +7266,12 @@ static void unthrottle_qos_cfs_rq(struct cfs_rq *cfs_rq)
cfs_rq->throttled = 0;
- update_rq_clock(rq); - - cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock; list_del_init(&cfs_rq->qos_throttled_list);
/* update hierarchical throttle state */ - walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); + rcu_read_lock(); + walk_tg_tree_from(cfs_rq->tg, tg_nop, qos_tg_unthrottle_up, (void *)rq); + rcu_read_unlock();
if (!cfs_rq->load.weight) return;
From: Zhen Lei thunder.leizhen@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I575ZA CVE: NA
-------------------------------------------------------------------------
Fix the build errors below:

  error: implicit declaration of function ‘reserve_crashkernel_high’
  undefined reference to `request_quick_kexec_res'
  undefined reference to `reserve_quick_kexec'
1. An empty reserve_crashkernel_high() should be defined when CONFIG_KEXEC_CORE is not set.
2. CONFIG_ARM64_PMEM_RESERVE needs to be changed to CONFIG_QUICK_KEXEC; the former was a mistake.
Fixes: d61f4ca086e9 ("arm64: quick_kexec: Move to stand-alone file") Fixes: baac34dd89ca ("arm64: kdump: Use page-level mapping for the high memory of crashkernel") Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/init.c | 4 ++++ arch/arm64/mm/internal.h | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7cee9bd13017..b19bdd48cc43 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -70,6 +70,10 @@ phys_addr_t arm64_dma_phys_limit __ro_after_init; static void __init reserve_crashkernel(void) { } + +static void __init reserve_crashkernel_high(void) +{ +} #endif
/* diff --git a/arch/arm64/mm/internal.h b/arch/arm64/mm/internal.h index e1c6fc36b3b5..9b8e20d87172 100644 --- a/arch/arm64/mm/internal.h +++ b/arch/arm64/mm/internal.h @@ -14,7 +14,7 @@ static inline void __init setup_reserve_pmem(u64 start, u64 size) {} static inline void __init reserve_pmem(void) {} static inline void __init request_pmem_res_resource(void) {} #endif -#ifdef CONFIG_ARM64_PMEM_RESERVE +#ifdef CONFIG_QUICK_KEXEC void __init reserve_quick_kexec(void); void __init request_quick_kexec_res(struct resource *res); #else
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 186638, https://gitee.com/openeuler/kernel/issues/I57PM8 CVE: NA
--------------------------------
Hulk Robot reported a BUG_ON:

==================================================================
EXT4-fs error (device loop3): ext4_mb_generate_buddy:805: group 0,
block bitmap and bg descriptor inconsistent: 25 vs 31513 free clusters
kernel BUG at fs/ext4/ext4_jbd2.c:53!
invalid opcode: 0000 [#1] SMP KASAN PTI
CPU: 0 PID: 25371 Comm: syz-executor.3 Not tainted 5.10.0+ #1
RIP: 0010:ext4_put_nojournal fs/ext4/ext4_jbd2.c:53 [inline]
RIP: 0010:__ext4_journal_stop+0x10e/0x110 fs/ext4/ext4_jbd2.c:116
[...]
Call Trace:
 ext4_write_inline_data_end+0x59a/0x730 fs/ext4/inline.c:795
 generic_perform_write+0x279/0x3c0 mm/filemap.c:3344
 ext4_buffered_write_iter+0x2e3/0x3d0 fs/ext4/file.c:270
 ext4_file_write_iter+0x30a/0x11c0 fs/ext4/file.c:520
 do_iter_readv_writev+0x339/0x3c0 fs/read_write.c:732
 do_iter_write+0x107/0x430 fs/read_write.c:861
 vfs_writev fs/read_write.c:934 [inline]
 do_pwritev+0x1e5/0x380 fs/read_write.c:1031
[...]
==================================================================
Above issue may happen as follows:

cpu1                                  cpu2
__________________________|__________________________
do_pwritev
 vfs_writev
  do_iter_write
   ext4_file_write_iter
    ext4_buffered_write_iter
     generic_perform_write
      ext4_da_write_begin
                                      vfs_fallocate
                                       ext4_fallocate
                                        ext4_convert_inline_data
                                         ext4_convert_inline_data_nolock
                                          ext4_destroy_inline_data_nolock
                                           clear EXT4_STATE_MAY_INLINE_DATA
                                          ext4_map_blocks
                                           ext4_ext_map_blocks
                                            ext4_mb_new_blocks
                                             ext4_mb_regular_allocator
                                              ext4_mb_good_group_nolock
                                               ext4_mb_init_group
                                                ext4_mb_init_cache
                                                 ext4_mb_generate_buddy --> error
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
       ext4_restore_inline_data
        set EXT4_STATE_MAY_INLINE_DATA
       ext4_block_write_begin
      ext4_da_write_end
       ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)
       ext4_write_inline_data_end
        handle=NULL
        ext4_journal_stop(handle)
         __ext4_journal_stop
          ext4_put_nojournal(handle)
           ref_cnt = (unsigned long)handle
           BUG_ON(ref_cnt == 0) ---> BUG_ON
The lock held by ext4_convert_inline_data() is xattr_sem, while the lock held by generic_perform_write() is i_rwsem. Therefore, the two paths can run concurrently.
To solve the above issue, take inode_lock() around ext4_convert_inline_data() in ext4_fallocate(). At the same time, move the ext4_convert_inline_data() call in front of ext4_punch_hole() and remove the similar handling from ext4_punch_hole() itself.
Fixes: 0c8d414f163f ("ext4: let fallocate handle inline data correctly") Cc: stable@vger.kernel.org Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/extents.c | 10 ++++++---- fs/ext4/inode.c | 9 --------- 2 files changed, 6 insertions(+), 13 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 30f2fa7a4dbd..0687e8ffa69d 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4700,15 +4700,17 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
ext4_fc_start_update(inode);
+ inode_lock(inode); + ret = ext4_convert_inline_data(inode); + inode_unlock(inode); + if (ret) + goto exit; + if (mode & FALLOC_FL_PUNCH_HOLE) { ret = ext4_punch_hole(inode, offset, len); goto exit; }
- ret = ext4_convert_inline_data(inode); - if (ret) - goto exit; - if (mode & FALLOC_FL_COLLAPSE_RANGE) { ret = ext4_collapse_range(inode, offset, len); goto exit; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9b2afb51d93e..8a48214d97cf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3952,15 +3952,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); - if (ext4_has_inline_data(inode)) { - down_write(&EXT4_I(inode)->i_mmap_sem); - ret = ext4_convert_inline_data(inode); - up_write(&EXT4_I(inode)->i_mmap_sem); - if (ret) - return ret; - } - /* * Write out all dirty pages to avoid race conditions * Then release them.
From: Willy Tarreau w@1wt.eu
mainline inclusion from mainline-v5.18-rc6 commit f71f01394f742fc4558b3f9f4c7ef4c4cf3b07c8 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I582HK CVE: CVE-2022-1652
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?h=l...
--------------------------------
Interrupt handler bad_flp_intr() may cause a UAF on the recently freed request just to increment the error count. There's no point keeping that one in the request anyway, and since the interrupt handler uses a static pointer to the error which cannot be kept in sync with the pending request, better make it use a static error counter that's reset for each new request. This reset now happens when entering redo_fd_request() for a new request via set_next_request().
One initial concern about a single error counter was that errors on one floppy drive could be reported on another one, but this problem is not real, given that the driver uses a single drive at a time and that PC-compatible controllers have the same limitation anyway because they use shared signals. As such, the error count is always for the "current" drive.
Reported-by: Minh Yuan yuanmingbuaa@gmail.com Suggested-by: Linus Torvalds torvalds@linuxfoundation.org Tested-by: Denis Efremov efremov@linux.com Signed-off-by: Willy Tarreau w@1wt.eu Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Luo Meng luomeng12@huawei.com
Conflicts: drivers/block/floppy.c Reviewed-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/block/floppy.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index aaee15058d18..8fc23f5026f0 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -509,8 +509,8 @@ static unsigned long fdc_busy; static DECLARE_WAIT_QUEUE_HEAD(fdc_wait); static DECLARE_WAIT_QUEUE_HEAD(command_done);
-/* Errors during formatting are counted here. */ -static int format_errors; +/* errors encountered on the current (or last) request */ +static int floppy_errors;
/* Format request descriptor. */ static struct format_descr format_req; @@ -530,7 +530,6 @@ static struct format_descr format_req; static char *floppy_track_buffer; static int max_buffer_sectors;
-static int *errors; typedef void (*done_f)(int); static const struct cont_t { void (*interrupt)(void); @@ -1455,7 +1454,7 @@ static int interpret_errors(void) if (drive_params[current_drive].flags & FTD_MSG) DPRINT("Over/Underrun - retrying\n"); bad = 0; - } else if (*errors >= drive_params[current_drive].max_errors.reporting) { + } else if (floppy_errors >= drive_params[current_drive].max_errors.reporting) { print_errors(); } if (reply_buffer[ST2] & ST2_WC || reply_buffer[ST2] & ST2_BC) @@ -2095,7 +2094,7 @@ static void bad_flp_intr(void) if (!next_valid_format(current_drive)) return; } - err_count = ++(*errors); + err_count = ++floppy_errors; INFBOUND(write_errors[current_drive].badness, err_count); if (err_count > drive_params[current_drive].max_errors.abort) cont->done(0); @@ -2240,9 +2239,8 @@ static int do_format(int drive, struct format_descr *tmp_format_req) return -EINVAL; } format_req = *tmp_format_req; - format_errors = 0; cont = &format_cont; - errors = &format_errors; + floppy_errors = 0; ret = wait_til_done(redo_format, true); if (ret == -EINTR) return -EINTR; @@ -2721,7 +2719,7 @@ static int make_raw_rw_request(void) */ if (!direct || (indirect * 2 > direct * 3 && - *errors < drive_params[current_drive].max_errors.read_track && + floppy_errors < drive_params[current_drive].max_errors.read_track && ((!probing || (drive_params[current_drive].read_track & (1 << drive_state[current_drive].probed_format)))))) { max_size = blk_rq_sectors(current_req); @@ -2846,10 +2844,11 @@ static int set_next_request(void) current_req = list_first_entry_or_null(&floppy_reqs, struct request, queuelist); if (current_req) { - current_req->error_count = 0; + floppy_errors = 0; list_del_init(¤t_req->queuelist); + return 1; } - return current_req != NULL; + return 0; }
/* Starts or continues processing request. Will automatically unlock the @@ -2908,7 +2907,6 @@ static void redo_fd_request(void) _floppy = floppy_type + drive_params[current_drive].autodetect[drive_state[current_drive].probed_format]; } else probing = 0; - errors = &(current_req->error_count); tmp = make_raw_rw_request(); if (tmp < 2) { request_done(tmp);
From: Zhihao Cheng chengzhihao1@huawei.com
hulk inclusion category: bugfix bugzilla: 186784, https://gitee.com/openeuler/kernel/issues/I5851T CVE: NA
--------------------------------
There is a false-positive WARN_ON in the execve(2)/uselib(2) syscalls when racing with a concurrent noexec remount.
execveat                               remount
do_open_execat(path/bin)
 do_filp_open
  path_openat
   do_open
    may_open
     path_noexec()  // PASS
                                       remount(path->mnt, MS_NOEXEC)
WARN_ON(path_noexec(&file->f_path))    // path_noexec() check fails
Since may_open() has already checked the same conditions, fix it by removing the 'S_ISREG' and 'path_noexec' checks from do_open_execat() and uselib(2).
Fixes: 0fd338b2d2cdf8 ("exec: move path_noexec() check earlier") Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/exec.c | 22 +--------------------- 1 file changed, 1 insertion(+), 21 deletions(-)
diff --git a/fs/exec.c b/fs/exec.c index 72f8763b3ce9..2147ae12787c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -143,16 +143,6 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) if (IS_ERR(file)) goto out;
- /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. - */ - error = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) - goto exit; - fsnotify_open(file);
error = -ENOEXEC; @@ -171,7 +161,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) break; } read_unlock(&binfmt_lock); -exit: + fput(file); out: return error; @@ -913,16 +903,6 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) if (IS_ERR(file)) goto out;
- /* - * may_open() has already checked for this, so it should be - * impossible to trip now. But we need to be extra cautious - * and check again at the very end too. - */ - err = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode) || - path_noexec(&file->f_path))) - goto exit; - err = deny_write_access(file); if (err) goto exit;
From: Baokun Li libaokun1@huawei.com
hulk inclusion category: bugfix bugzilla: 186770, https://gitee.com/openeuler/kernel/issues/I58670 CVE: NA
--------------------------------
Hulk Robot reported a BUG_ON:
==================================================================
kernel BUG at fs/ext4/extents_status.c:199!
[...]
RIP: 0010:ext4_es_end fs/ext4/extents_status.c:199 [inline]
RIP: 0010:__es_tree_search+0x1e0/0x260 fs/ext4/extents_status.c:217
[...]
Call Trace:
 ext4_es_cache_extent+0x109/0x340 fs/ext4/extents_status.c:766
 ext4_cache_extents+0x239/0x2e0 fs/ext4/extents.c:561
 ext4_find_extent+0x6b7/0xa20 fs/ext4/extents.c:964
 ext4_ext_map_blocks+0x16b/0x4b70 fs/ext4/extents.c:4384
 ext4_map_blocks+0xe26/0x19f0 fs/ext4/inode.c:567
 ext4_getblk+0x320/0x4c0 fs/ext4/inode.c:980
 ext4_bread+0x2d/0x170 fs/ext4/inode.c:1031
 ext4_quota_read+0x248/0x320 fs/ext4/super.c:6257
 v2_read_header+0x78/0x110 fs/quota/quota_v2.c:63
 v2_check_quota_file+0x76/0x230 fs/quota/quota_v2.c:82
 vfs_load_quota_inode+0x5d1/0x1530 fs/quota/dquot.c:2368
 dquot_enable+0x28a/0x330 fs/quota/dquot.c:2490
 ext4_quota_enable fs/ext4/super.c:6137 [inline]
 ext4_enable_quotas+0x5d7/0x960 fs/ext4/super.c:6163
 ext4_fill_super+0xa7c9/0xdc00 fs/ext4/super.c:4754
 mount_bdev+0x2e9/0x3b0 fs/super.c:1158
 mount_fs+0x4b/0x1e4 fs/super.c:1261
[...]
==================================================================
Above issue may happen as follows:
-------------------------------------
ext4_fill_super
 ext4_enable_quotas
  ext4_quota_enable
   ext4_iget
    __ext4_iget
     ext4_ext_check_inode
      ext4_ext_check
       __ext4_ext_check
        ext4_valid_extent_entries
         Check for overlapping extents doesn't take effect
  dquot_enable
   vfs_load_quota_inode
    v2_check_quota_file
     v2_read_header
      ext4_quota_read
       ext4_bread
        ext4_getblk
         ext4_map_blocks
          ext4_ext_map_blocks
           ext4_find_extent
            ext4_cache_extents
             ext4_es_cache_extent
              ext4_es_cache_extent
               __es_tree_search
                ext4_es_end
                 BUG_ON(es->es_lblk + es->es_len < es->es_lblk)
The erroneous ext4 extents are as follows:

  0af3 0300 0400 0000 00000000    extent_header
  00000000 0100 0000 12000000    extent1
  00000000 0100 0000 18000000    extent2
  02000000 0400 0000 14000000    extent3
In ext4_valid_extent_entries(), if prev is 0, no error is returned even if lblock <= prev. This was intended to skip the check on the first extent, but in the corrupt image above, prev = 0 + 1 - 1 = 0 when checking the second extent, so even though lblock <= prev, the function does not return an error. As a result, the BUG_ON fires in __es_tree_search and the system panics.
To solve this problem, we only need to check that:
1. The lblock of the first extent is not less than 0.
2. The lblock of each subsequent extent is not less than the block following the previous extent.
The same applies to extent_idx.
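As a worked check against the corrupt image above: extent1 has ee_block 0 and length 1, so the new code sets cur = 0 + 1 = 1; extent2 also has ee_block 0, and since 0 < 1, the function now returns 0 and the extent tree is rejected. Under the old check, prev was 0 + 1 - 1 = 0 at that point, so '(lblock <= prev) && prev' evaluated to false and the overlap slipped through.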
Fixes: 5946d089379a ("ext4: check for overlapping extents in ext4_valid_extent_entries()") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Baokun Li libaokun1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/extents.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0687e8ffa69d..4323186bae78 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -371,7 +371,7 @@ static int ext4_valid_extent_entries(struct inode *inode, { unsigned short entries; ext4_lblk_t lblock = 0; - ext4_lblk_t prev = 0; + ext4_lblk_t cur = 0;
if (eh->eh_entries == 0) return 1; @@ -395,11 +395,11 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping extents */ lblock = le32_to_cpu(ext->ee_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_ext_pblock(ext); return 0; } - prev = lblock + ext4_ext_get_actual_len(ext) - 1; + cur = lblock + ext4_ext_get_actual_len(ext); ext++; entries--; } @@ -419,13 +419,13 @@ static int ext4_valid_extent_entries(struct inode *inode,
/* Check for overlapping index extents */ lblock = le32_to_cpu(ext_idx->ei_block); - if ((lblock <= prev) && prev) { + if (lblock < cur) { *pblk = ext4_idx_pblock(ext_idx); return 0; } ext_idx++; entries--; - prev = lblock; + cur = lblock + 1; } } return 1;
From: Lecopzer Chen lecopzer.chen@mediatek.com
mainline inclusion from mainline-5.13-rc1 commit 9a0732efa77418fc85b1bdc5ddee619e62f59545 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I57SUK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Linux has supported KASAN for vmalloc since commit 3c5c3cfb9ef4da9 ("kasan: support backing vmalloc space with real shadow memory").
As is already done for MODULES_VADDR, simply do not early-populate the shadow for the region between VMALLOC_START and VMALLOC_END.
Before:
MODULE_VADDR:  no mapping, no zero shadow at init
VMALLOC_VADDR: backed with zero shadow at init
After:
MODULE_VADDR:  no mapping, no zero shadow at init
VMALLOC_VADDR: no mapping, no zero shadow at init
Thus the mapping will get allocated on demand by the core function of KASAN_VMALLOC.
 -----------  vmalloc_shadow_start
|           |
|           |
|           | <= non-mapping
|           |
|           |
|-----------|
|///////////|<- kimage shadow with page table mapping.
|-----------|
|           |
|           | <= non-mapping
|           |
------------- vmalloc_shadow_end
|00000000000|
|00000000000| <= Zero shadow
|00000000000|
------------- KASAN_SHADOW_END
Signed-off-by: Lecopzer Chen lecopzer.chen@mediatek.com Acked-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Ard Biesheuvel ardb@kernel.org Link: https://lore.kernel.org/r/20210324040522.15548-2-lecopzer.chen@mediatek.com [catalin.marinas@arm.com: add a build check on VMALLOC_START != MODULES_END] Signed-off-by: Catalin Marinas catalin.marinas@arm.com (cherry picked from commit 9a0732efa77418fc85b1bdc5ddee619e62f59545) Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/kasan_init.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index b24e43d20667..e7267bff73e3 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -212,6 +212,7 @@ void __init kasan_init(void) { u64 kimg_shadow_start, kimg_shadow_end; u64 mod_shadow_start, mod_shadow_end; + u64 vmalloc_shadow_end; phys_addr_t pa_start, pa_end; u64 i;
@@ -221,6 +222,8 @@ void __init kasan_init(void) mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END);
+ vmalloc_shadow_end = (u64)kasan_mem_to_shadow((void *)VMALLOC_END); + /* * We are going to perform proper setup of shadow memory. * At first we should unmap early shadow (clear_pgds() call below). @@ -239,12 +242,18 @@ void __init kasan_init(void)
kasan_populate_early_shadow(kasan_mem_to_shadow((void *)PAGE_END), (void *)mod_shadow_start); - kasan_populate_early_shadow((void *)kimg_shadow_end, - (void *)KASAN_SHADOW_END);
- if (kimg_shadow_start > mod_shadow_end) - kasan_populate_early_shadow((void *)mod_shadow_end, - (void *)kimg_shadow_start); + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) { + BUILD_BUG_ON(VMALLOC_START != MODULES_END); + kasan_populate_early_shadow((void *)vmalloc_shadow_end, + (void *)KASAN_SHADOW_END); + } else { + kasan_populate_early_shadow((void *)kimg_shadow_end, + (void *)KASAN_SHADOW_END); + if (kimg_shadow_start > mod_shadow_end) + kasan_populate_early_shadow((void *)mod_shadow_end, + (void *)kimg_shadow_start); + }
for_each_mem_range(i, &pa_start, &pa_end) { void *start = (void *)__phys_to_virt(pa_start);
From: Lecopzer Chen lecopzer.chen@mediatek.com
mainline inclusion from mainline-5.13-rc1 commit 7d7b88ff5f8fd79a72baacc521407a3101f0ffae category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I57SUK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Arm64 provides defined macros KERNEL_START and KERNEL_END, so replace _text and _end with these abstractions.
Signed-off-by: Lecopzer Chen lecopzer.chen@mediatek.com Acked-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Ard Biesheuvel ardb@kernel.org Link: https://lore.kernel.org/r/20210324040522.15548-3-lecopzer.chen@mediatek.com Signed-off-by: Catalin Marinas catalin.marinas@arm.com (cherry picked from commit 7d7b88ff5f8fd79a72baacc521407a3101f0ffae) Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/mm/kasan_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index e7267bff73e3..02051d4074c4 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -216,8 +216,8 @@ void __init kasan_init(void) phys_addr_t pa_start, pa_end; u64 i;
- kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK; - kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end)); + kimg_shadow_start = (u64)kasan_mem_to_shadow(KERNEL_START) & PAGE_MASK; + kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(KERNEL_END));
mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); @@ -238,7 +238,7 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
kasan_map_populate(kimg_shadow_start, kimg_shadow_end, - early_pfn_to_nid(virt_to_pfn(lm_alias(_text)))); + early_pfn_to_nid(virt_to_pfn(lm_alias(KERNEL_START))));
kasan_populate_early_shadow(kasan_mem_to_shadow((void *)PAGE_END), (void *)mod_shadow_start);
From: Lecopzer Chen lecopzer.chen@mediatek.com
mainline inclusion from mainline-5.13-rc1 commit 71b613fc0c69080262f4ebe810c3d909d7ff8132 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I57SUK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
We can back shadow memory in the vmalloc area now that the vmalloc area is no longer populated at kasan_init(), thus make KASAN_VMALLOC selectable.
Signed-off-by: Lecopzer Chen lecopzer.chen@mediatek.com Acked-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Andrey Konovalov andreyknvl@gmail.com Tested-by: Ard Biesheuvel ardb@kernel.org Link: https://lore.kernel.org/r/20210324040522.15548-4-lecopzer.chen@mediatek.com Signed-off-by: Catalin Marinas catalin.marinas@arm.com (cherry picked from commit 71b613fc0c69080262f4ebe810c3d909d7ff8132) Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 28b4e754e856..e253fdba1249 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -138,6 +138,7 @@ config ARM64 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_JUMP_LABEL_RELATIVE select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) + select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN select HAVE_ARCH_KFENCE select HAVE_ARCH_KGDB
From: Lecopzer Chen lecopzer.chen@mediatek.com
mainline inclusion from mainline-5.13-rc1 commit 31d02e7ab00873befd2cfb6e44581490d947c38b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I57SUK CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Now that KASAN_VMALLOC works on arm64, we can randomize the module region into the vmalloc area.
Test: VMALLOC area ffffffc010000000 - fffffffdf0000000

before the patch: module_alloc_base/end ffffffc008b80000 ffffffc010000000
after the patch:  module_alloc_base/end ffffffdcf4bed000 ffffffc010000000

And insmod'ing modules continues to work fine.
Suggested-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Lecopzer Chen lecopzer.chen@mediatek.com Link: https://lore.kernel.org/r/20210324040522.15548-5-lecopzer.chen@mediatek.com Signed-off-by: Catalin Marinas catalin.marinas@arm.com (cherry picked from commit 31d02e7ab00873befd2cfb6e44581490d947c38b)
[KF: 5.10 doesn't support hardware tag-based KASAN (KASAN_HW_TAGS); context conflict fixup in kaslr.c/module.c] Signed-off-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Peng Liu liupeng256@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/kaslr.c | 15 ++++++++------- arch/arm64/kernel/module.c | 13 +++++++------ 2 files changed, 15 insertions(+), 13 deletions(-)
diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index b181e0544b79..9181d2856be3 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -151,14 +151,15 @@ u64 __init kaslr_early_init(u64 dt_phys) /* use the top 16 bits to randomize the linear region */ memstart_offset_seed = seed >> 48;
- if (IS_ENABLED(CONFIG_KASAN)) + if (!IS_ENABLED(CONFIG_KASAN_VMALLOC) && (IS_ENABLED(CONFIG_KASAN))) /* - * KASAN does not expect the module region to intersect the - * vmalloc region, since shadow memory is allocated for each - * module at load time, whereas the vmalloc region is shadowed - * by KASAN zero pages. So keep modules out of the vmalloc - * region if KASAN is enabled, and put the kernel well within - * 4 GB of the module region. + * KASAN without KASAN_VMALLOC does not expect the module region + * to intersect the vmalloc region, since shadow memory is + * allocated for each module at load time, whereas the vmalloc + * region is shadowed by KASAN zero pages. So keep modules + * out of the vmalloc region if KASAN is enabled without + * KASAN_VMALLOC, and put the kernel well within 4 GB of the + * module region. */ return offset % SZ_2G;
diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 031be3c6a9d5..acd557c83b6f 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -39,13 +39,14 @@ void *module_alloc(unsigned long size) NUMA_NO_NODE, __builtin_return_address(0));
if (!p && IS_ENABLED(CONFIG_ARM64_MODULE_PLTS) && - !IS_ENABLED(CONFIG_KASAN)) + (IS_ENABLED(CONFIG_KASAN_VMALLOC) || !IS_ENABLED(CONFIG_KASAN))) /* - * KASAN can only deal with module allocations being served - * from the reserved module region, since the remainder of - * the vmalloc region is already backed by zero shadow pages, - * and punching holes into it is non-trivial. Since the module - * region is not randomized when KASAN is enabled, it is even + * KASAN without KASAN_VMALLOC can only deal with module + * allocations being served from the reserved module region, + * since the remainder of the vmalloc region is already + * backed by zero shadow pages, and punching holes into it + * is non-trivial. Since the module region is not randomized + * when KASAN is enabled without KASAN_VMALLOC, it is even * less likely that the module region gets exhausted, so we * can simply omit this fallback in that case. */
From: Eric Dumazet edumazet@google.com
stable inclusion from stable-v5.10.113 commit 43ce33a68e2bcc431097e1075aad5393d0bf53ba bugzilla: 186701, https://gitee.com/src-openeuler/kernel/issues/I5850T CVE: CVE-2022-29581
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 3db09e762dc79584a69c10d74a6b98f89a9979f8 upstream.
We are now able to detect extra put_net() at the moment they happen, instead of much later in correct code paths.
u32_init_knode() / tcf_exts_init() populates the ->exts.net pointer, but as mentioned in tcf_exts_init(), the refcount on netns has not been elevated yet.
The refcount is taken only once tcf_exts_get_net() is called.
So the two u32_destroy_key() calls from u32_change() are attempting to release an invalid reference on the netns.
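A condensed sketch of the refcount lifecycle at issue (simplified; the comments paraphrase the commit message):

  tcf_exts_init(&n->exts, net, TCA_U32_ACT, TCA_U32_POLICE);
                          /* exts.net is set, netns refcount NOT taken */

  tcf_exts_get_net(&n->exts);   /* only here is the netns refcount elevated */

  tcf_exts_put_net(&n->exts);   /* must be balanced with a prior *_get_net();
                                 * running it on a key that never got the net
                                 * underflows the refcount, as in the report
                                 * below */

The fix splits out __u32_destroy_key(), which frees the key without touching the netns refcount, for the u32_change() error paths where tcf_exts_get_net() has not been called yet.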
syzbot report:
refcount_t: decrement hit 0; leaking memory.
WARNING: CPU: 0 PID: 21708 at lib/refcount.c:31 refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31
Modules linked in:
CPU: 0 PID: 21708 Comm: syz-executor.5 Not tainted 5.18.0-rc2-next-20220412-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
RIP: 0010:refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31
Code: 1d 14 b6 b2 09 31 ff 89 de e8 6d e9 89 fd 84 db 75 e0 e8 84 e5 89 fd 48 c7 c7 40 aa 26 8a c6 05 f4 b5 b2 09 01 e8 e5 81 2e 05 <0f> 0b eb c4 e8 68 e5 89 fd 0f b6 1d e3 b5 b2 09 31 ff 89 de e8 38
RSP: 0018:ffffc900051af1b0 EFLAGS: 00010286
RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000
RDX: 0000000000040000 RSI: ffffffff8160a0c8 RDI: fffff52000a35e28
RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000
R10: ffffffff81604a9e R11: 0000000000000000 R12: 1ffff92000a35e3b
R13: 00000000ffffffef R14: ffff8880211a0194 R15: ffff8880577d0a00
FS:  00007f25d183e700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 00007f19c859c028 CR3: 0000000051009000 CR4: 00000000003506f0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 __refcount_dec include/linux/refcount.h:344 [inline]
 refcount_dec include/linux/refcount.h:359 [inline]
 ref_tracker_free+0x535/0x6b0 lib/ref_tracker.c:118
 netns_tracker_free include/net/net_namespace.h:327 [inline]
 put_net_track include/net/net_namespace.h:341 [inline]
 tcf_exts_put_net include/net/pkt_cls.h:255 [inline]
 u32_destroy_key.isra.0+0xa7/0x2b0 net/sched/cls_u32.c:394
 u32_change+0xe01/0x3140 net/sched/cls_u32.c:909
 tc_new_tfilter+0x98d/0x2200 net/sched/cls_api.c:2148
 rtnetlink_rcv_msg+0x80d/0xb80 net/core/rtnetlink.c:6016
 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2495
 netlink_unicast_kernel net/netlink/af_netlink.c:1319 [inline]
 netlink_unicast+0x543/0x7f0 net/netlink/af_netlink.c:1345
 netlink_sendmsg+0x904/0xe00 net/netlink/af_netlink.c:1921
 sock_sendmsg_nosec net/socket.c:705 [inline]
 sock_sendmsg+0xcf/0x120 net/socket.c:725
 ____sys_sendmsg+0x6e2/0x800 net/socket.c:2413
 ___sys_sendmsg+0xf3/0x170 net/socket.c:2467
 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2496
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f25d0689049
Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
RSP: 002b:00007f25d183e168 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
RAX: ffffffffffffffda RBX: 00007f25d079c030 RCX: 00007f25d0689049
RDX: 0000000000000000 RSI: 0000000020000340 RDI: 0000000000000005
RBP: 00007f25d06e308d R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007ffd0b752e3f R14: 00007f25d183e300 R15: 0000000000022000
 </TASK>
Fixes: 35c55fc156d8 ("cls_u32: use tcf_exts_get_net() before call_rcu()") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Cc: Cong Wang xiyou.wangcong@gmail.com Cc: Jiri Pirko jiri@resnulli.us Acked-by: Jamal Hadi Salim jhs@mojatatu.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Wang Weiyang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/sched/cls_u32.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 54209a18d7fe..b61db335c49d 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -386,14 +386,19 @@ static int u32_init(struct tcf_proto *tp) return 0; }
-static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) +static void __u32_destroy_key(struct tc_u_knode *n) { struct tc_u_hnode *ht = rtnl_dereference(n->ht_down);
tcf_exts_destroy(&n->exts); - tcf_exts_put_net(&n->exts); if (ht && --ht->refcnt == 0) kfree(ht); + kfree(n); +} + +static void u32_destroy_key(struct tc_u_knode *n, bool free_pf) +{ + tcf_exts_put_net(&n->exts); #ifdef CONFIG_CLS_U32_PERF if (free_pf) free_percpu(n->pf); @@ -402,8 +407,7 @@ static int u32_destroy_key(struct tc_u_knode *n, bool free_pf) if (free_pf) free_percpu(n->pcpu_success); #endif - kfree(n); - return 0; + __u32_destroy_key(n); }
/* u32_delete_key_rcu should be called when free'ing a copied @@ -898,13 +902,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, tca[TCA_RATE], ovr, extack);
if (err) { - u32_destroy_key(new, false); + __u32_destroy_key(new); return err; }
err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { - u32_destroy_key(new, false); + __u32_destroy_key(new); return err; }
From: Eric Dumazet edumazet@google.com
mainline inclusion from mainline-v5.18-rc1 commit 086d49058cd8471046ae9927524708820f5fd1c7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I58CJX?from=project-issue CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
IPv6 has this hack changing sk->sk_prot when an IPv6 socket is 'converted' to an IPv4 one with IPV6_ADDRFORM option.
This operation is only performed for TCP and UDP, knowing their 'struct proto' for the two network families are populated in the same way, and can not disappear while a reader might use and dereference sk->sk_prot.
If we think about it, all reads of sk->sk_prot performed while neither the socket lock nor RTNL is acquired should be using READ_ONCE().
Also note that other layers like MPTCP, XFRM, CHELSIO_TLS also write over sk->sk_prot.
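A minimal sketch of the annotation pattern applied here (the diff's call sites follow this shape; this is not the complete patch):

  /* Writer side, running under lock_sock(sk) in do_ipv6_setsockopt(): */
  WRITE_ONCE(sk->sk_prot, &tcp_prot);

  /* Lockless reader, holding neither the socket lock nor RTNL: */
  const struct proto *prot = READ_ONCE(sk->sk_prot);

  if (prot->bind)
          return prot->bind(sk, uaddr, addr_len);

READ_ONCE()/WRITE_ONCE() add no ordering here; they only guarantee the pointer is loaded and stored exactly once and without tearing, so a reader never sees a half-updated sk_prot.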
BUG: KCSAN: data-race in inet6_recvmsg / ipv6_setsockopt
write to 0xffff8881386f7aa8 of 8 bytes by task 26932 on cpu 0:
 do_ipv6_setsockopt net/ipv6/ipv6_sockglue.c:492 [inline]
 ipv6_setsockopt+0x3758/0x3910 net/ipv6/ipv6_sockglue.c:1019
 udpv6_setsockopt+0x85/0x90 net/ipv6/udp.c:1649
 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3489
 __sys_setsockopt+0x209/0x2a0 net/socket.c:2180
 __do_sys_setsockopt net/socket.c:2191 [inline]
 __se_sys_setsockopt net/socket.c:2188 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2188
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff8881386f7aa8 of 8 bytes by task 26911 on cpu 1:
 inet6_recvmsg+0x7a/0x210 net/ipv6/af_inet6.c:659
 ____sys_recvmsg+0x16c/0x320
 ___sys_recvmsg net/socket.c:2674 [inline]
 do_recvmmsg+0x3f5/0xae0 net/socket.c:2768
 __sys_recvmmsg net/socket.c:2847 [inline]
 __do_sys_recvmmsg net/socket.c:2870 [inline]
 __se_sys_recvmmsg net/socket.c:2863 [inline]
 __x64_sys_recvmmsg+0xde/0x160 net/socket.c:2863
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0xffffffff85e0e980 -> 0xffffffff85e01580
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 26911 Comm: syz-executor.3 Not tainted 5.17.0-rc2-syzkaller-00316-g0457e5153e0e-dirty #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net
conflicts: net/ipv6/af_inet6.c
Signed-off-by: Xu Jia xujia39@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv6/af_inet6.c | 23 +++++++++++++++++------ net/ipv6/ipv6_sockglue.c | 6 ++++-- 2 files changed, 21 insertions(+), 8 deletions(-)
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index f141a441490a..9c9ab6b4aec0 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -439,11 +439,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + const struct proto *prot; int err = 0;
/* If the socket has its own bind function then use it. */ - if (sk->sk_prot->bind) - return sk->sk_prot->bind(sk, uaddr, addr_len); + prot = READ_ONCE(sk->sk_prot); + if (prot->bind) + return prot->bind(sk, uaddr, addr_len);
if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; @@ -551,6 +553,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; struct sock *sk = sock->sk; struct net *net = sock_net(sk); + const struct proto *prot;
switch (cmd) { case SIOCADDRT: @@ -568,9 +571,11 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr(net, argp); default: - if (!sk->sk_prot->ioctl) + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (!prot->ioctl) return -ENOIOCTLCMD; - return sk->sk_prot->ioctl(sk, cmd, arg); + return prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return 0; @@ -632,11 +637,14 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_sendmsg(struct sock *, struct msghdr *, int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; + const struct proto *prot;
if (unlikely(inet_send_prepare(sk))) return -EAGAIN;
- return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + return INDIRECT_CALL_2(prot->sendmsg, tcp_sendmsg, udpv6_sendmsg, sk, msg, size); }
@@ -646,13 +654,16 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int addr_len = 0; int err;
if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk);
- err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + err = INDIRECT_CALL_2(prot->recvmsg, tcp_recvmsg, udpv6_recvmsg, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); if (err >= 0) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 43a894bf9a1b..1ae33a882b9a 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -473,7 +473,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); local_bh_enable(); - sk->sk_prot = &tcp_prot; + /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + WRITE_ONCE(sk->sk_prot, &tcp_prot); icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; @@ -487,7 +488,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); local_bh_enable(); - sk->sk_prot = prot; + /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; }
From: Zhang Yi yi.zhang@huawei.com
hulk inclusion category: bugfix bugzilla: 186737, https://gitee.com/openeuler/kernel/issues/I58COJ CVE: NA
--------------------------------
We already check the write I/O error and uptodate flags before submitting the superblock buffer, and re-set the uptodate flag if a previous write failed. But this check is lockless and can race with another ext4_commit_super(), finally triggering the '!uptodate' WARNING when marking the buffer dirty. Fix it by locking the buffer and submitting it directly.
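As a rough sketch of the race window, here is a pthread model with invented names (fake_bh, commit_super), not ext4 code. Two committers both run the check-then-act sequence on the buffer flags; taking the buffer lock around it, as this patch does, keeps the flag repair and the submission atomic with respect to each other:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* A fake buffer head: just the flags involved in the race. */
struct fake_bh {
	pthread_mutex_t lock;   /* stands in for lock_buffer()/unlock_buffer() */
	bool uptodate;
	bool write_io_error;
	bool dirty;
};

static void submit_write(struct fake_bh *bh)
{
	/* Pretend the write succeeded; the real code would submit_bh()
	 * and wait_on_buffer() here. */
	bh->write_io_error = false;
}

static void *commit_super(void *arg)
{
	struct fake_bh *bh = arg;

	pthread_mutex_lock(&bh->lock);  /* the fix: serialize check-then-act */
	if (bh->write_io_error || !bh->uptodate) {
		bh->write_io_error = false;
		bh->uptodate = true;
	}
	/* Submit directly instead of mark_buffer_dirty(), so nobody can
	 * observe a dirty-but-not-uptodate buffer in between. */
	bh->dirty = false;
	submit_write(bh);
	pthread_mutex_unlock(&bh->lock);
	return NULL;
}

int main(void)
{
	struct fake_bh bh = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.uptodate = false,       /* a previous write failed */
		.write_io_error = true,
	};
	pthread_t t1, t2;

	pthread_create(&t1, NULL, commit_super, &bh);
	pthread_create(&t2, NULL, commit_super, &bh);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("uptodate=%d write_io_error=%d\n", bh.uptodate, bh.write_io_error);
	return 0;
}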
Signed-off-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zhihao Cheng chengzhihao1@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/super.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a6419161f856..9c004269f8d6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5638,7 +5638,6 @@ static void ext4_update_super(struct super_block *sb) static int ext4_commit_super(struct super_block *sb) { struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0;
if (!sbh) return -EINVAL; @@ -5647,6 +5646,13 @@ static int ext4_commit_super(struct super_block *sb)
ext4_update_super(sb);
+ lock_buffer(sbh); + /* Buffer got discarded which means block device got invalidated */ + if (!buffer_mapped(sbh)) { + unlock_buffer(sbh); + return -EIO; + } + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -5661,17 +5667,20 @@ static int ext4_commit_super(struct super_block *sb) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } - BUFFER_TRACE(sbh, "marking dirty"); - mark_buffer_dirty(sbh); - error = __sync_dirty_buffer(sbh, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); + get_bh(sbh); + sbh->b_end_io = end_buffer_write_sync; + clear_buffer_dirty(sbh); + submit_bh(REQ_OP_WRITE, + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0), sbh); + wait_on_buffer(sbh); if (buffer_write_io_error(sbh)) { ext4_msg(sb, KERN_ERR, "I/O error while writing " "superblock"); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); + return -EIO; } - return error; + return 0; }
/*
From: Ye Bin yebin10@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I585D4?from=project-issue CVE: NA
---------------------------
We got the following issue:

EXT4-fs (loop0): mounted filesystem without journal. Opts: ,errors=continue
ext4_get_first_dir_block: bh->b_data=0xffff88810bee6000 len=34478
ext4_get_first_dir_block: *parent_de=0xffff88810beee6ae bh->b_data=0xffff88810bee6000
ext4_rename_dir_prepare: [1] parent_de=0xffff88810beee6ae
==================================================================
BUG: KASAN: use-after-free in ext4_rename_dir_prepare+0x152/0x220
Read of size 4 at addr ffff88810beee6ae by task rep/1895
CPU: 13 PID: 1895 Comm: rep Not tainted 5.10.0+ #241
Call Trace:
 dump_stack+0xbe/0xf9
 print_address_description.constprop.0+0x1e/0x220
 kasan_report.cold+0x37/0x7f
 ext4_rename_dir_prepare+0x152/0x220
 ext4_rename+0xf44/0x1ad0
 ext4_rename2+0x11c/0x170
 vfs_rename+0xa84/0x1440
 do_renameat2+0x683/0x8f0
 __x64_sys_renameat+0x53/0x60
 do_syscall_64+0x33/0x40
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f45a6fc41c9
RSP: 002b:00007ffc5a470218 EFLAGS: 00000246 ORIG_RAX: 0000000000000108
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f45a6fc41c9
RDX: 0000000000000005 RSI: 0000000020000180 RDI: 0000000000000005
RBP: 00007ffc5a470240 R08: 00007ffc5a470160 R09: 0000000020000080
R10: 00000000200001c0 R11: 0000000000000246 R12: 0000000000400bb0
R13: 00007ffc5a470320 R14: 0000000000000000 R15: 0000000000000000
The buggy address belongs to the page:
page:00000000440015ce refcount:0 mapcount:0 mapping:0000000000000000 index:0x1 pfn:0x10beee
flags: 0x200000000000000()
raw: 0200000000000000 ffffea00043ff4c8 ffffea0004325608 0000000000000000
raw: 0000000000000001 0000000000000000 00000000ffffffff 0000000000000000
page dumped because: kasan: bad access detected
Memory state around the buggy address:
 ffff88810beee580: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff88810beee600: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>ffff88810beee680: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
                                  ^
 ffff88810beee700: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 ffff88810beee780: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
==================================================================
Disabling lock debugging due to kernel taint
ext4_rename_dir_prepare: [2] parent_de->inode=3537895424
ext4_rename_dir_prepare: [3] dir=0xffff888124170140
ext4_rename_dir_prepare: [4] ino=2
ext4_rename_dir_prepare: ent->dir->i_ino=2 parent=-757071872
The reason is that the first directory entry has a 'rec_len' of 34478, which runs past the end of the block, so we end up with an illegal parent entry. Currently we do not check the directory entries after reading the directory block in 'ext4_get_first_dir_block'. To solve this issue, validate the '.' and '..' directory entries in 'ext4_get_first_dir_block'.
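For illustration, here is a user-space sketch of the added validation, assuming a simplified stand-in layout (fake_dirent) rather than the real on-disk ext4_dir_entry_2 and ext4_check_dir_entry(). It rejects a first block whose '.' or '..' entry is missing or whose rec_len escapes the block, as with the rec_len=34478 in the report:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

struct fake_dirent {
	uint32_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

static bool entry_ok(const struct fake_dirent *de, size_t blocksize,
		     size_t offset)
{
	size_t rlen = de->rec_len;

	/* rec_len must be minimally sized, 4-byte aligned, hold the name,
	 * and must not walk past the end of the block. */
	return rlen >= 12 && rlen % 4 == 0 &&
	       offset + rlen <= blocksize &&
	       (size_t)de->name_len + 8 <= rlen;
}

static bool first_block_ok(const char *buf, size_t blocksize, uint32_t dir_ino)
{
	const struct fake_dirent *de = (const struct fake_dirent *)buf;
	size_t offset = 0;

	if (!entry_ok(de, blocksize, offset) || de->inode != dir_ino ||
	    de->name_len != 1 || memcmp(de->name, ".", 1) != 0)
		return false;			/* directory missing '.' */

	offset = de->rec_len;
	de = (const struct fake_dirent *)(buf + offset);
	if (!entry_ok(de, blocksize, offset) || de->inode == 0 ||
	    de->name_len != 2 || memcmp(de->name, "..", 2) != 0)
		return false;			/* directory missing '..' */

	return true;
}

int main(void)
{
	_Alignas(uint32_t) char block[1024] = { 0 };
	struct fake_dirent *de = (struct fake_dirent *)block;

	/* "." entry for inode 2, then ".." spanning the rest of the block. */
	de->inode = 2; de->rec_len = 12; de->name_len = 1;
	memcpy(de->name, ".", 1);
	de = (struct fake_dirent *)(block + 12);
	de->inode = 2; de->rec_len = 1024 - 12; de->name_len = 2;
	memcpy(de->name, "..", 2);
	assert(first_block_ok(block, sizeof(block), 2));

	/* Corrupt rec_len the way the reproducer does. */
	((struct fake_dirent *)block)->rec_len = 34478;
	assert(!first_block_ok(block, sizeof(block), 2));
	return 0;
}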
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Li Nan linan122@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/namei.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a2193f21f418..f7279ff5f975 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3526,6 +3526,9 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) { + struct ext4_dir_entry_2 *de; + unsigned int offset; + /* The first directory block must not be a hole, so * treat it as DIRENT_HTREE */ @@ -3534,9 +3537,30 @@ static struct buffer_head *ext4_get_first_dir_block(handle_t *handle, *retval = PTR_ERR(bh); return NULL; } - *parent_de = ext4_next_entry( - (struct ext4_dir_entry_2 *)bh->b_data, - inode->i_sb->s_blocksize); + + de = (struct ext4_dir_entry_2 *) bh->b_data; + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, 0) || + le32_to_cpu(de->inode) != inode->i_ino || + strcmp(".", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '.'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + offset = ext4_rec_len_from_disk(de->rec_len, + inode->i_sb->s_blocksize); + de = ext4_next_entry(de, inode->i_sb->s_blocksize); + if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, + bh->b_size, offset) || + le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) { + EXT4_ERROR_INODE(inode, "directory missing '..'"); + brelse(bh); + *retval = -EFSCORRUPTED; + return NULL; + } + *parent_de = de; + return bh; }
From: Ye Bin yebin10@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I58A7W?from=project-issue CVE: NA
---------------------------
We got the following issue:

EXT4-fs error (device loop0) in ext4_reserve_inode_write:5741: Out of memory
EXT4-fs error (device loop0): ext4_setattr:5462: inode #13: comm syz-executor.0: mark_inode_dirty error
EXT4-fs error (device loop0) in ext4_setattr:5519: Out of memory
EXT4-fs error (device loop0): ext4_ind_map_blocks:595: inode #13: comm syz-executor.0: Can't allocate blocks for non-extent mapped inodes with bigalloc
------------[ cut here ]------------
WARNING: CPU: 1 PID: 4361 at fs/ext4/file.c:301 ext4_file_write_iter+0x11c9/0x1220
Modules linked in:
CPU: 1 PID: 4361 Comm: syz-executor.0 Not tainted 5.10.0+ #1
RIP: 0010:ext4_file_write_iter+0x11c9/0x1220
RSP: 0018:ffff924d80b27c00 EFLAGS: 00010282
RAX: ffffffff815a3379 RBX: 0000000000000000 RCX: 000000003b000000
RDX: ffff924d81601000 RSI: 00000000000009cc RDI: 00000000000009cd
RBP: 000000000000000d R08: ffffffffbc5a2c6b R09: 0000902e0e52a96f
R10: ffff902e2b7c1b40 R11: ffff902e2b7c1b40 R12: 000000000000000a
R13: 0000000000000001 R14: ffff902e0e52aa10 R15: ffffffffffffff8b
FS:  00007f81a7f65700(0000) GS:ffff902e3bc80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffff600400 CR3: 000000012db88001 CR4: 00000000003706e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 do_iter_readv_writev+0x2e5/0x360
 do_iter_write+0x112/0x4c0
 do_pwritev+0x1e5/0x390
 __x64_sys_pwritev2+0x7e/0xa0
 do_syscall_64+0x37/0x50
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The above issue may happen as follows. Assume inode.i_size=4096 and EXT4_I(inode)->i_disksize=4096.
step 1: set inode->i_size to 8192
ext4_setattr
  if (attr->ia_size != inode->i_size)
    EXT4_I(inode)->i_disksize = attr->ia_size;
    rc = ext4_mark_inode_dirty
      ext4_reserve_inode_write
        ext4_get_inode_loc
          __ext4_get_inode_loc
            sb_getblk --> return -ENOMEM
  ...
  if (!error)                     -> i_size will not be updated
    i_size_write(inode, attr->ia_size);
Now: inode.i_size=4096, EXT4_I(inode)->i_disksize=8192
step 2: direct write of 4096 bytes
ext4_file_write_iter
  ext4_dio_write_iter
    iomap_dio_rw -> returns an error
    if (extend)
      ext4_handle_inode_extension
        WARN_ON_ONCE(i_size_read(inode) < EXT4_I(inode)->i_disksize);
        -> triggers the warning
To solve the above issue, if marking the inode dirty fails in ext4_setattr(), simply restore 'EXT4_I(inode)->i_disksize' to its old value.
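A toy user-space model of the fix, with invented names (fake_inode, setattr_size) rather than ext4 internals: snapshot i_disksize before the speculative update and roll it back when marking the inode dirty fails, so i_size can never be left behind i_disksize:

#include <assert.h>
#include <errno.h>
#include <stdbool.h>

struct fake_inode {
	long long i_size;      /* in-memory file size */
	long long i_disksize;  /* size that will be written to disk */
};

/* Stand-in for ext4_mark_inode_dirty(); fails like sb_getblk() under OOM. */
static int mark_inode_dirty(struct fake_inode *inode, bool oom)
{
	(void)inode;
	return oom ? -ENOMEM : 0;
}

static int setattr_size(struct fake_inode *inode, long long new_size, bool oom)
{
	long long old_disksize = inode->i_disksize;
	int rc;

	inode->i_disksize = new_size;
	rc = mark_inode_dirty(inode, oom);
	if (rc) {
		/* The fix: roll back, keeping i_disksize <= i_size. */
		inode->i_disksize = old_disksize;
		return rc;
	}
	inode->i_size = new_size;
	return 0;
}

int main(void)
{
	struct fake_inode inode = { .i_size = 4096, .i_disksize = 4096 };

	/* Step 1 of the report: the size update fails with -ENOMEM ... */
	assert(setattr_size(&inode, 8192, true) == -ENOMEM);
	/* ... but with the fix the sizes stay consistent, so the
	 * WARN_ON_ONCE(i_size < i_disksize) condition cannot trigger. */
	assert(inode.i_size >= inode.i_disksize);
	return 0;
}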
Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Li Nan linan122@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8a48214d97cf..6cf2eecef96b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5361,6 +5361,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; loff_t oldsize = inode->i_size; + loff_t old_disksize; int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { @@ -5434,6 +5435,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) inode->i_sb->s_blocksize_bits);
down_write(&EXT4_I(inode)->i_data_sem); + old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; rc = ext4_mark_inode_dirty(handle, inode); if (!error) @@ -5445,6 +5447,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) */ if (!error) i_size_write(inode, attr->ia_size); + else + EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error)
From: Ye Bin yebin10@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I58A6T?from=project-issue CVE: NA
---------------------------
We got the following issue:

EXT4-fs error (device loop0): ext4_mb_generate_buddy:1141: group 0, block bitmap and bg descriptor inconsistent: 25 vs 31513 free cls
------------[ cut here ]------------
kernel BUG at fs/ext4/inode.c:2708!
invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
CPU: 2 PID: 2147 Comm: rep Not tainted 5.18.0-rc2-next-20220413+ #155
RIP: 0010:ext4_writepages+0x1977/0x1c10
RSP: 0018:ffff88811d3e7880 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffff88811c098000
RDX: 0000000000000000 RSI: ffff88811c098000 RDI: 0000000000000002
RBP: ffff888128140f50 R08: ffffffffb1ff6387 R09: 0000000000000000
R10: 0000000000000007 R11: ffffed10250281ea R12: 0000000000000001
R13: 00000000000000a4 R14: ffff88811d3e7bb8 R15: ffff888128141028
FS:  00007f443aed9740(0000) GS:ffff8883aef00000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000020007200 CR3: 000000011c2a4000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 <TASK>
 do_writepages+0x130/0x3a0
 filemap_fdatawrite_wbc+0x83/0xa0
 filemap_flush+0xab/0xe0
 ext4_alloc_da_blocks+0x51/0x120
 __ext4_ioctl+0x1534/0x3210
 __x64_sys_ioctl+0x12c/0x170
 do_syscall_64+0x3b/0x90
It may happen as follows:

1. write to an inline_data inode
vfs_write
  new_sync_write
    ext4_file_write_iter
      ext4_buffered_write_iter
        generic_perform_write
          ext4_da_write_begin
            ext4_da_write_inline_data_begin
              -> if the inline data area is too small, a block is allocated
                 for the write, so the mapping now has a dirty page
              ext4_da_convert_inline_data_to_extent
                -> clears EXT4_STATE_MAY_INLINE_DATA

2. fallocate
do_vfs_ioctl
  ioctl_preallocate
    vfs_fallocate
      ext4_fallocate
        ext4_convert_inline_data
          ext4_convert_inline_data_nolock
            ext4_map_blocks -> fails, so we go restore the data
            ext4_restore_inline_data
              ext4_create_inline_data
                ext4_write_inline_data
              ext4_set_inode_state -> sets EXT4_STATE_MAY_INLINE_DATA again

3. writepages
__ext4_ioctl
  ext4_alloc_da_blocks
    filemap_flush
      filemap_fdatawrite_wbc
        do_writepages
          ext4_writepages
            if (ext4_has_inline_data(inode))
              BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
The root cause of this issue is that under delayed allocation we do not destroy the inline data until ext4_writepages() is called, but by then the inode may already have been converted from inline data to extents. To solve this issue, call filemap_flush() first to force the writeout.
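A toy model of the repaired state machine, again user-space code with invented names rather than ext4 internals: when inline data is still present but EXT4_STATE_MAY_INLINE_DATA has already been cleared, a conversion is in flight, so force the writeout before deciding whether any conversion work is left:

#include <assert.h>
#include <stdbool.h>

struct toy_inode {
	bool has_inline_data;        /* like ext4_has_inline_data() */
	bool may_inline_state;       /* like EXT4_STATE_MAY_INLINE_DATA */
	bool mapping_has_dirty_page; /* delay-allocated page not written yet */
};

/* Stand-in for filemap_flush(): writing back the dirty page finishes the
 * pending inline-to-extent conversion and drops the inline data. */
static int toy_filemap_flush(struct toy_inode *inode)
{
	if (inode->mapping_has_dirty_page) {
		inode->mapping_has_dirty_page = false;
		inode->has_inline_data = false;
	}
	return 0;
}

static int toy_convert_inline_data(struct toy_inode *inode)
{
	int error;

	if (!inode->has_inline_data) {
		inode->may_inline_state = false;
		return 0;
	} else if (!inode->may_inline_state) {
		/* The fix: inline data is present but MAY_INLINE_DATA was
		 * cleared, so a conversion is in flight; force writeout. */
		error = toy_filemap_flush(inode);
		if (error)
			return error;
		if (!inode->has_inline_data)
			return 0;
	}
	/* ... the normal conversion path would run here ... */
	inode->has_inline_data = false;
	inode->may_inline_state = false;
	return 0;
}

int main(void)
{
	/* State after step 1 of the report: a buffered write started the
	 * inline-to-extent conversion (MAY_INLINE_DATA cleared) but the
	 * dirty page has not been written back yet. */
	struct toy_inode inode = {
		.has_inline_data = true,
		.may_inline_state = false,
		.mapping_has_dirty_page = true,
	};

	assert(toy_convert_inline_data(&inode) == 0);
	/* No inline data left, so the writepages BUG_ON condition
	 * (inline data present with MAY_INLINE_DATA set) cannot occur. */
	assert(!inode.has_inline_data);
	return 0;
}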
Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Li Nan linan122@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/inline.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index c8f19901a44b..bf70efd24519 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1981,6 +1981,18 @@ int ext4_convert_inline_data(struct inode *inode) if (!ext4_has_inline_data(inode)) { ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return 0; + } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { + /* + * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is + * cleared. This means we are in the middle of moving of + * inline data to delay allocated block. Just force writeout + * here to finish conversion. + */ + error = filemap_flush(inode->i_mapping); + if (error) + return error; + if (!ext4_has_inline_data(inode)) + return 0; }
needed_blocks = ext4_writepage_trans_blocks(inode);