From: Cong Wang cong.wang@bytedance.com
mainline inclusion from mainline-v5.13-rc1 commit 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I545NW
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3
--------------------------------
We do not have to lock the sock to avoid losing sk_socket; instead, we can purge all the ingress queues when we close the socket. Sending or receiving packets after orphaning the socket makes no sense.
We do purge these queues when the psock refcnt reaches zero, but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions between testing the SK_PSOCK_TX_ENABLED bit and queuing/canceling the psock work; we can expand the scope of psock->ingress_lock a bit to protect them too.
As noticed by John, we still have to lock psock->work (with the new psock->work_mutex), because the same work item could be running concurrently on different CPUs.
Signed-off-by: Cong Wang cong.wang@bytedance.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: John Fastabend john.fastabend@gmail.com Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com Signed-off-by: Wang Yufen wangyufen@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Liu Jian liujian56@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/skmsg.h | 3 ++- net/core/skmsg.c | 50 +++++++++++++++++++++++++++++-------------- net/core/sock_map.c | 1 + 3 files changed, 37 insertions(+), 17 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 2324cd5208e0..31ba9d43ffd1 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -108,6 +108,7 @@ struct sk_psock { void (*saved_close)(struct sock *sk, long timeout); void (*saved_write_space)(struct sock *sk); struct proto *sk_proto; + struct mutex work_mutex; struct sk_psock_work_state work_state; struct work_struct work; union { @@ -361,6 +362,7 @@ static inline void sk_psock_report_error(struct sk_psock *psock, int err) }
struct sk_psock *sk_psock_init(struct sock *sk, int node); +void sk_psock_stop(struct sk_psock *psock, bool wait);
int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock); void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock); @@ -451,7 +453,6 @@ static inline struct sk_psock *sk_psock_get(struct sock *sk) return psock; }
-void sk_psock_stop(struct sock *sk, struct sk_psock *psock); void sk_psock_drop(struct sock *sk, struct sk_psock *psock);
static inline void sk_psock_put(struct sock *sk, struct sk_psock *psock) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 34c356988176..7af259e92a0c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -504,7 +504,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -518,8 +518,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret;
- /* Lock sock to avoid losing sk_socket during loop. */ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -535,7 +534,7 @@ static void sk_psock_backlog(struct work_struct *work) ingress = tcp_skb_bpf_ingress(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -559,7 +558,7 @@ static void sk_psock_backlog(struct work_struct *work) kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); }
struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -597,6 +596,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock);
INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); @@ -637,12 +637,10 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } }
-static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { skb_queue_purge(&psock->ingress_skb); - spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); - spin_unlock_bh(&psock->ingress_lock); }
static void sk_psock_link_destroy(struct sk_psock *psock) @@ -655,6 +653,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } }
+void sk_psock_stop(struct sk_psock *psock, bool wait) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + + if (wait) + cancel_work_sync(&psock->work); +} + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); @@ -666,12 +676,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) strp_done(&psock->parser.strp);
cancel_work_sync(&psock->work); + mutex_destroy(&psock->work_mutex);
psock_progs_drop(&psock->progs);
sk_psock_link_destroy(psock); sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock);
if (psock->sk_redir) sock_put(psock->sk_redir); @@ -689,8 +699,7 @@ static void sk_psock_destroy(struct rcu_head *rcu)
void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); @@ -700,7 +709,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) else if (psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED);
call_rcu(&psock->rcu, sk_psock_destroy); } @@ -786,14 +794,20 @@ static int sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return -EIO; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return -EIO; }
skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); return 0; }
@@ -859,8 +873,12 @@ static int sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 4534cd6be0c8..8525c11dbc3a 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1559,6 +1559,7 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); release_sock(sk); saved_close(sk, timeout); }