From: Kuniyuki Iwashima kuniyu@amazon.co.jp
[ Upstream commit efc6b6f6c3113e8b203b9debfb72d81e0f3dcace ]
Currently, SO_REUSEPORT does not work well if connected sockets are in a UDP reuseport group.
Then reuseport_has_conns() returns true and the result of reuseport_select_sock() is discarded. Also, unconnected sockets have the same score, hence only does the first unconnected socket in udp_hslot always receive all packets sent to unconnected sockets.
So, the result of reuseport_select_sock() should be used for load balancing.
The noteworthy point is that the unconnected sockets placed after connected sockets in sock_reuseport.socks will receive more packets than others because of the algorithm in reuseport_select_sock().
index | connected | reciprocal_scale | result --------------------------------------------- 0 | no | 20% | 40% 1 | no | 20% | 20% 2 | yes | 20% | 0% 3 | no | 20% | 40% 4 | yes | 20% | 0%
If most of the sockets are connected, this can be a problem, but it still works better than now.
Fixes: acdcecc61285 ("udp: correct reuseport selection with connected sockets") CC: Willem de Bruijn willemb@google.com Reviewed-by: Benjamin Herrenschmidt benh@amazon.com Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.co.jp Acked-by: Willem de Bruijn willemb@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org --- net/ipv4/udp.c | 15 +++++++++------ net/ipv6/udp.c | 15 +++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d32c9074ea65..1d27d3b24a5e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -430,7 +430,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result; + struct sock *sk, *result, *reuseport_result; int score, badness; u32 hash = 0;
@@ -440,17 +440,20 @@ static struct sock *udp4_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { + reuseport_result = NULL; + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { hash = udp_ehashfn(net, daddr, hnum, saddr, sport); - result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (result && !reuseport_has_conns(sk, false)) - return result; + reuseport_result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (reuseport_result && !reuseport_has_conns(sk, false)) + return reuseport_result; } + + result = reuseport_result ? : sk; badness = score; - result = sk; } } return result; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c122acdb115d..6609f7b6994a 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -163,7 +163,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, int dif, int sdif, bool exact_dif, struct udp_hslot *hslot2, struct sk_buff *skb) { - struct sock *sk, *result; + struct sock *sk, *result, *reuseport_result; int score, badness; u32 hash = 0;
@@ -173,17 +173,20 @@ static struct sock *udp6_lib_lookup2(struct net *net, score = compute_score(sk, net, saddr, sport, daddr, hnum, dif, sdif, exact_dif); if (score > badness) { + reuseport_result = NULL; + if (sk->sk_reuseport && sk->sk_state != TCP_ESTABLISHED) { hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
- result = reuseport_select_sock(sk, hash, skb, - sizeof(struct udphdr)); - if (result && !reuseport_has_conns(sk, false)) - return result; + reuseport_result = reuseport_select_sock(sk, hash, skb, + sizeof(struct udphdr)); + if (reuseport_result && !reuseport_has_conns(sk, false)) + return reuseport_result; } - result = sk; + + result = reuseport_result ? : sk; badness = score; } }