From: Kuniyuki Iwashima kuniyu@amazon.co.jp
mainline inclusion from mainline-v5.17-rc1 commit afd20b9290e184c203fe22f2d6b80dc7127ba724 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4OM1C CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
The hash table of AF_UNIX sockets is protected by the single lock. This patch replaces it with per-hash locks.
The effect is noticeable when we handle multiple sockets simultaneously. Here is a test result on an EC2 c5.24xlarge instance. It shows latency (under 10us only) in unix_insert_unbound_socket() while 64 CPUs creating 1024 sockets for each in parallel.
Without this patch:
nsec : count distribution 0 : 179 | | 500 : 3021 |********* | 1000 : 6271 |******************* | 1500 : 6318 |******************* | 2000 : 5828 |***************** | 2500 : 5124 |*************** | 3000 : 4426 |************* | 3500 : 3672 |*********** | 4000 : 3138 |********* | 4500 : 2811 |******** | 5000 : 2384 |******* | 5500 : 2023 |****** | 6000 : 1954 |***** | 6500 : 1737 |***** | 7000 : 1749 |***** | 7500 : 1520 |**** | 8000 : 1469 |**** | 8500 : 1394 |**** | 9000 : 1232 |*** | 9500 : 1138 |*** | 10000 : 994 |*** |
With this patch:
nsec : count distribution 0 : 1634 |**** | 500 : 13170 |****************************************| 1000 : 13156 |*************************************** | 1500 : 9010 |*************************** | 2000 : 6363 |******************* | 2500 : 4443 |************* | 3000 : 3240 |********* | 3500 : 2549 |******* | 4000 : 1872 |***** | 4500 : 1504 |**** | 5000 : 1247 |*** | 5500 : 1035 |*** | 6000 : 889 |** | 6500 : 744 |** | 7000 : 634 |* | 7500 : 498 |* | 8000 : 433 |* | 8500 : 355 |* | 9000 : 336 |* | 9500 : 284 | | 10000 : 243 | |
Signed-off-by: Kuniyuki Iwashima kuniyu@amazon.co.jp Signed-off-by: Jakub Kicinski kuba@kernel.org
Conflicts: net/unix/af_unix.c
Signed-off-by: Baisong Zhong zhongbaisong@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 98 ++++++++++++++++++++++++++----------------- net/unix/diag.c | 20 ++++----- 3 files changed, 71 insertions(+), 49 deletions(-)
diff --git a/include/net/af_unix.h b/include/net/af_unix.h index beb6ef56bf85..ae41d8ee970a 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -20,7 +20,7 @@ struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_BITS 8
extern unsigned int unix_tot_inflight; -extern spinlock_t unix_table_lock; +extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
struct unix_address { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 552f95c7d9e1..b497dee9b2a4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,14 +116,14 @@
#include "scm.h"
+spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +EXPORT_SYMBOL_GPL(unix_table_locks); struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); -DEFINE_SPINLOCK(unix_table_lock); -EXPORT_SYMBOL_GPL(unix_table_lock); static atomic_long_t unix_nr_socks;
/* SMP locking strategy: - * hash table is protected with spinlock unix_table_lock + * hash table is protected with spinlock unix_table_locks * each socket state is protected by separate spin lock. */
@@ -156,6 +156,25 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, return hash & (UNIX_HASH_SIZE - 1); }
+static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +{ + /* hash1 and hash2 is never the same because + * one is between 0 and UNIX_HASH_SIZE - 1, and + * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. + */ + if (hash1 > hash2) + swap(hash1, hash2); + + spin_lock(&unix_table_locks[hash1]); + spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); +} + +static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +{ + spin_unlock(&unix_table_locks[hash1]); + spin_unlock(&unix_table_locks[hash2]); +} + #ifdef CONFIG_SECURITY_NETWORK static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb) { @@ -297,16 +316,16 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr,
static void unix_remove_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_remove_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); }
static void unix_insert_unbound_socket(struct sock *sk) { - spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[sk->sk_hash]); __unix_insert_socket(sk); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[sk->sk_hash]); }
static struct sock *__unix_find_socket_byname(struct net *net, @@ -334,11 +353,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s;
- spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); - spin_unlock(&unix_table_lock); + spin_unlock(&unix_table_locks[hash]); return s; }
@@ -347,19 +366,18 @@ static struct sock *unix_find_socket_byinode(struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s;
- spin_lock(&unix_table_lock); + spin_lock(&unix_table_locks[hash]); sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry;
if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - goto found; + spin_unlock(&unix_table_locks[hash]); + return s; } } - s = NULL; -found: - spin_unlock(&unix_table_lock); - return s; + spin_unlock(&unix_table_locks[hash]); + return NULL; }
/* Support code for asymmetrically connected dgram sockets @@ -1010,11 +1028,11 @@ static struct sock *unix_find_other(struct net *net,
static int unix_autobind(struct sock *sk) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; unsigned int retries = 0; static u32 ordernum = 1; - unsigned int new_hash; int err;
err = mutex_lock_interruptible(&u->bindlock); @@ -1038,12 +1056,13 @@ static int unix_autobind(struct sock *sk) offsetof(struct sockaddr_un, sun_path) + 1;
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); ordernum = (ordernum+1)&0xFFFFF;
if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) { - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); + /* * __unix_find_socket_byname() may take long time if many names * are already in use. @@ -1059,7 +1078,7 @@ static int unix_autobind(struct sock *sk) }
__unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = 0;
out: mutex_unlock(&u->bindlock); @@ -1071,9 +1090,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, { umode_t mode = S_IFSOCK | (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; struct dentry *dentry; struct path parent; int err; @@ -1111,11 +1130,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock;
new_hash = unix_bsd_hash(d_backing_inode(dentry)); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1136,9 +1155,9 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, int addr_len) { + unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); struct unix_address *addr; - unsigned int new_hash; int err;
addr = unix_create_addr(sunaddr, addr_len); @@ -1155,19 +1174,19 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, }
new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - spin_lock(&unix_table_lock); + unix_table_double_lock(old_hash, new_hash);
if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, new_hash)) goto out_spin;
__unix_set_addr_hash(sk, addr, new_hash); - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); mutex_unlock(&u->bindlock); return 0;
out_spin: - spin_unlock(&unix_table_lock); + unix_table_double_unlock(old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -1469,9 +1488,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_lock. Insertion + * otheru in hash under unix_table_locks. Insertion * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_lock, + * in an earlier critical area protected by unix_table_locks, * the same one where we'd set *(otheru->addr) contents, * as well as otheru->path and otheru->addr itself. * @@ -2900,7 +2919,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
#define get_bucket(x) ((x) >> BUCKET_SPACE) -#define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1)) +#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) @@ -2924,7 +2943,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, struct sock *sk, loff_t *pos) { - unsigned long bucket; + unsigned long bucket = get_bucket(*pos);
while (sk > (struct sock *)SEQ_START_TOKEN) { sk = sk_next(sk); @@ -2935,12 +2954,13 @@ static struct sock *unix_next_socket(struct seq_file *seq, }
do { + spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk;
next_bucket: - bucket = get_bucket(*pos) + 1; + spin_unlock(&unix_table_locks[bucket++]); *pos = set_bucket_offset(bucket, 1); } while (bucket < ARRAY_SIZE(unix_socket_table));
@@ -2948,10 +2968,7 @@ static struct sock *unix_next_socket(struct seq_file *seq, }
static void *unix_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(unix_table_lock) { - spin_lock(&unix_table_lock); - if (!*pos) return SEQ_START_TOKEN;
@@ -2968,9 +2985,11 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) }
static void unix_seq_stop(struct seq_file *seq, void *v) - __releases(unix_table_lock) { - spin_unlock(&unix_table_lock); + struct sock *sk = v; + + if (sk) + spin_unlock(&unix_table_locks[sk->sk_hash]); }
static int unix_seq_show(struct seq_file *seq, void *v) @@ -2995,7 +3014,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), sock_i_ino(s));
- if (u->addr) { // under unix_table_lock here + if (u->addr) { // under unix_table_locks here int i, len; seq_putc(seq, ' ');
@@ -3067,10 +3086,13 @@ static struct pernet_operations unix_net_ops = {
static int __init af_unix_init(void) { - int rc = -1; + int i, rc = -1;
BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
+ for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + spin_lock_init(&unix_table_locks[i]); + rc = proto_register(&unix_proto, 1); if (rc != 0) { pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__); diff --git a/net/unix/diag.c b/net/unix/diag.c index dd77e81b41a0..c522ab94d0c8 100644 --- a/net/unix/diag.c +++ b/net/unix/diag.c @@ -13,7 +13,7 @@
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) { - /* might or might not have unix_table_lock */ + /* might or might not have unix_table_locks */ struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
if (!addr) @@ -204,13 +204,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) s_slot = cb->args[0]; num = s_num = cb->args[1];
- spin_lock(&unix_table_lock); for (slot = s_slot; slot < ARRAY_SIZE(unix_socket_table); s_num = 0, slot++) { struct sock *sk;
num = 0; + spin_lock(&unix_table_locks[slot]); sk_for_each(sk, &unix_socket_table[slot]) { if (!net_eq(sock_net(sk), net)) continue; @@ -221,14 +221,16 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) if (sk_diag_dump(sk, skb, req, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI) < 0) + NLM_F_MULTI) < 0) { + spin_unlock(&unix_table_locks[slot]); goto done; + } next: num++; } + spin_unlock(&unix_table_locks[slot]); } done: - spin_unlock(&unix_table_lock); cb->args[0] = slot; cb->args[1] = num;
@@ -237,21 +239,19 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
static struct sock *unix_lookup_by_ino(unsigned int ino) { - int i; struct sock *sk; + int i;
- spin_lock(&unix_table_lock); for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { + spin_lock(&unix_table_locks[i]); sk_for_each(sk, &unix_socket_table[i]) if (ino == sock_i_ino(sk)) { sock_hold(sk); - spin_unlock(&unix_table_lock); - + spin_unlock(&unix_table_locks[i]); return sk; } + spin_unlock(&unix_table_locks[i]); } - - spin_unlock(&unix_table_lock); return NULL; }