Terrace Service Acceleration.
Liu Jian (1): bpf: Add new bpf helper to get SO_ORIGINAL_DST/REPLY_SRC
Lu Wei (1): net: core: Add a GID field to struct sock.
Wang Yufen (1): bpf, sockmap: Add sk_rmem_alloc check for sockmap
Zhengchao Shao (1): bpf: Add bpf_get_sockops_uid_gid helper function
include/net/netfilter/nf_conntrack.h | 6 +++ include/net/sock.h | 18 +++++++ include/uapi/linux/bpf.h | 2 + include/uapi/linux/netfilter_ipv4.h | 2 + net/Kconfig | 7 +++ net/core/filter.c | 64 +++++++++++++++++++++++++ net/core/skmsg.c | 10 +++- net/core/sock.c | 6 +++ net/ipv4/tcp_bpf.c | 10 ++++ net/netfilter/nf_conntrack_proto.c | 71 ++++++++++++++++++++++++++++ net/socket.c | 8 +++- 11 files changed, 201 insertions(+), 3 deletions(-)
反馈: 您发送到kernel@openeuler.org的补丁/补丁集,已成功转换为PR! PR链接地址: https://gitee.com/openeuler/kernel/pulls/3690 邮件列表地址:https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/B...
Feedback: The patch(es) you sent to the kernel@openeuler.org mailing list have been successfully converted to a pull request. Pull request link: https://gitee.com/openeuler/kernel/pulls/3690 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/B...
From: Lu Wei luwei32@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KU3B CVE: NA
--------------------------------
Both the UID and the GID are needed as filters for sockmap, but only the UID can currently be obtained from struct sock. This patch adds a GID field (sk_gid) to struct sock, maintained in the same places as the existing sk_uid.
Signed-off-by: Lu Wei luwei32@huawei.com Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com --- include/net/sock.h | 18 ++++++++++++++++++ net/Kconfig | 7 +++++++ net/core/sock.c | 6 ++++++ net/socket.c | 8 ++++++-- 4 files changed, 37 insertions(+), 2 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h index 7753354d59c0..873b81ceee5f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -300,6 +300,7 @@ struct sk_filter; * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner + * @sk_gid: group id of owner * @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_busy_poll_budget: napi processing budget when busypolling * @sk_priority: %SO_PRIORITY setting @@ -545,6 +546,13 @@ struct sock { struct rcu_head sk_rcu; netns_tracker ns_tracker; struct hlist_node sk_bind2_node; + +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + union { + kgid_t sk_gid; + u64 sk_gid_padding; + }; +#endif };
enum sk_pacing { @@ -2116,6 +2124,9 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = SOCK_INODE(parent)->i_gid; +#endif security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } @@ -2129,6 +2140,13 @@ static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) return sk ? sk->sk_uid : make_kuid(net->user_ns, 0); }
+#if IS_ENABLED(CONFIG_NETACC_TERRACE) +static inline kgid_t sock_net_gid(const struct net *net, const struct sock *sk) +{ + return sk ? sk->sk_gid : make_kgid(net->user_ns, 0); +} +#endif + static inline u32 net_tx_rndhash(void) { u32 v = get_random_u32(); diff --git a/net/Kconfig b/net/Kconfig index 7fbd17e188a5..c976c72de26b 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -514,4 +514,11 @@ config NETACC_BPF help Network acceleration in bpf.
+config NETACC_TERRACE + bool "Terrace Service Acceleration" + default y + help + Accelerating intra-node communication on the data plane of the + Terrace service. + endif # if NET diff --git a/net/core/sock.c b/net/core/sock.c index bfaf47b3f3c7..c77326a07906 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3426,8 +3426,14 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) sk->sk_type = sock->type; RCU_INIT_POINTER(sk->sk_wq, &sock->wq); sock->sk = sk; +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = SOCK_INODE(sock)->i_gid; +#endif } else { RCU_INIT_POINTER(sk->sk_wq, NULL); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sk->sk_gid = make_kgid(sock_net(sk)->user_ns, 0); +#endif } sk->sk_uid = uid;
diff --git a/net/socket.c b/net/socket.c index c4a6f5532955..84d42997abaf 100644 --- a/net/socket.c +++ b/net/socket.c @@ -604,10 +604,14 @@ static int sockfs_setattr(struct mnt_idmap *idmap, if (!err && (iattr->ia_valid & ATTR_UID)) { struct socket *sock = SOCKET_I(d_inode(dentry));
- if (sock->sk) + if (sock->sk) { sock->sk->sk_uid = iattr->ia_uid; - else +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + sock->sk->sk_gid = iattr->ia_gid; +#endif + } else { err = -ENOENT; + } }
return err;
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KU3B CVE: NA
--------------------------------
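Add a function that lets the bpf sock_ops hook retrieve a socket's UID and GID. It is reached through the sock_ops getsockopt helper with level SOL_IP and optname SK_BPF_GID_UID; the output buffer must be at least 8 bytes and receives the GID followed by the UID, each as a u32.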
Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4924f0cde1bc..fd9e24be0956 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -6898,6 +6898,8 @@ enum { TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ };
+#define SK_BPF_GID_UID 18000 + enum { BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), }; diff --git a/net/core/filter.c b/net/core/filter.c index d6905153cba2..a48ed6f36ddf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5576,6 +5576,27 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, return ret; }
+#if IS_ENABLED(CONFIG_NETACC_TERRACE) +static int bpf_sock_ops_get_uid_gid(struct bpf_sock_ops_kern *bpf_sock, + char *optval, int optlen) +{ + struct sock *sk = bpf_sock->sk; + kuid_t uid; + kgid_t gid; + + if (!sk || !sk_fullsock(sk) || optlen < sizeof(u64)) + return -EINVAL; + + uid = sock_net_uid(sock_net(sk), sk); + gid = sock_net_gid(sock_net(sk), sk); + + *(u32 *)optval = from_kgid_munged(sock_net(sk)->user_ns, gid); + *((u32 *)optval + 1) = from_kuid_munged(sock_net(sk)->user_ns, uid); + + return sizeof(u64); +} +#endif + BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5600,6 +5621,10 @@ BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock,
return ret; } +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + if (level == SOL_IP && optname == SK_BPF_GID_UID) + return bpf_sock_ops_get_uid_gid(bpf_sock, optval, optlen); +#endif
return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); }
From: Liu Jian liujian56@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KU3B CVE: NA
--------------------------------
Add two new optnames (BPF_SO_ORIGINAL_DST = 800 and BPF_SO_REPLY_SRC = 801) that allow bpf programs to get the original destination address and the reply source address of a connection. Only IPv4 is supported for now.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com --- include/net/netfilter/nf_conntrack.h | 6 +++ include/uapi/linux/netfilter_ipv4.h | 2 + net/core/filter.c | 39 +++++++++++++++ net/netfilter/nf_conntrack_proto.c | 71 ++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 4085765c3370..bb7fd376a8c5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -377,4 +377,10 @@ int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb, #define MODULE_ALIAS_NFCT_HELPER(helper) \ MODULE_ALIAS("nfct-helper-" helper)
+#if IS_ENABLED(CONFIG_NETACC_TERRACE) +typedef int (*bpf_getorigdst_opt_func)(struct sock *sk, int optname, + void *optval, int *optlen, int dir); +extern bpf_getorigdst_opt_func bpf_getorigdst_opt; +#endif + #endif /* _NF_CONNTRACK_H */ diff --git a/include/uapi/linux/netfilter_ipv4.h b/include/uapi/linux/netfilter_ipv4.h index 155e77d6a42d..00e78cc2782b 100644 --- a/include/uapi/linux/netfilter_ipv4.h +++ b/include/uapi/linux/netfilter_ipv4.h @@ -50,6 +50,8 @@ enum nf_ip_hook_priorities { /* 2.2 firewalling (+ masq) went from 64 through 76 */ /* 2.4 firewalling went 64 through 67. */ #define SO_ORIGINAL_DST 80 +#define BPF_SO_ORIGINAL_DST 800 +#define BPF_SO_REPLY_SRC 801
#endif /* _UAPI__LINUX_IP_NETFILTER_H */ diff --git a/net/core/filter.c b/net/core/filter.c index a48ed6f36ddf..c9174667a1ed 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5577,6 +5577,12 @@ static int bpf_sock_ops_get_syn(struct bpf_sock_ops_kern *bpf_sock, }
#if IS_ENABLED(CONFIG_NETACC_TERRACE) +#include <net/netfilter/nf_conntrack.h> +#include <linux/netfilter_ipv4.h> + +bpf_getorigdst_opt_func bpf_getorigdst_opt; +EXPORT_SYMBOL(bpf_getorigdst_opt); + static int bpf_sock_ops_get_uid_gid(struct bpf_sock_ops_kern *bpf_sock, char *optval, int optlen) { @@ -5595,6 +5601,36 @@ static int bpf_sock_ops_get_uid_gid(struct bpf_sock_ops_kern *bpf_sock,
return sizeof(u64); } + +static int bpf_sk_original_addr(struct bpf_sock_ops_kern *bpf_sock, + int optname, char *optval, int optlen) +{ + struct sock *sk = bpf_sock->sk; + int ret = -EINVAL; + + if (!sk_fullsock(sk)) + goto err_clear; + + if (!bpf_getorigdst_opt) + goto err_clear; + +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + if (optname == BPF_SO_ORIGINAL_DST) + ret = bpf_getorigdst_opt(sk, optname, optval, &optlen, + IP_CT_DIR_ORIGINAL); + else + ret = bpf_getorigdst_opt(sk, optname, optval, &optlen, + IP_CT_DIR_REPLY); + if (ret < 0) + goto err_clear; + return ret; +#endif + +err_clear: + memset(optval, 0, optlen); + return ret; +} + #endif
BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, @@ -5624,6 +5660,9 @@ BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, #if IS_ENABLED(CONFIG_NETACC_TERRACE) if (level == SOL_IP && optname == SK_BPF_GID_UID) return bpf_sock_ops_get_uid_gid(bpf_sock, optval, optlen); + else if (level == SOL_IP && (optname == BPF_SO_ORIGINAL_DST || + optname == BPF_SO_REPLY_SRC)) + return bpf_sk_original_addr(bpf_sock, optname, optval, optlen); #endif
return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index c928ff63b10e..37f1ae00497a 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -311,6 +311,69 @@ getorigdst(struct sock *sk, int optval, void __user *user, int *len) return -ENOENT; }
+#if IS_ENABLED(CONFIG_NETACC_TERRACE) +static int +bpf_getorigdst_impl(struct sock *sk, int optval, void *user, int *len, int dir) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + + memset(&tuple, 0, sizeof(tuple)); + + tuple.src.u3.ip = inet->inet_rcv_saddr; + tuple.src.u.tcp.port = inet->inet_sport; + tuple.dst.u3.ip = inet->inet_daddr; + tuple.dst.u.tcp.port = inet->inet_dport; + tuple.src.l3num = PF_INET; + tuple.dst.protonum = sk->sk_protocol; + + /* We only do TCP and SCTP at the moment: is there a better way? */ + if (tuple.dst.protonum != IPPROTO_TCP && + tuple.dst.protonum != IPPROTO_SCTP) { + pr_debug("SO_ORIGINAL_DST: Not a TCP/SCTP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int)*len < sizeof(struct sockaddr_in)) { + pr_debug("SO_ORIGINAL_DST: len %d not %zu\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = nf_conntrack_find_get(sock_net(sk), &nf_ct_zone_dflt, &tuple); + if (h) { + struct sockaddr_in sin; + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + if (dir == IP_CT_DIR_REPLY) { + sin.sin_port = ct->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_REPLY] + .tuple.src.u3.ip; + } else { + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u3.ip; + } + memset(sin.sin_zero, 0, sizeof(sin.sin_zero)); + + pr_debug("SO_ORIGINAL_DST: %pI4 %u\n", + &sin.sin_addr.s_addr, ntohs(sin.sin_port)); + nf_ct_put(ct); + + memcpy(user, &sin, sizeof(sin)); + return 0; + } + pr_debug("SO_ORIGINAL_DST: Can't find %pI4/%u-%pI4/%u.\n", + &tuple.src.u3.ip, ntohs(tuple.src.u.tcp.port), + &tuple.dst.u3.ip, ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} +#endif + static struct nf_sockopt_ops so_getorigdst = { .pf = PF_INET, .get_optmin = SO_ORIGINAL_DST, @@ -655,6 +718,10 @@ int 
nf_conntrack_proto_init(void) goto cleanup_sockopt; #endif
+#if IS_ENABLED(CONFIG_NETACC_TERRACE) + bpf_getorigdst_opt = bpf_getorigdst_impl; +#endif + return ret;
#if IS_ENABLED(CONFIG_IPV6) @@ -666,6 +733,10 @@ int nf_conntrack_proto_init(void)
void nf_conntrack_proto_fini(void) { +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + bpf_getorigdst_opt = NULL; +#endif + nf_unregister_sockopt(&so_getorigdst); #if IS_ENABLED(CONFIG_IPV6) nf_unregister_sockopt(&so_getorigdst6);
From: Wang Yufen wangyufen@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I8KU3B
--------------------------------
Consider a TCP socket in a sockmap. If the sender transmits packets very quickly while the receiver consumes them very slowly, a large number of packets pile up in the ingress queue on the receiving side. As a result, memory is exhausted and the system hits OOM.
To fix this, we increase sk_rmem_alloc when an sk_msg is queued on the ingress queue, decrease sk_rmem_alloc when the sk_msg is dequeued from the ingress queue, and check sk_rmem_alloc against sk_rcvbuf at the beginning of bpf_tcp_ingress().
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Zhengchao Shao shaozhengchao@huawei.com --- net/core/skmsg.c | 10 +++++++++- net/ipv4/tcp_bpf.c | 10 ++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 93ecfceac1bc..54a8300b4b3e 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -444,8 +444,12 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (likely(!peek)) { sge->offset += copy; sge->length -= copy; - if (!msg_rx->skb) + if (!msg_rx->skb) { +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + atomic_sub(copy, &sk->sk_rmem_alloc); +#endif sk_mem_uncharge(sk, copy); + } msg_rx->sg.size -= copy;
if (!sge->length) { @@ -771,6 +775,10 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
list_for_each_entry_safe(msg, tmp, &psock->ingress_msg, list) { list_del(&msg->list); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + if (!msg->skb) + atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc); +#endif sk_msg_free(psock->sk, msg); kfree(msg); } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 53b0d62fd2c2..835e65ae361e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -43,6 +43,13 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, return -ENOMEM;
lock_sock(sk); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) { + kfree(tmp); + release_sock(sk); + return -EAGAIN; + } +#endif tmp->sg.start = msg->sg.start; i = msg->sg.start; do { @@ -75,6 +82,9 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; sk_psock_queue_msg(psock, tmp); +#if IS_ENABLED(CONFIG_NETACC_TERRACE) + atomic_add(tmp->sg.size, &sk->sk_rmem_alloc); +#endif sk_psock_data_ready(sk, psock); } else { sk_msg_free(sk, tmp);