hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add BPF attach points (GNET_TCP_RECVMSG, GNET_SK_DST_SET, GNET_RCV_NIC_NODE, GNET_SEND_NIC_NODE) that expose per-socket network relationships (peer task and NIC NUMA-node traffic) to BPF programs. Add the sysctl net.core.numa_rship_ms to control the reporting frequency, and add the bpf_sched_net_rship_submit() BPF helper to submit the relationship to the scheduler subsystem.
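A minimal example of a program for the new gnet/tcp_recvmsg attach point
might look like the sketch below (illustrative only, not part of this
patch; the net_relationship_req field names are assumptions, only the
bpf_gnet_ctx fields and the helper come from this patch):

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  SEC("gnet/tcp_recvmsg")
  int report_tcp_rship(struct bpf_gnet_ctx *ctx)
  {
          /* layout comes from linux/sched/relationship.h; fields assumed */
          struct net_relationship_req req = {};

          req.rx_tid = ctx->curr_tid;       /* hypothetical field names */
          req.tx_tid = ctx->peer_tid;
          req.rxtx_bytes = ctx->rxtx_bytes;

          /* hand the observed relationship to the scheduler subsystem */
          return bpf_sched_net_rship_submit(&req, sizeof(req), 0);
  }

  char _license[] SEC("license") = "GPL";

The reporting interval can then be tuned at run time through
net.core.numa_rship_ms (in milliseconds, clamped to the 100ms..100s
range defined in sysctl_net_core.c).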
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 include/linux/filter.h         |  17 +-
 include/linux/skbuff.h         |   4 +
 include/net/net_rship.h        | 329 +++++++++++++++++++++++++++++++++
 include/net/sock.h             |   4 +
 include/uapi/linux/bpf.h       |  19 +-
 init/Kconfig                   |   1 +
 kernel/bpf/syscall.c           |   5 +-
 net/core/dev.c                 |   4 +
 net/core/filter.c              |  62 +++++++
 net/core/skbuff.c              |  17 +-
 net/core/sock.c                |  21 +++
 net/core/sysctl_net_core.c     |  18 ++
 net/ipv4/tcp.c                 |   6 +
 net/ipv4/tcp_output.c          |   3 +
 tools/bpf/bpftool/prog.c       |   5 +-
 tools/include/uapi/linux/bpf.h |  23 ++-
 tools/lib/bpf/libbpf.c         |  10 +-
 17 files changed, 539 insertions(+), 9 deletions(-)
 create mode 100644 include/net/net_rship.h
diff --git a/include/linux/filter.h b/include/linux/filter.h index 10901c4f5b20..a9ae21f08ce2 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1477,11 +1477,21 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol, #ifdef CONFIG_BPF_NET_GLOBAL_PROG struct bpf_gnet_ctx_kern { struct sock *sk; + int curr_tid; + int peer_tid; + int numa_node; + __u64 rxtx_bytes; + int rx_dev_idx; + int rx_dev_queue_idx; + __u64 rx_dev_netns_cookie; };
enum gnet_bpf_attach_type { GNET_BPF_ATTACH_TYPE_INVALID = -1, - GNET_RESERVE0 = 0, + GNET_TCP_RECVMSG = 0, + GNET_SK_DST_SET, + GNET_RCV_NIC_NODE, + GNET_SEND_NIC_NODE, MAX_GNET_BPF_ATTACH_TYPE };
@@ -1492,7 +1502,10 @@ static inline enum gnet_bpf_attach_type to_gnet_bpf_attach_type(enum bpf_attach_type attach_type) { switch (attach_type) { - GNET_ATYPE(GNET_RESERVE0); + GNET_ATYPE(GNET_TCP_RECVMSG); + GNET_ATYPE(GNET_SK_DST_SET); + GNET_ATYPE(GNET_RCV_NIC_NODE); + GNET_ATYPE(GNET_SEND_NIC_NODE); default: return GNET_BPF_ATTACH_TYPE_INVALID; } diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index ce3dfed6b915..469d7e6a2cec 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -924,7 +924,11 @@ struct sk_buff { /* public: */
KABI_USE2(1, __u8 scm_io_uring:1, __u8 local_skb:1) +#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP) + KABI_USE(2, struct sched_net_rship_skb *net_rship) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4)
diff --git a/include/net/net_rship.h b/include/net/net_rship.h new file mode 100644 index 000000000000..ad8af5a5cb9b --- /dev/null +++ b/include/net/net_rship.h @@ -0,0 +1,329 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Common code for task relationship aware + * + * Copyright (C) 2024 Huawei Technologies Co., Ltd + * + */ + +#ifndef __LINUX_NET_RSHIP_H__ +#define __LINUX_NET_RSHIP_H__ + +#include <linux/types.h> +#include <linux/jiffies.h> +#include <linux/socket.h> +#include <linux/sched.h> +#include <linux/timer.h> +#include <linux/net.h> +#include <linux/interrupt.h> +#include <linux/static_key.h> + +#include <linux/netdevice.h> +#include <linux/skbuff.h> +#include <net/sock.h> +#include <linux/filter.h> + +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + +struct sched_net_rship_skb { + /* for loopback traffic */ + pid_t alloc_tid; + + /* for phy nic */ + union { + u32 rx_dev_idx; /* rx */ + int dev_numa_node; /* tx */ + }; + u16 alloc_cpu; + u16 rx_queue_idx; + u64 rx_dev_net_cookie; +}; + +struct sk_buff_fclones_net_rship { + struct sk_buff_fclones fclones; + struct sched_net_rship_skb ext1; + struct sched_net_rship_skb ext2; +}; + +struct sk_buff_net_rship { + struct sk_buff skb; + struct sched_net_rship_skb ext; +}; + +struct sched_net_rship_sock { + /* for loopback traffic */ + pid_t sk_peer_tid; + u64 tid_rx_bytes; + unsigned long last_rx_update; + + /* for recv from phy nic */ + int rcv_numa_node; + u64 rcv_numa_node_bytes; + unsigned long last_rcv_numa_node_update; + + /* for send to phy nic */ + pid_t sk_send_tid; + int send_numa_node; + u64 send_numa_node_bytes; + unsigned long last_send_numa_node_update; +}; +#endif + +#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && defined(CONFIG_BPF_NET_GLOBAL_PROG) + +#define NET_RSHIP_HEAD_RESERVE 40 +extern unsigned long net_numa_rship_jiffies; + +static inline void net_rship_sock_init(struct sock *sk, unsigned int offset) +{ + sk->net_rship = (void *)(((char *)sk) + offset); + memset(sk->net_rship, 0, sizeof(struct sched_net_rship_sock)); + sk->net_rship->rcv_numa_node = NUMA_NO_NODE; + sk->net_rship->send_numa_node = NUMA_NO_NODE; +} + +static inline struct sched_net_rship_skb *__get_skb_net_rship(struct sk_buff *skb) +{ + return skb->net_rship; +} + +static inline bool net_rship_refresh_timeout(unsigned long last_update) +{ + return time_after(jiffies, net_numa_rship_jiffies + last_update); +} + +static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst) +{ + if (!gnet_bpf_enabled(GNET_SK_DST_SET)) + return; + + if (!in_task() || !dst) + return; + + if (dev_to_node(&dst->dev->dev) != NUMA_NO_NODE) { + struct bpf_gnet_ctx_kern ctx = {0}; + + ctx.numa_node = dev_to_node(&dst->dev->dev); + if (sk->net_rship->sk_send_tid) + ctx.curr_tid = sk->net_rship->sk_send_tid; + else + ctx.curr_tid = task_pid_nr(current); + ctx.sk = sk; + run_gnet_bpf(GNET_SK_DST_SET, &ctx); + } +} + +static inline void __net_rship_tcp_rcvmsg(struct sock *sk, pid_t tid) +{ + struct bpf_gnet_ctx_kern ctx = {0}; + + ctx.sk = sk; + ctx.curr_tid = task_pid_nr(current); + ctx.peer_tid = tid; + ctx.rxtx_bytes = sk->net_rship->tid_rx_bytes; + sk->net_rship->last_rx_update = jiffies; + run_gnet_bpf(GNET_TCP_RECVMSG, &ctx); + sk->net_rship->tid_rx_bytes = 0; +} + +static inline void net_rship_tcp_local(struct sock *sk, struct sk_buff *skb) +{ + struct sched_net_rship_skb *ext; + + if (!gnet_bpf_enabled(GNET_TCP_RECVMSG)) + return; + + ext = __get_skb_net_rship(skb); + if (!ext->alloc_tid) + return; + + if (sk->net_rship->sk_peer_tid != 
ext->alloc_tid) { + sk->net_rship->sk_peer_tid = ext->alloc_tid; + sk->net_rship->tid_rx_bytes = skb->len + NET_RSHIP_HEAD_RESERVE; + __net_rship_tcp_rcvmsg(sk, ext->alloc_tid); + } else { + sk->net_rship->tid_rx_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE); + if (net_rship_refresh_timeout(sk->net_rship->last_rx_update)) + __net_rship_tcp_rcvmsg(sk, ext->alloc_tid); + } +} + +static inline void net_rship_recv_nic_node(struct sock *sk, struct sk_buff *skb) +{ + struct sched_net_rship_skb *ext; + + if (!gnet_bpf_enabled(GNET_RCV_NIC_NODE)) + return; + + ext = __get_skb_net_rship(skb); + if (ext->alloc_tid || ext->rx_dev_idx == -1) + return; + + sk->net_rship->rcv_numa_node_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE); + if (net_rship_refresh_timeout(sk->net_rship->last_rcv_numa_node_update)) { + struct bpf_gnet_ctx_kern ctx = {0}; + + ctx.sk = sk; + ctx.curr_tid = task_pid_nr(current); + ctx.numa_node = cpu_to_node(ext->alloc_cpu); + ctx.rxtx_bytes = sk->net_rship->rcv_numa_node_bytes; + ctx.rx_dev_idx = ext->rx_dev_idx; + ctx.rx_dev_queue_idx = skb_get_rx_queue(skb); + ctx.rx_dev_netns_cookie = ext->rx_dev_net_cookie; + run_gnet_bpf(GNET_RCV_NIC_NODE, &ctx); + sk->net_rship->last_rcv_numa_node_update = jiffies; + sk->net_rship->rcv_numa_node_bytes = 0; + } +} + +static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb) +{ + net_rship_tcp_local(sk, skb); + net_rship_recv_nic_node(sk, skb); +} + +static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb) +{ + struct sched_net_rship_skb *ext; + + if (!gnet_bpf_enabled(GNET_SEND_NIC_NODE)) + return; + + ext = __get_skb_net_rship(skb); + if ((ext->dev_numa_node != NUMA_NO_NODE) && + sk->net_rship->sk_send_tid) { + sk->net_rship->send_numa_node_bytes += skb->len; + if (net_rship_refresh_timeout(sk->net_rship->last_send_numa_node_update)) { + struct bpf_gnet_ctx_kern ctx = {0}; + + ctx.sk = sk; + ctx.curr_tid = sk->net_rship->sk_send_tid; + ctx.rxtx_bytes = sk->net_rship->send_numa_node_bytes; + ctx.numa_node = ext->dev_numa_node; + + run_gnet_bpf(GNET_SEND_NIC_NODE, &ctx); + sk->net_rship->send_numa_node_bytes = 0; + sk->net_rship->last_send_numa_node_update = jiffies; + } + } +} + +static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev) +{ + if (gnet_bpf_enabled(GNET_SEND_NIC_NODE)) { + struct sched_net_rship_skb *ext = __get_skb_net_rship(skb); + + ext->dev_numa_node = dev_to_node(&dev->dev); + } +} + +static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev) +{ + if (gnet_bpf_enabled(GNET_RCV_NIC_NODE)) { + struct sched_net_rship_skb *ext = __get_skb_net_rship(skb); + + ext->rx_dev_idx = dev->ifindex; + ext->rx_dev_net_cookie = dev_net(dev)->net_cookie; + } +} + +static inline void __net_rship_skb_clear(struct sched_net_rship_skb *ext) +{ + ext->alloc_tid = 0; + /* dev_name_node and rx_dev_idx */ + ext->dev_numa_node = NUMA_NO_NODE; +} + +static inline void net_rship_skb_clear(struct sk_buff *skb) +{ + struct sched_net_rship_skb *ext = __get_skb_net_rship(skb); + + __net_rship_skb_clear(ext); +} + +static inline void __net_rship_skb_init(struct sk_buff *skb) +{ + __net_rship_skb_clear(skb->net_rship); + skb->net_rship->alloc_cpu = raw_smp_processor_id(); +} + +static inline void net_rship_skb_init(struct sk_buff *skb) +{ + struct sk_buff_net_rship *rskb = (void *)skb; + + skb->net_rship = &rskb->ext; + __net_rship_skb_init(skb); +} + +static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags) +{ + if 
(flags & SKB_ALLOC_FCLONE) { + struct sk_buff_fclones_net_rship *rskbs; + + rskbs = (void *)container_of(skb, struct sk_buff_fclones, skb1); + skb->net_rship = &rskbs->ext1; + rskbs->fclones.skb2.net_rship = &rskbs->ext2; + + __net_rship_skb_init(skb); + __net_rship_skb_init(&rskbs->fclones.skb2); + } else + net_rship_skb_init(skb); +} + +static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb) +{ + n->net_rship->alloc_tid = skb->net_rship->alloc_tid; +} + +/* Make sure it is a process context */ +static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk) +{ + if (gnet_bpf_enabled(GNET_TCP_RECVMSG) || gnet_bpf_enabled(GNET_RCV_NIC_NODE)) { + struct sched_net_rship_skb *ext = __get_skb_net_rship(skb); + + ext->alloc_tid = task_pid_nr(current); + } + if (gnet_bpf_enabled(GNET_SK_DST_SET) || gnet_bpf_enabled(GNET_SEND_NIC_NODE)) + sk->net_rship->sk_send_tid = task_pid_nr(current); +} + +#else + +static inline void net_rship_sock_init(struct sock *sk, unsigned int offset) +{} + +static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst) +{} + +static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb) +{} + +static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb) +{} + +static inline void net_rship_skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue) +{} + +static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev) +{} + +static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev) +{} + +static inline void net_rship_skb_clear(struct sk_buff *skb) +{} + +static inline void net_rship_skb_init(struct sk_buff *skb) +{} + +static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags) +{} + +static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb) +{} + +static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk) +{} +#endif + +#endif diff --git a/include/net/sock.h b/include/net/sock.h index 00051f2558fa..7078c98f9726 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -533,7 +533,11 @@ struct sock { #else KABI_RESERVE(1) #endif +#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP) + KABI_USE(2, struct sched_net_rship_sock *net_rship) +#else KABI_RESERVE(2) +#endif KABI_RESERVE(3) KABI_RESERVE(4) KABI_RESERVE(5) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b4ddcba26377..c086cc287b47 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -246,7 +246,10 @@ enum bpf_attach_type { BPF_XDP, #ifndef __GENKSYMS__ BPF_SCHED, - BPF_GNET_RESERVE0, + BPF_GNET_TCP_RECVMSG, + BPF_GNET_SK_DST_SET, + BPF_GNET_RCV_NIC_NODE, + BPF_GNET_SEND_NIC_NODE, #endif __MAX_BPF_ATTACH_TYPE }; @@ -3922,6 +3925,12 @@ union bpf_attr { * get resource statistics of *nid* and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags) + * Description + * update network's relationship to sched subsystem. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4098,6 +4107,7 @@ union bpf_attr { FN(get_task_relationship_stats),\ FN(sched_set_curr_preferred_node),\ FN(get_node_stats), \ + FN(sched_net_rship_submit), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -5254,6 +5264,13 @@ enum {
struct bpf_gnet_ctx { __bpf_md_ptr(struct bpf_sock *, sk); + int curr_tid; + int peer_tid; + int numa_node; + __u64 rxtx_bytes; + int rx_dev_idx; + int rx_dev_queue_idx; + __u64 rx_dev_netns_cookie; };
#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/init/Kconfig b/init/Kconfig index 758b9988d742..c329e031689c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1084,6 +1084,7 @@ config QOS_SCHED_DYNAMIC_AFFINITY config SCHED_TASK_RELATIONSHIP bool "task relationship" depends on NUMA_BALANCING + select BPF_NET_GLOBAL_PROG default n help This feature enables the scheduler to identify tasks relationship by diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 172d4005c940..7ccdb89b08c7 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3021,7 +3021,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_XDP: return BPF_PROG_TYPE_XDP; #ifdef CONFIG_BPF_NET_GLOBAL_PROG - case BPF_GNET_RESERVE0: + case BPF_GNET_TCP_RECVMSG: + case BPF_GNET_SK_DST_SET: + case BPF_GNET_RCV_NIC_NODE: + case BPF_GNET_SEND_NIC_NODE: return BPF_PROG_TYPE_NET_GLOBAL; #endif default: diff --git a/net/core/dev.c b/net/core/dev.c index 1f1f93aad71c..8e0f4690e157 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -146,6 +146,7 @@ #include <net/devlink.h> #include <linux/pm_runtime.h> #include <linux/prandom.h> +#include <net/net_rship.h>
#include "net-sysfs.h"
@@ -3595,6 +3596,8 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, if (dev_nit_active(dev)) dev_queue_xmit_nit(skb, dev);
+ net_rship_skb_record_dev_numa_node(skb, dev); + len = skb->len; PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies); trace_net_dev_start_xmit(skb, dev); @@ -6197,6 +6200,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) __vlan_hwaccel_clear_tag(skb); skb->dev = napi->dev; skb->skb_iif = 0; + net_rship_skb_record_dev_rxinfo(skb, napi->dev);
/* eth_type_trans() assumes pkt_type is PACKET_HOST */ skb->pkt_type = PACKET_HOST; diff --git a/net/core/filter.c b/net/core/filter.c index a5b497043eda..4f4e832f3e9f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10755,6 +10755,30 @@ static int __init gnet_bpf_init(void) } late_initcall(gnet_bpf_init);
+#include <linux/sched/relationship.h> +BPF_CALL_3(bpf_sched_net_rship_submit, void *, reqbuf, size_t, sz, u64, flags) +{ +#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) + struct net_relationship_req *req = reqbuf; + + if (sz != sizeof(struct net_relationship_req)) + return -EINVAL; + + return sched_net_relationship_submit(req); +#else + return 0; +#endif +} + +const struct bpf_func_proto bpf_sched_net_rship_submit_proto = { + .func = bpf_sched_net_rship_submit, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_MEM, + .arg2_type = ARG_CONST_SIZE, + .arg3_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -10763,6 +10787,8 @@ bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_event_output_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; + case BPF_FUNC_sched_net_rship_submit: + return &bpf_sched_net_rship_submit_proto; default: break; } @@ -10810,6 +10836,42 @@ static u32 bpf_gnet_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct bpf_gnet_ctx_kern, sk)); break; + case offsetof(struct bpf_gnet_ctx, numa_node): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, numa_node), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, numa_node)); + break; + case offsetof(struct bpf_gnet_ctx, curr_tid): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, curr_tid), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, curr_tid)); + break; + case offsetof(struct bpf_gnet_ctx, peer_tid): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, peer_tid), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, peer_tid)); + break; + case offsetof(struct bpf_gnet_ctx, rxtx_bytes): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rxtx_bytes), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, rxtx_bytes)); + break; + case offsetof(struct bpf_gnet_ctx, rx_dev_idx): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_idx), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, rx_dev_idx)); + break; + case offsetof(struct bpf_gnet_ctx, rx_dev_queue_idx): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_queue_idx), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, rx_dev_queue_idx)); + break; + case offsetof(struct bpf_gnet_ctx, rx_dev_netns_cookie): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, + rx_dev_netns_cookie), + si->dst_reg, si->src_reg, + offsetof(struct bpf_gnet_ctx_kern, rx_dev_netns_cookie)); + break; } return insn - insn_buf; } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index b290db716392..0a2578100e27 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -71,6 +71,7 @@ #include <net/mptcp.h> #include <net/page_pool.h> #include <net/tcp_ext.h> +#include <net/net_rship.h>
#include <linux/uaccess.h> #include <trace/events/skb.h> @@ -254,6 +255,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
skb_set_kcov_handle(skb, kcov_common_handle());
+ net_rship_skb_init_flags(skb, flags); out: return skb; nodata: @@ -289,6 +291,7 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
skb_set_kcov_handle(skb, kcov_common_handle());
+ net_rship_skb_init(skb); return skb; }
@@ -485,6 +488,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, skb_success: skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; + net_rship_skb_record_dev_rxinfo(skb, dev);
skb_fail: return skb; @@ -549,6 +553,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, skb_success: skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); skb->dev = napi->dev; + net_rship_skb_record_dev_rxinfo(skb, napi->dev);
skb_fail: return skb; @@ -996,7 +1001,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #ifdef CONFIG_NET_SCHED CHECK_SKB_FIELD(tc_index); #endif - + net_rship_skb_clone(new, (void *)old); }
/* @@ -1476,6 +1481,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) return NULL;
n->fclone = SKB_FCLONE_UNAVAILABLE; + net_rship_skb_init(n); }
return __skb_clone(n, skb); @@ -3428,6 +3434,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) skb_split_inside_header(skb, skb1, len, pos); else /* Second chunk has no header, nothing to copy. */ skb_split_no_header(skb, skb1, len, pos); + net_rship_skb_clone(skb1, skb); } EXPORT_SYMBOL(skb_split);
@@ -4438,14 +4445,22 @@ static void skb_extensions_init(void) {} void __init skb_init(void) { skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache", +#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP) + sizeof(struct sk_buff_net_rship), +#else sizeof(struct sk_buff), +#endif 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, offsetof(struct sk_buff, cb), sizeof_field(struct sk_buff, cb), NULL); skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache", +#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP) + sizeof(struct sk_buff_fclones_net_rship), +#else sizeof(struct sk_buff_fclones), +#endif 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); diff --git a/net/core/sock.c b/net/core/sock.c index 8f0b8e1eaadd..da0c980ad238 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -138,6 +138,7 @@
#include <net/tcp.h> #include <net/busy_poll.h> +#include <net/net_rship.h>
static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); @@ -1676,12 +1677,18 @@ static void sock_copy(struct sock *nsk, const struct sock *osk) const struct proto *prot = READ_ONCE(osk->sk_prot); #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; +#endif +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + void *net_rship = nsk->net_rship; #endif memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + nsk->net_rship = net_rship; +#endif #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); @@ -1702,7 +1709,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (want_init_on_alloc(priority)) sk_prot_clear_nulls(sk, prot->obj_size); } else +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + sk = kmalloc(prot->obj_size + sizeof(struct sched_net_rship_sock), + priority); +#else sk = kmalloc(prot->obj_size, priority); +#endif
if (sk != NULL) { if (security_sk_alloc(sk, family, priority)) @@ -1711,6 +1723,9 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, if (!try_module_get(prot->owner)) goto out_free_sec; sk_tx_queue_clear(sk); +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + net_rship_sock_init(sk, prot->obj_size); +#endif }
return sk; @@ -2045,6 +2060,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + + net_rship_sk_dst_set(sk, dst); } EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -3512,7 +3529,11 @@ int proto_register(struct proto *prot, int alloc_slab)
if (alloc_slab) { prot->slab = kmem_cache_create_usercopy(prot->name, +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + prot->obj_size + sizeof(struct sched_net_rship_sock), 0, +#else prot->obj_size, 0, +#endif SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags, prot->useroffset, prot->usersize, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0dfe9f255ab3..ea0ee32f27bb 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -21,6 +21,7 @@ #include <net/net_ratelimit.h> #include <net/busy_poll.h> #include <net/pkt_sched.h> +#include <net/net_rship.h>
static int two = 2; static int three = 3; @@ -45,6 +46,12 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP +unsigned long net_numa_rship_jiffies __read_mostly = HZ / 10; /* 100ms */ +static unsigned long net_numa_rship_ms_min = HZ / 10; /* 100ms */ +static unsigned long net_numa_rship_ms_max = 100 * HZ; /* 100s */ +#endif + #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -575,6 +582,17 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, +#ifdef CONFIG_SCHED_TASK_RELATIONSHIP + { + .procname = "numa_rship_ms", + .data = &net_numa_rship_jiffies, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_ms_jiffies_minmax, + .extra1 = &net_numa_rship_ms_min, + .extra2 = &net_numa_rship_ms_max, + }, +#endif { } };
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 830d6b2039f5..0f13dc167730 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -279,6 +279,7 @@ #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> +#include <net/net_rship.h>
DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); @@ -884,6 +885,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); skb_shinfo(skb)->tx_flags = 0; memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb)); + net_rship_skb_clear(skb); return skb; } } @@ -1321,6 +1323,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) if (!skb) goto wait_for_space;
+ net_rship_record_sendmsginfo(skb, sk); + process_backlog++; skb->ip_summed = CHECKSUM_PARTIAL;
@@ -2367,6 +2371,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, if (used + offset < skb->len) continue;
+ net_rship_tcp_recvmsg(sk, skb); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d9b50a3addee..762f2009d61d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -39,6 +39,7 @@
#include <net/tcp.h> #include <net/mptcp.h> +#include <net/net_rship.h>
#include <linux/compiler.h> #include <linux/gfp.h> @@ -1196,6 +1197,8 @@ void tcp_wfree(struct sk_buff *skb) */ WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
+ net_rship_send_nic_node(sk, skb); + /* If this softirq is serviced by ksoftirqd, we are likely under stress. * Wait until our queues (qdisc + devices) are drained. * This gives : diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index d9b2fe1c451a..22be05e8dbb4 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -80,7 +80,10 @@ static const char * const attach_type_strings[] = { [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict", [BPF_SK_MSG_VERDICT] = "msg_verdict", [BPF_FLOW_DISSECTOR] = "flow_dissector", - [BPF_GNET_RESERVE0] = "gnet_reserve0", + [BPF_GNET_TCP_RECVMSG] = "gnet_tcp_recvmsg", + [BPF_GNET_SK_DST_SET] = "gnet_sk_dst_set", + [BPF_GNET_RCV_NIC_NODE] = "gnet_rcv_nic_node", + [BPF_GNET_SEND_NIC_NODE] = "gnet_send_nic_node", [__MAX_BPF_ATTACH_TYPE] = NULL, };
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dc493193174f..254b5118921d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -246,7 +246,10 @@ enum bpf_attach_type { BPF_XDP, #ifndef __GENKSYMS__ BPF_SCHED, - BPF_GNET_RESERVE0, + BPF_GNET_TCP_RECVMSG, + BPF_GNET_SK_DST_SET, + BPF_GNET_RCV_NIC_NODE, + BPF_GNET_SEND_NIC_NODE, #endif __MAX_BPF_ATTACH_TYPE }; @@ -3922,6 +3925,12 @@ union bpf_attr { * get resource statistics of *nid* and store in *ctx*. * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags) + * Description + * update network's relationship to sched subsystem. + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -4098,6 +4107,7 @@ union bpf_attr { FN(get_task_relationship_stats),\ FN(sched_set_curr_preferred_node),\ FN(get_node_stats), \ + FN(sched_net_rship_submit), \ /* */
/* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4384,6 +4394,10 @@ struct bpf_sock { __u32 dst_ip6[4]; __u32 state; __s32 rx_queue_mapping; + __s32 sk_send_tid; + __s32 sk_peer_tid; + __u64 rcv_tid_bytes; + __u64 rcv_numa_node_bytes; };
struct bpf_tcp_sock { @@ -5254,6 +5268,13 @@ enum {
struct bpf_gnet_ctx { __bpf_md_ptr(struct bpf_sock *, sk); + int curr_tid; + int peer_tid; + int numa_node; + __u64 rxtx_bytes; + int rx_dev_idx; + int rx_dev_queue_idx; + __u64 rx_dev_netns_cookie; };
#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 1a04ac5395bb..b7f71d2d7d53 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8494,8 +8494,14 @@ static const struct bpf_sec_def section_defs[] = { BPF_PROG_SEC("struct_ops", BPF_PROG_TYPE_STRUCT_OPS), BPF_EAPROG_SEC("sk_lookup/", BPF_PROG_TYPE_SK_LOOKUP, BPF_SK_LOOKUP), - BPF_EAPROG_SEC("gnet/reserve0", BPF_PROG_TYPE_NET_GLOBAL, - BPF_GNET_RESERVE0), + BPF_EAPROG_SEC("gnet/tcp_recvmsg", BPF_PROG_TYPE_NET_GLOBAL, + BPF_GNET_TCP_RECVMSG), + BPF_EAPROG_SEC("gnet/sk_dst_set", BPF_PROG_TYPE_NET_GLOBAL, + BPF_GNET_SK_DST_SET), + BPF_EAPROG_SEC("gnet/rcv_nic_node", BPF_PROG_TYPE_NET_GLOBAL, + BPF_GNET_RCV_NIC_NODE), + BPF_EAPROG_SEC("gnet/send_nic_node", BPF_PROG_TYPE_NET_GLOBAL, + BPF_GNET_SEND_NIC_NODE), };
#undef BPF_PROG_SEC_IMPL
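
For reference, user space could load and attach such a program through
the libbpf section names added above; the sketch below assumes the
generic bpf_prog_attach() path is used for BPF_PROG_TYPE_NET_GLOBAL
programs and that the target fd is ignored (passed as 0):

  #include <bpf/libbpf.h>
  #include <bpf/bpf.h>

  int attach_gnet_tcp_recvmsg(const char *obj_path)
  {
          struct bpf_object *obj;
          struct bpf_program *prog;
          int err;

          obj = bpf_object__open_file(obj_path, NULL);
          if (libbpf_get_error(obj))
                  return -1;

          err = bpf_object__load(obj);
          if (err)
                  return err;

          /* section name resolves program/attach type via section_defs[] */
          prog = bpf_object__find_program_by_title(obj, "gnet/tcp_recvmsg");
          if (!prog)
                  return -1;

          return bpf_prog_attach(bpf_program__fd(prog), 0,
                                 BPF_GNET_TCP_RECVMSG, 0);
  }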