Add network relationship for NUMA isolation and consolidation.
Eric Dumazet (1):
  net: initialize net->net_cookie at netns setup

Liu Jian (3):
  net: fix kabi breakage in struct net
  net: add one bpf prog type for network numa relationship
  net: add some bpf hooks in tcp stack for network numa relationship

Martynas Pumputis (1):
  net: retrieve netns cookie via getsocketopt
 arch/alpha/include/uapi/asm/socket.h  |   2 +
 arch/mips/include/uapi/asm/socket.h   |   2 +
 arch/parisc/include/uapi/asm/socket.h |   2 +
 arch/sparc/include/uapi/asm/socket.h  |   2 +
 include/linux/bpf_types.h             |   4 +
 include/linux/filter.h                |  67 ++++++
 include/linux/skbuff.h                |   4 +
 include/net/net_namespace.h           |   8 +-
 include/net/net_rship.h               | 329 ++++++++++++++++++++++++++
 include/net/sock.h                    |   4 +
 include/uapi/asm-generic/socket.h     |   2 +
 include/uapi/linux/bpf.h              |  23 ++
 init/Kconfig                          |   1 +
 kernel/bpf/syscall.c                  |  19 ++
 net/Kconfig                           |   6 +
 net/core/dev.c                        |   4 +
 net/core/filter.c                     | 218 ++++++++++++++++-
 net/core/net_namespace.c              |  19 +-
 net/core/skbuff.c                     |  17 +-
 net/core/sock.c                       |  28 +++
 net/core/sysctl_net_core.c            |  18 ++
 net/ipv4/tcp.c                        |   6 +
 net/ipv4/tcp_output.c                 |   3 +
 tools/bpf/bpftool/prog.c              |   5 +
 tools/include/uapi/linux/bpf.h        |  27 +++
 tools/lib/bpf/libbpf.c                |   8 +
 tools/lib/bpf/libbpf_probes.c         |   1 +
 27 files changed, 804 insertions(+), 25 deletions(-)
 create mode 100644 include/net/net_rship.h
Feedback: The patch(es) you sent to kernel@openeuler.org failed to be converted into a PR!
Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/P...
Failed reason: applying the patch(es) failed; Patch failed at 0005 net: add some bpf hooks in tcp stack for network numa relationship
Suggested solution: please check whether the failed patch(es) can be applied on the newest code of the expected branch
From: Eric Dumazet <edumazet@google.com>
mainline inclusion
from mainline-v5.12-rc1-dontuse
commit 3d368ab87cf6681f928de1ddf804d69600671bb2
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
It is simpler to make net->net_cookie a plain u64 written once in setup_net() instead of looping and using atomic64 helpers.
Lorenz Bauer wants to add a SO_NETNS_COOKIE socket option, and this patch would make his patch series simpler.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lorenz Bauer <lmb@cloudflare.com>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Tested-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 include/net/net_namespace.h |  4 +---
 net/core/filter.c           |  8 +++-----
 net/core/net_namespace.c    | 19 +++----------------
 3 files changed, 7 insertions(+), 24 deletions(-)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index d415ecbd8958..05fd41222a81 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -173,7 +173,7 @@ struct net {
     struct netns_xfrm xfrm;
 #endif
 
-    atomic64_t net_cookie; /* written once */
+    u64 net_cookie; /* written once */
 
 #if IS_ENABLED(CONFIG_IP_VS)
     struct netns_ipvs *ipvs;
@@ -247,8 +247,6 @@ extern struct list_head net_namespace_list;
 struct net *get_net_ns_by_pid(pid_t pid);
 struct net *get_net_ns_by_fd(int fd);
 
-u64 __net_gen_cookie(struct net *net);
-
 #ifdef CONFIG_SYSCTL
 void ipx_register_sysctl(void);
 void ipx_unregister_sysctl(void);
diff --git a/net/core/filter.c b/net/core/filter.c
index fff5d2d7c6c3..16a2774eecd6 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4698,11 +4698,9 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_ops_proto = {
 
 static u64 __bpf_get_netns_cookie(struct sock *sk)
 {
-#ifdef CONFIG_NET_NS
-    return __net_gen_cookie(sk ? sk->sk_net.net : &init_net);
-#else
-    return 0;
-#endif
+    const struct net *net = sk ? sock_net(sk) : &init_net;
+
+    return net->net_cookie;
 }
 
 BPF_CALL_1(bpf_get_netns_cookie_sock, struct sock *, ctx)
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e05dd4f3279a..20a0fc4d059b 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -72,18 +72,6 @@ static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS;
 
 DEFINE_COOKIE(net_cookie);
 
-u64 __net_gen_cookie(struct net *net)
-{
-    while (1) {
-        u64 res = atomic64_read(&net->net_cookie);
-
-        if (res)
-            return res;
-        res = gen_cookie_next(&net_cookie);
-        atomic64_cmpxchg(&net->net_cookie, 0, res);
-    }
-}
-
 static struct net_generic *net_alloc_generic(void)
 {
     struct net_generic *ng;
@@ -341,6 +329,9 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
     refcount_set(&net->count, 1);
     refcount_set(&net->passive, 1);
     get_random_bytes(&net->hash_mix, sizeof(u32));
+    preempt_disable();
+    net->net_cookie = gen_cookie_next(&net_cookie);
+    preempt_enable();
     net->dev_base_seq = 1;
     net->user_ns = user_ns;
     idr_init(&net->netns_ids);
@@ -1128,10 +1119,6 @@ static int __init net_ns_init(void)
 
     rcu_assign_pointer(init_net.gen, ng);
 
-    preempt_disable();
-    __net_gen_cookie(&init_net);
-    preempt_enable();
-
     down_write(&pernet_ops_rwsem);
     if (setup_net(&init_net, &init_user_ns))
         panic("Could not setup the initial network namespace");
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Fix kabi breakage in struct net.
Fixes: 3d368ab87cf6 ("net: initialize net->net_cookie at netns setup")
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 include/net/net_namespace.h | 4 ++++
 1 file changed, 4 insertions(+)
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 05fd41222a81..e6a8d9dc972f 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -173,7 +173,11 @@ struct net {
     struct netns_xfrm xfrm;
 #endif
 
+#ifdef __GENKSYMS__
+    atomic64_t net_cookie; /* written once */
+#else
     u64 net_cookie; /* written once */
+#endif
 
 #if IS_ENABLED(CONFIG_IP_VS)
     struct netns_ipvs *ipvs;
From: Martynas Pumputis <m@lambda.lt>
mainline inclusion
from mainline-v5.14-rc1
commit e8b9eab99232c4e62ada9d7976c80fd5e8118289
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
It's getting more common to run nested container environments for testing cloud software. One such example is Kind [1], which runs a Kubernetes cluster in Docker containers on a single host. Each container acts as a Kubernetes node, and thus can run any Pod (aka container) inside itself. This approach simplifies testing a lot, as it eliminates complicated VM setups.
Unfortunately, such a setup breaks some functionality when cgroupv2 BPF programs are used for load-balancing. The load-balancer BPF program needs to detect whether a request originates from the host netns or a container netns in order to allow some access, e.g. to a service via a loopback IP address. Typically, the programs detect this by comparing netns cookies with that of the init ns via a call to bpf_get_netns_cookie(NULL). However, in nested environments the latter cannot be used, given that the Kubernetes node's netns is outside the init ns. To fix this, we need to pass the Kubernetes node netns cookie to the program in a different way: by extending getsockopt() with a SO_NETNS_COOKIE option, the orchestrator which runs in the Kubernetes node netns can retrieve the cookie and pass it to the program instead.
Thus, this is following up on Eric's commit 3d368ab87cf6 ("net: initialize net->net_cookie at netns setup") to allow retrieval via SO_NETNS_COOKIE. This is also in line with how we retrieve the socket cookie via SO_COOKIE.
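For illustration only (not part of this patch), a minimal userspace sketch of reading the cookie through the new option; it assumes a libc whose headers do not yet define SO_NETNS_COOKIE, so the asm-generic value added below is defined locally, and error handling is abbreviated:

#include <stdio.h>
#include <stdint.h>
#include <sys/socket.h>

#ifndef SO_NETNS_COOKIE
#define SO_NETNS_COOKIE 71 /* asm-generic value from this patch */
#endif

int main(void)
{
    uint64_t cookie;
    socklen_t len = sizeof(cookie); /* kernel rejects any other length */
    int fd = socket(AF_INET, SOCK_STREAM, 0);

    if (fd < 0 || getsockopt(fd, SOL_SOCKET, SO_NETNS_COOKIE, &cookie, &len))
        return 1;
    printf("netns cookie: %llu\n", (unsigned long long)cookie);
    return 0;
}

The orchestrator can then hand this value to the load-balancer program, e.g. via a BPF map, instead of relying on bpf_get_netns_cookie(NULL).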
Signed-off-by: Lorenz Bauer <lmb@cloudflare.com>
Signed-off-by: Martynas Pumputis <m@lambda.lt>
Cc: Eric Dumazet <edumazet@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

Conflicts:
    arch/alpha/include/uapi/asm/socket.h
    arch/mips/include/uapi/asm/socket.h
    arch/parisc/include/uapi/asm/socket.h
    arch/sparc/include/uapi/asm/socket.h
    include/uapi/asm-generic/socket.h
[This is because we did not backport commit 7c951cafc0cb, 7fd3253a7de6]
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 arch/alpha/include/uapi/asm/socket.h  | 2 ++
 arch/mips/include/uapi/asm/socket.h   | 2 ++
 arch/parisc/include/uapi/asm/socket.h | 2 ++
 arch/sparc/include/uapi/asm/socket.h  | 2 ++
 include/uapi/asm-generic/socket.h     | 2 ++
 net/core/sock.c                       | 7 +++++++
 6 files changed, 17 insertions(+)
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index de6c4df61082..d033d3f92d6d 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h
@@ -124,6 +124,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_NETNS_COOKIE 71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index d0a9ed2ca2d6..ff3ab771e769 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h
@@ -135,6 +135,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_NETNS_COOKIE 71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index 10173c32195e..1a8ec3838c9b 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h
@@ -116,6 +116,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 0x4042
 
+#define SO_NETNS_COOKIE 0x4045
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 8029b681fc7c..08f9bbbf5bf2 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h
@@ -117,6 +117,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 0x0047
 
+#define SO_NETNS_COOKIE 0x0050
+
 #if !defined(__KERNEL__)
 
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 77f7c1638eb1..645606824258 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h
@@ -119,6 +119,8 @@
 
 #define SO_DETACH_REUSEPORT_BPF 68
 
+#define SO_NETNS_COOKIE 71
+
 #if !defined(__KERNEL__)
 
 #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/net/core/sock.c b/net/core/sock.c
index a15e984bd385..8f0b8e1eaadd 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1619,6 +1619,13 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
         v.val = sk->sk_bound_dev_if;
         break;
 
+    case SO_NETNS_COOKIE:
+        lv = sizeof(u64);
+        if (len != lv)
+            return -EINVAL;
+        v.val64 = sock_net(sk)->net_cookie;
+        break;
+
     default:
         /* We implement the SO_SNDLOWAT etc to not be settable
          * (1003.1g 7).
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add a new bpf prog type for the network numa relationship.
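As an illustration only (not part of this patch), a minimal bpf-side sketch for the new type, assuming the patched uapi and libbpf headers from this series; the body is a placeholder since the GNET_RESERVE0 hook carries no semantics yet:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("gnet/reserve0")
int gnet_prog(struct bpf_gnet_ctx *ctx)
{
    /* The context is read-only (BPF_WRITE is rejected by
     * bpf_gnet_is_valid_access()), and ctx->sk may be NULL. */
    if (!ctx->sk)
        return 0;
    return 0;
}

char _license[] SEC("license") = "GPL";

Since the hook is global rather than per-cgroup, attaching goes through the plain BPF_PROG_ATTACH command with no target fd, e.g. libbpf's bpf_prog_attach(prog_fd, 0, BPF_GNET_RESERVE0, 0).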
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 include/linux/bpf_types.h      |   4 +
 include/linux/filter.h         |  54 ++++++++++
 include/uapi/linux/bpf.h       |   6 ++
 kernel/bpf/syscall.c           |  16 ++++
 net/Kconfig                    |   6 ++
 net/core/filter.c              | 148 +++++++++++++++++++++++++++
 tools/bpf/bpftool/prog.c       |   2 +
 tools/include/uapi/linux/bpf.h |   6 ++
 tools/lib/bpf/libbpf.c         |   2 +
 tools/lib/bpf/libbpf_probes.c  |   1 +
 10 files changed, 245 insertions(+)
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5732b485c539..57954e35fd36 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -81,6 +81,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LSM, lsm,
 BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED, bpf_sched,
           void *, void *)
 #endif /* CONFIG_BPF_SCHED */
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+BPF_PROG_TYPE(BPF_PROG_TYPE_NET_GLOBAL, bpf_gnet,
+          struct bpf_gnet_ctx, struct bpf_gnet_ctx)
+#endif
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 4479a49a4f7c..10901c4f5b20 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1474,4 +1474,58 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 }
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+struct bpf_gnet_ctx_kern {
+    struct sock *sk;
+};
+
+enum gnet_bpf_attach_type {
+    GNET_BPF_ATTACH_TYPE_INVALID = -1,
+    GNET_RESERVE0 = 0,
+    MAX_GNET_BPF_ATTACH_TYPE
+};
+
+#define GNET_ATYPE(type) \
+    case BPF_##type: return type
+
+static inline enum gnet_bpf_attach_type
+to_gnet_bpf_attach_type(enum bpf_attach_type attach_type)
+{
+    switch (attach_type) {
+    GNET_ATYPE(GNET_RESERVE0);
+    default:
+        return GNET_BPF_ATTACH_TYPE_INVALID;
+    }
+}
+
+struct gnet_bpf {
+    struct bpf_prog __rcu *progs[MAX_GNET_BPF_ATTACH_TYPE];
+    u32 flags[MAX_GNET_BPF_ATTACH_TYPE];
+};
+
+extern struct static_key_false gnet_bpf_enabled_key[MAX_GNET_BPF_ATTACH_TYPE];
+#define gnet_bpf_enabled(atype) static_branch_unlikely(&gnet_bpf_enabled_key[atype])
+extern struct gnet_bpf gnet_bpf_progs;
+
+int gnet_bpf_prog_attach(const union bpf_attr *attr,
+             enum bpf_prog_type ptype, struct bpf_prog *prog);
+int gnet_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
+
+static inline void run_gnet_bpf(enum gnet_bpf_attach_type atype,
+                struct bpf_gnet_ctx_kern *ctx)
+{
+    struct bpf_prog *prog;
+
+    rcu_read_lock();
+    prog = rcu_dereference(gnet_bpf_progs.progs[atype]);
+    if (unlikely(!prog))
+        goto out;
+
+    bpf_prog_run_pin_on_cpu(prog, ctx);
+out:
+    rcu_read_unlock();
+}
+
+#endif
+
 #endif /* __LINUX_FILTER_H__ */
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b87934003c40..b4ddcba26377 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -201,6 +201,7 @@ enum bpf_prog_type {
     BPF_PROG_TYPE_SK_LOOKUP,
 #ifndef __GENKSYMS__
     BPF_PROG_TYPE_SCHED,
+    BPF_PROG_TYPE_NET_GLOBAL,
 #endif
 };
 
@@ -245,6 +246,7 @@ enum bpf_attach_type {
     BPF_XDP,
 #ifndef __GENKSYMS__
     BPF_SCHED,
+    BPF_GNET_RESERVE0,
 #endif
     __MAX_BPF_ATTACH_TYPE
 };
@@ -5250,4 +5252,8 @@ enum {
     BTF_F_ZERO = (1ULL << 3),
 };
 
+struct bpf_gnet_ctx {
+    __bpf_md_ptr(struct bpf_sock *, sk);
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index ba690c210f57..172d4005c940 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2107,6 +2107,9 @@ static bool is_net_admin_prog_type(enum bpf_prog_type prog_type)
     case BPF_PROG_TYPE_CGROUP_SYSCTL:
     case BPF_PROG_TYPE_SOCK_OPS:
     case BPF_PROG_TYPE_EXT: /* extends any prog */
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+    case BPF_PROG_TYPE_NET_GLOBAL:
+#endif
         return true;
     case BPF_PROG_TYPE_CGROUP_SKB:
         /* always unpriv */
@@ -3017,6 +3020,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
         return BPF_PROG_TYPE_SK_LOOKUP;
     case BPF_XDP:
         return BPF_PROG_TYPE_XDP;
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+    case BPF_GNET_RESERVE0:
+        return BPF_PROG_TYPE_NET_GLOBAL;
+#endif
     default:
         return BPF_PROG_TYPE_UNSPEC;
     }
@@ -3072,6 +3079,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
     case BPF_PROG_TYPE_SOCK_OPS:
         ret = cgroup_bpf_prog_attach(attr, ptype, prog);
         break;
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+    case BPF_PROG_TYPE_NET_GLOBAL:
+        ret = gnet_bpf_prog_attach(attr, ptype, prog);
+        break;
+#endif
     default:
         ret = -EINVAL;
     }
@@ -3108,6 +3120,10 @@ static int bpf_prog_detach(const union bpf_attr *attr)
     case BPF_PROG_TYPE_CGROUP_SYSCTL:
     case BPF_PROG_TYPE_SOCK_OPS:
         return cgroup_bpf_prog_detach(attr, ptype);
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+    case BPF_PROG_TYPE_NET_GLOBAL:
+        return gnet_bpf_prog_detach(attr, ptype);
+#endif
     default:
         return -EINVAL;
     }
diff --git a/net/Kconfig b/net/Kconfig
index 232075ae15e2..6186e9ad88a3 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -470,6 +470,12 @@ config ETHTOOL_NETLINK
       netlink. It provides better extensibility and some new features,
       e.g. notification messages.
 
+config BPF_NET_GLOBAL_PROG
+    bool "Network global bpf prog type"
+    depends on NET
+    depends on BPF_SYSCALL
+    default n
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.
diff --git a/net/core/filter.c b/net/core/filter.c
index 16a2774eecd6..a5b497043eda 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10682,3 +10682,151 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id)
 
     return func;
 }
+
+#ifdef CONFIG_BPF_NET_GLOBAL_PROG
+static DEFINE_MUTEX(gnet_bpf_mutex);
+struct gnet_bpf gnet_bpf_progs;
+EXPORT_SYMBOL(gnet_bpf_progs);
+struct static_key_false gnet_bpf_enabled_key[MAX_GNET_BPF_ATTACH_TYPE];
+EXPORT_SYMBOL(gnet_bpf_enabled_key);
+
+int gnet_bpf_prog_attach(const union bpf_attr *attr,
+             enum bpf_prog_type ptype, struct bpf_prog *prog)
+{
+    enum gnet_bpf_attach_type atype;
+    struct bpf_prog *attached;
+    int ret = 0;
+
+    if (attr->attach_flags || attr->replace_bpf_fd)
+        return -EINVAL;
+
+    atype = to_gnet_bpf_attach_type(attr->attach_type);
+    if (atype < 0)
+        return -EINVAL;
+
+    mutex_lock(&gnet_bpf_mutex);
+    attached = gnet_bpf_progs.progs[atype];
+    if (attached == prog) {
+        ret = -EINVAL;
+        goto out_unlock;
+    }
+
+    rcu_assign_pointer(gnet_bpf_progs.progs[atype], prog);
+    gnet_bpf_progs.flags[atype] = attr->attach_flags;
+    if (attached)
+        bpf_prog_put(attached);
+    else
+        static_branch_inc(&gnet_bpf_enabled_key[atype]);
+
+out_unlock:
+    mutex_unlock(&gnet_bpf_mutex);
+    return ret;
+}
+
+int gnet_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
+{
+    enum gnet_bpf_attach_type atype;
+    struct bpf_prog *attached;
+    int ret = 0;
+
+    atype = to_gnet_bpf_attach_type(attr->attach_type);
+    if (atype < 0)
+        return -EINVAL;
+
+    mutex_lock(&gnet_bpf_mutex);
+    attached = gnet_bpf_progs.progs[atype];
+    if (!attached) {
+        ret = -ENOENT;
+        goto out_unlock;
+    }
+
+    static_branch_dec(&gnet_bpf_enabled_key[atype]);
+    gnet_bpf_progs.flags[atype] = 0;
+    rcu_assign_pointer(gnet_bpf_progs.progs[atype], NULL);
+    bpf_prog_put(attached);
+out_unlock:
+    mutex_unlock(&gnet_bpf_mutex);
+    return ret;
+}
+
+static int __init gnet_bpf_init(void)
+{
+    return 0;
+}
+late_initcall(gnet_bpf_init);
+
+static const struct bpf_func_proto *
+bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+    switch (func_id) {
+    case BPF_FUNC_perf_event_output:
+        return &bpf_skb_event_output_proto;
+    case BPF_FUNC_sk_fullsock:
+        return &bpf_sk_fullsock_proto;
+    default:
+        break;
+    }
+
+    return bpf_sk_base_func_proto(func_id);
+}
+
+static bool bpf_gnet_is_valid_access(int off, int size,
+                     enum bpf_access_type type,
+                     const struct bpf_prog *prog,
+                     struct bpf_insn_access_aux *info)
+{
+    if (off < 0 || off >= sizeof(struct bpf_gnet_ctx))
+        return false;
+
+    /* The verifier guarantees that size > 0. */
+    if (off % size != 0)
+        return false;
+
+    if (type == BPF_WRITE)
+        return false;
+
+    switch (off) {
+    case offsetof(struct bpf_gnet_ctx, sk):
+        if (size != sizeof(__u64))
+            return false;
+        info->reg_type = PTR_TO_SOCKET_OR_NULL;
+        break;
+    default:
+        break;
+    }
+    return true;
+}
+
+static u32 bpf_gnet_convert_ctx_access(enum bpf_access_type type,
+                       const struct bpf_insn *si,
+                       struct bpf_insn *insn_buf,
+                       struct bpf_prog *prog, u32 *target_size)
+{
+    struct bpf_insn *insn = insn_buf;
+
+    switch (si->off) {
+    case offsetof(struct bpf_gnet_ctx, sk):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, sk),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, sk));
+        break;
+    }
+    return insn - insn_buf;
+}
+
+static int bpf_gnet_gen_prologue(struct bpf_insn *insn_buf, bool direct_write,
+                 const struct bpf_prog *prog)
+{
+    return 0;
+}
+
+const struct bpf_verifier_ops bpf_gnet_verifier_ops = {
+    .get_func_proto     = bpf_gnet_func_proto,
+    .is_valid_access    = bpf_gnet_is_valid_access,
+    .convert_ctx_access = bpf_gnet_convert_ctx_access,
+    .gen_prologue       = bpf_gnet_gen_prologue,
+};
+
+const struct bpf_prog_ops bpf_gnet_prog_ops = {
+};
+#endif
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index 2c85586ec224..d9b2fe1c451a 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -65,6 +65,7 @@ const char * const prog_type_name[] = {
     [BPF_PROG_TYPE_LSM]       = "lsm",
     [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup",
     [BPF_PROG_TYPE_SCHED]     = "sched",
+    [BPF_PROG_TYPE_NET_GLOBAL] = "gnet",
 };
 
 const size_t prog_type_name_size = ARRAY_SIZE(prog_type_name);
@@ -79,6 +80,7 @@ static const char * const attach_type_strings[] = {
     [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
     [BPF_SK_MSG_VERDICT]        = "msg_verdict",
     [BPF_FLOW_DISSECTOR]        = "flow_dissector",
+    [BPF_GNET_RESERVE0]         = "gnet_reserve0",
     [__MAX_BPF_ATTACH_TYPE]     = NULL,
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 5a153a1a8f18..dc493193174f 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -201,6 +201,7 @@ enum bpf_prog_type {
     BPF_PROG_TYPE_SK_LOOKUP,
 #ifndef __GENKSYMS__
     BPF_PROG_TYPE_SCHED,
+    BPF_PROG_TYPE_NET_GLOBAL,
 #endif
 };
 
@@ -245,6 +246,7 @@ enum bpf_attach_type {
     BPF_XDP,
 #ifndef __GENKSYMS__
     BPF_SCHED,
+    BPF_GNET_RESERVE0,
 #endif
     __MAX_BPF_ATTACH_TYPE
 };
@@ -5250,4 +5252,8 @@ enum {
     BTF_F_ZERO = (1ULL << 3),
 };
 
+struct bpf_gnet_ctx {
+    __bpf_md_ptr(struct bpf_sock *, sk);
+};
+
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 283881242222..1a04ac5395bb 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8494,6 +8494,8 @@ static const struct bpf_sec_def section_defs[] = {
     BPF_PROG_SEC("struct_ops",      BPF_PROG_TYPE_STRUCT_OPS),
     BPF_EAPROG_SEC("sk_lookup/",    BPF_PROG_TYPE_SK_LOOKUP,
                                     BPF_SK_LOOKUP),
+    BPF_EAPROG_SEC("gnet/reserve0", BPF_PROG_TYPE_NET_GLOBAL,
+                                    BPF_GNET_RESERVE0),
 };
 
 #undef BPF_PROG_SEC_IMPL
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 13393f0eab25..73aef4467823 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -111,6 +111,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
     case BPF_PROG_TYPE_STRUCT_OPS:
     case BPF_PROG_TYPE_EXT:
     case BPF_PROG_TYPE_LSM:
+    case BPF_PROG_TYPE_NET_GLOBAL:
     default:
         break;
     }
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9GZAQ
CVE: NA
--------------------------------
Add some bpf hooks in the tcp stack for the network numa relationship. Add the sysctl net.core.numa_rship_ms to control the sampling frequency, and add the bpf_sched_net_rship_submit bpf helper function to submit the relationship to the scheduler subsystem.
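For illustration only (not part of this patch), a sketch of a program for one of the new hooks, assuming the patched uapi and libbpf headers from this series; it merely prints the read-only context fields via bpf_printk (assuming the trace_printk helper is reachable through the base func proto), whereas a real policy would fill a struct net_relationship_req (from the scheduler-side header linux/sched/relationship.h) and pass it to bpf_sched_net_rship_submit():

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("gnet/tcp_recvmsg")
int observe_tcp_rx(struct bpf_gnet_ctx *ctx)
{
    /* curr_tid: the receiving task; peer_tid: the task that allocated
     * the skb; rxtx_bytes: bytes accumulated since the last invocation
     * (rate-limited by net.core.numa_rship_ms). */
    bpf_printk("pair %d<->%d bytes %llu",
           ctx->curr_tid, ctx->peer_tid, ctx->rxtx_bytes);
    return 0;
}

char _license[] SEC("license") = "GPL";

The sampling interval can be tuned at runtime, e.g. "sysctl -w net.core.numa_rship_ms=200" (clamped to the 100ms..100s range below).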
Signed-off-by: Liu Jian <liujian56@huawei.com>
---
 include/linux/filter.h         |  17 +-
 include/linux/skbuff.h         |   4 +
 include/net/net_rship.h        | 329 +++++++++++++++++++++++++++++
 include/net/sock.h             |   4 +
 include/uapi/linux/bpf.h       |  19 +-
 init/Kconfig                   |   1 +
 kernel/bpf/syscall.c           |   5 +-
 net/core/dev.c                 |   4 +
 net/core/filter.c              |  62 ++++++
 net/core/skbuff.c              |  17 +-
 net/core/sock.c                |  21 +++
 net/core/sysctl_net_core.c     |  18 ++
 net/ipv4/tcp.c                 |   6 +
 net/ipv4/tcp_output.c          |   3 +
 tools/bpf/bpftool/prog.c       |   5 +-
 tools/include/uapi/linux/bpf.h |  23 ++-
 tools/lib/bpf/libbpf.c         |  10 +-
 17 files changed, 539 insertions(+), 9 deletions(-)
 create mode 100644 include/net/net_rship.h
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 10901c4f5b20..a9ae21f08ce2 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -1477,11 +1477,21 @@ static inline bool bpf_sk_lookup_run_v6(struct net *net, int protocol,
 #ifdef CONFIG_BPF_NET_GLOBAL_PROG
 struct bpf_gnet_ctx_kern {
     struct sock *sk;
+    int curr_tid;
+    int peer_tid;
+    int numa_node;
+    __u64 rxtx_bytes;
+    int rx_dev_idx;
+    int rx_dev_queue_idx;
+    __u64 rx_dev_netns_cookie;
 };
 
 enum gnet_bpf_attach_type {
     GNET_BPF_ATTACH_TYPE_INVALID = -1,
-    GNET_RESERVE0 = 0,
+    GNET_TCP_RECVMSG = 0,
+    GNET_SK_DST_SET,
+    GNET_RCV_NIC_NODE,
+    GNET_SEND_NIC_NODE,
     MAX_GNET_BPF_ATTACH_TYPE
 };
 
@@ -1492,7 +1502,10 @@ static inline enum gnet_bpf_attach_type
 to_gnet_bpf_attach_type(enum bpf_attach_type attach_type)
 {
     switch (attach_type) {
-    GNET_ATYPE(GNET_RESERVE0);
+    GNET_ATYPE(GNET_TCP_RECVMSG);
+    GNET_ATYPE(GNET_SK_DST_SET);
+    GNET_ATYPE(GNET_RCV_NIC_NODE);
+    GNET_ATYPE(GNET_SEND_NIC_NODE);
     default:
         return GNET_BPF_ATTACH_TYPE_INVALID;
     }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index ce3dfed6b915..469d7e6a2cec 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -924,7 +924,11 @@ struct sk_buff {
     /* public: */
 
     KABI_USE2(1, __u8 scm_io_uring:1, __u8 local_skb:1)
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+    KABI_USE(2, struct sched_net_rship_skb *net_rship)
+#else
     KABI_RESERVE(2)
+#endif
     KABI_RESERVE(3)
     KABI_RESERVE(4)
 
diff --git a/include/net/net_rship.h b/include/net/net_rship.h
new file mode 100644
index 000000000000..ad8af5a5cb9b
--- /dev/null
+++ b/include/net/net_rship.h
@@ -0,0 +1,329 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Common code for task relationship aware
+ *
+ * Copyright (C) 2024 Huawei Technologies Co., Ltd
+ *
+ */
+
+#ifndef __LINUX_NET_RSHIP_H__
+#define __LINUX_NET_RSHIP_H__
+
+#include <linux/types.h>
+#include <linux/jiffies.h>
+#include <linux/socket.h>
+#include <linux/sched.h>
+#include <linux/timer.h>
+#include <linux/net.h>
+#include <linux/interrupt.h>
+#include <linux/static_key.h>
+
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/filter.h>
+
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+
+struct sched_net_rship_skb {
+    /* for loopback traffic */
+    pid_t alloc_tid;
+
+    /* for phy nic */
+    union {
+        u32 rx_dev_idx; /* rx */
+        int dev_numa_node; /* tx */
+    };
+    u16 alloc_cpu;
+    u16 rx_queue_idx;
+    u64 rx_dev_net_cookie;
+};
+
+struct sk_buff_fclones_net_rship {
+    struct sk_buff_fclones fclones;
+    struct sched_net_rship_skb ext1;
+    struct sched_net_rship_skb ext2;
+};
+
+struct sk_buff_net_rship {
+    struct sk_buff skb;
+    struct sched_net_rship_skb ext;
+};
+
+struct sched_net_rship_sock {
+    /* for loopback traffic */
+    pid_t sk_peer_tid;
+    u64 tid_rx_bytes;
+    unsigned long last_rx_update;
+
+    /* for recv from phy nic */
+    int rcv_numa_node;
+    u64 rcv_numa_node_bytes;
+    unsigned long last_rcv_numa_node_update;
+
+    /* for send to phy nic */
+    pid_t sk_send_tid;
+    int send_numa_node;
+    u64 send_numa_node_bytes;
+    unsigned long last_send_numa_node_update;
+};
+#endif
+
+#if defined(CONFIG_SCHED_TASK_RELATIONSHIP) && defined(CONFIG_BPF_NET_GLOBAL_PROG)
+
+#define NET_RSHIP_HEAD_RESERVE 40
+extern unsigned long net_numa_rship_jiffies;
+
+static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
+{
+    sk->net_rship = (void *)(((char *)sk) + offset);
+    memset(sk->net_rship, 0, sizeof(struct sched_net_rship_sock));
+    sk->net_rship->rcv_numa_node = NUMA_NO_NODE;
+    sk->net_rship->send_numa_node = NUMA_NO_NODE;
+}
+
+static inline struct sched_net_rship_skb *__get_skb_net_rship(struct sk_buff *skb)
+{
+    return skb->net_rship;
+}
+
+static inline bool net_rship_refresh_timeout(unsigned long last_update)
+{
+    return time_after(jiffies, net_numa_rship_jiffies + last_update);
+}
+
+static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{
+    if (!gnet_bpf_enabled(GNET_SK_DST_SET))
+        return;
+
+    if (!in_task() || !dst)
+        return;
+
+    if (dev_to_node(&dst->dev->dev) != NUMA_NO_NODE) {
+        struct bpf_gnet_ctx_kern ctx = {0};
+
+        ctx.numa_node = dev_to_node(&dst->dev->dev);
+        if (sk->net_rship->sk_send_tid)
+            ctx.curr_tid = sk->net_rship->sk_send_tid;
+        else
+            ctx.curr_tid = task_pid_nr(current);
+        ctx.sk = sk;
+        run_gnet_bpf(GNET_SK_DST_SET, &ctx);
+    }
+}
+
+static inline void __net_rship_tcp_rcvmsg(struct sock *sk, pid_t tid)
+{
+    struct bpf_gnet_ctx_kern ctx = {0};
+
+    ctx.sk = sk;
+    ctx.curr_tid = task_pid_nr(current);
+    ctx.peer_tid = tid;
+    ctx.rxtx_bytes = sk->net_rship->tid_rx_bytes;
+    sk->net_rship->last_rx_update = jiffies;
+    run_gnet_bpf(GNET_TCP_RECVMSG, &ctx);
+    sk->net_rship->tid_rx_bytes = 0;
+}
+
+static inline void net_rship_tcp_local(struct sock *sk, struct sk_buff *skb)
+{
+    struct sched_net_rship_skb *ext;
+
+    if (!gnet_bpf_enabled(GNET_TCP_RECVMSG))
+        return;
+
+    ext = __get_skb_net_rship(skb);
+    if (!ext->alloc_tid)
+        return;
+
+    if (sk->net_rship->sk_peer_tid != ext->alloc_tid) {
+        sk->net_rship->sk_peer_tid = ext->alloc_tid;
+        sk->net_rship->tid_rx_bytes = skb->len + NET_RSHIP_HEAD_RESERVE;
+        __net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
+    } else {
+        sk->net_rship->tid_rx_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
+        if (net_rship_refresh_timeout(sk->net_rship->last_rx_update))
+            __net_rship_tcp_rcvmsg(sk, ext->alloc_tid);
+    }
+}
+
+static inline void net_rship_recv_nic_node(struct sock *sk, struct sk_buff *skb)
+{
+    struct sched_net_rship_skb *ext;
+
+    if (!gnet_bpf_enabled(GNET_RCV_NIC_NODE))
+        return;
+
+    ext = __get_skb_net_rship(skb);
+    if (ext->alloc_tid || ext->rx_dev_idx == -1)
+        return;
+
+    sk->net_rship->rcv_numa_node_bytes += (skb->len + NET_RSHIP_HEAD_RESERVE);
+    if (net_rship_refresh_timeout(sk->net_rship->last_rcv_numa_node_update)) {
+        struct bpf_gnet_ctx_kern ctx = {0};
+
+        ctx.sk = sk;
+        ctx.curr_tid = task_pid_nr(current);
+        ctx.numa_node = cpu_to_node(ext->alloc_cpu);
+        ctx.rxtx_bytes = sk->net_rship->rcv_numa_node_bytes;
+        ctx.rx_dev_idx = ext->rx_dev_idx;
+        ctx.rx_dev_queue_idx = skb_get_rx_queue(skb);
+        ctx.rx_dev_netns_cookie = ext->rx_dev_net_cookie;
+        run_gnet_bpf(GNET_RCV_NIC_NODE, &ctx);
+        sk->net_rship->last_rcv_numa_node_update = jiffies;
+        sk->net_rship->rcv_numa_node_bytes = 0;
+    }
+}
+
+static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
+{
+    net_rship_tcp_local(sk, skb);
+    net_rship_recv_nic_node(sk, skb);
+}
+
+static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
+{
+    struct sched_net_rship_skb *ext;
+
+    if (!gnet_bpf_enabled(GNET_SEND_NIC_NODE))
+        return;
+
+    ext = __get_skb_net_rship(skb);
+    if ((ext->dev_numa_node != NUMA_NO_NODE) &&
+        sk->net_rship->sk_send_tid) {
+        sk->net_rship->send_numa_node_bytes += skb->len;
+        if (net_rship_refresh_timeout(sk->net_rship->last_send_numa_node_update)) {
+            struct bpf_gnet_ctx_kern ctx = {0};
+
+            ctx.sk = sk;
+            ctx.curr_tid = sk->net_rship->sk_send_tid;
+            ctx.rxtx_bytes = sk->net_rship->send_numa_node_bytes;
+            ctx.numa_node = ext->dev_numa_node;
+
+            run_gnet_bpf(GNET_SEND_NIC_NODE, &ctx);
+            sk->net_rship->send_numa_node_bytes = 0;
+            sk->net_rship->last_send_numa_node_update = jiffies;
+        }
+    }
+}
+
+static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
+{
+    if (gnet_bpf_enabled(GNET_SEND_NIC_NODE)) {
+        struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+        ext->dev_numa_node = dev_to_node(&dev->dev);
+    }
+}
+
+static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
+{
+    if (gnet_bpf_enabled(GNET_RCV_NIC_NODE)) {
+        struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+        ext->rx_dev_idx = dev->ifindex;
+        ext->rx_dev_net_cookie = dev_net(dev)->net_cookie;
+    }
+}
+
+static inline void __net_rship_skb_clear(struct sched_net_rship_skb *ext)
+{
+    ext->alloc_tid = 0;
+    /* dev_name_node and rx_dev_idx */
+    ext->dev_numa_node = NUMA_NO_NODE;
+}
+
+static inline void net_rship_skb_clear(struct sk_buff *skb)
+{
+    struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+    __net_rship_skb_clear(ext);
+}
+
+static inline void __net_rship_skb_init(struct sk_buff *skb)
+{
+    __net_rship_skb_clear(skb->net_rship);
+    skb->net_rship->alloc_cpu = raw_smp_processor_id();
+}
+
+static inline void net_rship_skb_init(struct sk_buff *skb)
+{
+    struct sk_buff_net_rship *rskb = (void *)skb;
+
+    skb->net_rship = &rskb->ext;
+    __net_rship_skb_init(skb);
+}
+
+static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
+{
+    if (flags & SKB_ALLOC_FCLONE) {
+        struct sk_buff_fclones_net_rship *rskbs;
+
+        rskbs = (void *)container_of(skb, struct sk_buff_fclones, skb1);
+        skb->net_rship = &rskbs->ext1;
+        rskbs->fclones.skb2.net_rship = &rskbs->ext2;
+
+        __net_rship_skb_init(skb);
+        __net_rship_skb_init(&rskbs->fclones.skb2);
+    } else
+        net_rship_skb_init(skb);
+}
+
+static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{
+    n->net_rship->alloc_tid = skb->net_rship->alloc_tid;
+}
+
+/* Make sure it is a process context */
+static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
+{
+    if (gnet_bpf_enabled(GNET_TCP_RECVMSG) || gnet_bpf_enabled(GNET_RCV_NIC_NODE)) {
+        struct sched_net_rship_skb *ext = __get_skb_net_rship(skb);
+
+        ext->alloc_tid = task_pid_nr(current);
+    }
+    if (gnet_bpf_enabled(GNET_SK_DST_SET) || gnet_bpf_enabled(GNET_SEND_NIC_NODE))
+        sk->net_rship->sk_send_tid = task_pid_nr(current);
+}
+
+#else
+
+static inline void net_rship_sock_init(struct sock *sk, unsigned int offset)
+{}
+
+static inline void net_rship_sk_dst_set(struct sock *sk, struct dst_entry *dst)
+{}
+
+static inline void net_rship_tcp_recvmsg(struct sock *sk, struct sk_buff *skb)
+{}
+
+static inline void net_rship_send_nic_node(struct sock *sk, struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_record_rx_queue(struct sk_buff *skb, u16 rx_queue)
+{}
+
+static inline void net_rship_skb_record_dev_numa_node(struct sk_buff *skb, struct net_device *dev)
+{}
+
+static inline void net_rship_skb_record_dev_rxinfo(struct sk_buff *skb, struct net_device *dev)
+{}
+
+static inline void net_rship_skb_clear(struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_init(struct sk_buff *skb)
+{}
+
+static inline void net_rship_skb_init_flags(struct sk_buff *skb, int flags)
+{}
+
+static inline void net_rship_skb_clone(struct sk_buff *n, struct sk_buff *skb)
+{}
+
+static inline void net_rship_record_sendmsginfo(struct sk_buff *skb, struct sock *sk)
+{}
+#endif
+
+#endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 00051f2558fa..7078c98f9726 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -533,7 +533,11 @@ struct sock {
 #else
     KABI_RESERVE(1)
 #endif
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+    KABI_USE(2, struct sched_net_rship_sock *net_rship)
+#else
     KABI_RESERVE(2)
+#endif
     KABI_RESERVE(3)
     KABI_RESERVE(4)
     KABI_RESERVE(5)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index b4ddcba26377..c086cc287b47 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -246,7 +246,10 @@ enum bpf_attach_type {
     BPF_XDP,
 #ifndef __GENKSYMS__
     BPF_SCHED,
-    BPF_GNET_RESERVE0,
+    BPF_GNET_TCP_RECVMSG,
+    BPF_GNET_SK_DST_SET,
+    BPF_GNET_RCV_NIC_NODE,
+    BPF_GNET_SEND_NIC_NODE,
 #endif
     __MAX_BPF_ATTACH_TYPE
 };
@@ -3922,6 +3925,12 @@ union bpf_attr {
  *        get resource statistics of *nid* and store in *ctx*.
  *    Return
  *        0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags)
+ *    Description
+ *        update network's relationship to sched subsystem.
+ *    Return
+ *        0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN) \
     FN(unspec), \
@@ -4098,6 +4107,7 @@ union bpf_attr {
     FN(get_task_relationship_stats),\
     FN(sched_set_curr_preferred_node),\
     FN(get_node_stats), \
+    FN(sched_net_rship_submit), \
     /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -5254,6 +5264,13 @@ enum {
 
 struct bpf_gnet_ctx {
     __bpf_md_ptr(struct bpf_sock *, sk);
+    int curr_tid;
+    int peer_tid;
+    int numa_node;
+    __u64 rxtx_bytes;
+    int rx_dev_idx;
+    int rx_dev_queue_idx;
+    __u64 rx_dev_netns_cookie;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/init/Kconfig b/init/Kconfig
index 758b9988d742..c329e031689c 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1084,6 +1084,7 @@ config QOS_SCHED_DYNAMIC_AFFINITY
 config SCHED_TASK_RELATIONSHIP
     bool "task relationship"
     depends on NUMA_BALANCING
+    select BPF_NET_GLOBAL_PROG
     default n
     help
       This feature enables the scheduler to identify tasks relationship by
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 172d4005c940..7ccdb89b08c7 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -3021,7 +3021,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
     case BPF_XDP:
         return BPF_PROG_TYPE_XDP;
 #ifdef CONFIG_BPF_NET_GLOBAL_PROG
-    case BPF_GNET_RESERVE0:
+    case BPF_GNET_TCP_RECVMSG:
+    case BPF_GNET_SK_DST_SET:
+    case BPF_GNET_RCV_NIC_NODE:
+    case BPF_GNET_SEND_NIC_NODE:
         return BPF_PROG_TYPE_NET_GLOBAL;
 #endif
     default:
diff --git a/net/core/dev.c b/net/core/dev.c
index 1f1f93aad71c..8e0f4690e157 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -146,6 +146,7 @@
 #include <net/devlink.h>
 #include <linux/pm_runtime.h>
 #include <linux/prandom.h>
+#include <net/net_rship.h>
 
 #include "net-sysfs.h"
 
@@ -3595,6 +3596,8 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
     if (dev_nit_active(dev))
         dev_queue_xmit_nit(skb, dev);
 
+    net_rship_skb_record_dev_numa_node(skb, dev);
+
     len = skb->len;
     PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
     trace_net_dev_start_xmit(skb, dev);
@@ -6197,6 +6200,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
     __vlan_hwaccel_clear_tag(skb);
     skb->dev = napi->dev;
     skb->skb_iif = 0;
+    net_rship_skb_record_dev_rxinfo(skb, napi->dev);
 
     /* eth_type_trans() assumes pkt_type is PACKET_HOST */
     skb->pkt_type = PACKET_HOST;
diff --git a/net/core/filter.c b/net/core/filter.c
index a5b497043eda..4f4e832f3e9f 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10755,6 +10755,30 @@ static int __init gnet_bpf_init(void)
 }
 late_initcall(gnet_bpf_init);
 
+#include <linux/sched/relationship.h>
+BPF_CALL_3(bpf_sched_net_rship_submit, void *, reqbuf, size_t, sz, u64, flags)
+{
+#if defined(CONFIG_SCHED_TASK_RELATIONSHIP)
+    struct net_relationship_req *req = reqbuf;
+
+    if (sz != sizeof(struct net_relationship_req))
+        return -EINVAL;
+
+    return sched_net_relationship_submit(req);
+#else
+    return 0;
+#endif
+}
+
+const struct bpf_func_proto bpf_sched_net_rship_submit_proto = {
+    .func      = bpf_sched_net_rship_submit,
+    .gpl_only  = false,
+    .ret_type  = RET_INTEGER,
+    .arg1_type = ARG_PTR_TO_MEM,
+    .arg2_type = ARG_CONST_SIZE,
+    .arg3_type = ARG_ANYTHING,
+};
+
 static const struct bpf_func_proto *
 bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 {
@@ -10763,6 +10787,8 @@ bpf_gnet_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
         return &bpf_skb_event_output_proto;
     case BPF_FUNC_sk_fullsock:
         return &bpf_sk_fullsock_proto;
+    case BPF_FUNC_sched_net_rship_submit:
+        return &bpf_sched_net_rship_submit_proto;
     default:
         break;
     }
@@ -10810,6 +10836,42 @@ static u32 bpf_gnet_convert_ctx_access(enum bpf_access_type type,
                       si->dst_reg, si->src_reg,
                       offsetof(struct bpf_gnet_ctx_kern, sk));
         break;
+    case offsetof(struct bpf_gnet_ctx, numa_node):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, numa_node),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, numa_node));
+        break;
+    case offsetof(struct bpf_gnet_ctx, curr_tid):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, curr_tid),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, curr_tid));
+        break;
+    case offsetof(struct bpf_gnet_ctx, peer_tid):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, peer_tid),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, peer_tid));
+        break;
+    case offsetof(struct bpf_gnet_ctx, rxtx_bytes):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rxtx_bytes),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, rxtx_bytes));
+        break;
+    case offsetof(struct bpf_gnet_ctx, rx_dev_idx):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_idx),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, rx_dev_idx));
+        break;
+    case offsetof(struct bpf_gnet_ctx, rx_dev_queue_idx):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern, rx_dev_queue_idx),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, rx_dev_queue_idx));
+        break;
+    case offsetof(struct bpf_gnet_ctx, rx_dev_netns_cookie):
+        *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_gnet_ctx_kern,
+                               rx_dev_netns_cookie),
+                      si->dst_reg, si->src_reg,
+                      offsetof(struct bpf_gnet_ctx_kern, rx_dev_netns_cookie));
+        break;
     }
     return insn - insn_buf;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b290db716392..0a2578100e27 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -71,6 +71,7 @@
 #include <net/mptcp.h>
 #include <net/page_pool.h>
 #include <net/tcp_ext.h>
+#include <net/net_rship.h>
 
 #include <linux/uaccess.h>
 #include <trace/events/skb.h>
@@ -254,6 +255,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
     skb_set_kcov_handle(skb, kcov_common_handle());
 
+    net_rship_skb_init_flags(skb, flags);
 out:
     return skb;
 nodata:
@@ -289,6 +291,7 @@ static struct sk_buff *__build_skb_around(struct sk_buff *skb,
 
     skb_set_kcov_handle(skb, kcov_common_handle());
 
+    net_rship_skb_init(skb);
     return skb;
 }
 
@@ -485,6 +488,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 skb_success:
     skb_reserve(skb, NET_SKB_PAD);
     skb->dev = dev;
+    net_rship_skb_record_dev_rxinfo(skb, dev);
 
 skb_fail:
     return skb;
@@ -549,6 +553,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 skb_success:
     skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
     skb->dev = napi->dev;
+    net_rship_skb_record_dev_rxinfo(skb, napi->dev);
 
 skb_fail:
     return skb;
@@ -996,7 +1001,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 #ifdef CONFIG_NET_SCHED
     CHECK_SKB_FIELD(tc_index);
 #endif
-
+    net_rship_skb_clone(new, (void *)old);
 }
 
 /*
@@ -1476,6 +1481,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
             return NULL;
 
         n->fclone = SKB_FCLONE_UNAVAILABLE;
+        net_rship_skb_init(n);
     }
 
     return __skb_clone(n, skb);
@@ -3428,6 +3434,7 @@ void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len)
         skb_split_inside_header(skb, skb1, len, pos);
     else /* Second chunk has no header, nothing to copy. */
         skb_split_no_header(skb, skb1, len, pos);
+    net_rship_skb_clone(skb1, skb);
 }
 EXPORT_SYMBOL(skb_split);
 
@@ -4438,14 +4445,22 @@ static void skb_extensions_init(void) {}
 void __init skb_init(void)
 {
     skbuff_head_cache = kmem_cache_create_usercopy("skbuff_head_cache",
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+                          sizeof(struct sk_buff_net_rship),
+#else
                           sizeof(struct sk_buff),
+#endif
                           0,
                           SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                           offsetof(struct sk_buff, cb),
                           sizeof_field(struct sk_buff, cb),
                           NULL);
     skbuff_fclone_cache = kmem_cache_create("skbuff_fclone_cache",
+#if IS_ENABLED(CONFIG_SCHED_TASK_RELATIONSHIP)
+                        sizeof(struct sk_buff_fclones_net_rship),
+#else
                         sizeof(struct sk_buff_fclones),
+#endif
                         0,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                         NULL);
diff --git a/net/core/sock.c b/net/core/sock.c
index 8f0b8e1eaadd..da0c980ad238 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -138,6 +138,7 @@
 
 #include <net/tcp.h>
 #include <net/busy_poll.h>
+#include <net/net_rship.h>
 
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
@@ -1676,12 +1677,18 @@ static void sock_copy(struct sock *nsk, const struct sock *osk)
     const struct proto *prot = READ_ONCE(osk->sk_prot);
 #ifdef CONFIG_SECURITY_NETWORK
     void *sptr = nsk->sk_security;
+#endif
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+    void *net_rship = nsk->net_rship;
 #endif
     memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
 
     memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
            prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
 
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+    nsk->net_rship = net_rship;
+#endif
 #ifdef CONFIG_SECURITY_NETWORK
     nsk->sk_security = sptr;
     security_sk_clone(osk, nsk);
@@ -1702,7 +1709,12 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
         if (want_init_on_alloc(priority))
             sk_prot_clear_nulls(sk, prot->obj_size);
     } else
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+        sk = kmalloc(prot->obj_size + sizeof(struct sched_net_rship_sock),
+                 priority);
+#else
         sk = kmalloc(prot->obj_size, priority);
+#endif
 
     if (sk != NULL) {
         if (security_sk_alloc(sk, family, priority))
@@ -1711,6 +1723,9 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
         if (!try_module_get(prot->owner))
             goto out_free_sec;
         sk_tx_queue_clear(sk);
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+        net_rship_sock_init(sk, prot->obj_size);
+#endif
     }
 
     return sk;
@@ -2045,6 +2060,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
     }
     sk->sk_gso_max_segs = max_segs;
     sk_dst_set(sk, dst);
+
+    net_rship_sk_dst_set(sk, dst);
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -3512,7 +3529,11 @@ int proto_register(struct proto *prot, int alloc_slab)
 
     if (alloc_slab) {
         prot->slab = kmem_cache_create_usercopy(prot->name,
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+                    prot->obj_size + sizeof(struct sched_net_rship_sock), 0,
+#else
                     prot->obj_size, 0,
+#endif
                     SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                     prot->slab_flags,
                     prot->useroffset, prot->usersize,
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 0dfe9f255ab3..ea0ee32f27bb 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -21,6 +21,7 @@
 #include <net/net_ratelimit.h>
 #include <net/busy_poll.h>
 #include <net/pkt_sched.h>
+#include <net/net_rship.h>
 
 static int two = 2;
 static int three = 3;
@@ -45,6 +46,12 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
 int sysctl_devconf_inherit_init_net __read_mostly;
 EXPORT_SYMBOL(sysctl_devconf_inherit_init_net);
 
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+unsigned long net_numa_rship_jiffies __read_mostly = HZ / 10; /* 100ms */
+static unsigned long net_numa_rship_ms_min = HZ / 10; /* 100ms */
+static unsigned long net_numa_rship_ms_max = 100 * HZ; /* 100s */
+#endif
+
 #ifdef CONFIG_RPS
 static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
                 void *buffer, size_t *lenp, loff_t *ppos)
@@ -575,6 +582,17 @@ static struct ctl_table net_core_table[] = {
         .proc_handler = proc_dointvec_minmax,
         .extra1       = SYSCTL_ONE,
     },
+#ifdef CONFIG_SCHED_TASK_RELATIONSHIP
+    {
+        .procname     = "numa_rship_ms",
+        .data         = &net_numa_rship_jiffies,
+        .maxlen       = sizeof(unsigned long),
+        .mode         = 0644,
+        .proc_handler = proc_doulongvec_ms_jiffies_minmax,
+        .extra1       = &net_numa_rship_ms_min,
+        .extra2       = &net_numa_rship_ms_max,
+    },
+#endif
     { }
 };
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 830d6b2039f5..0f13dc167730 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -279,6 +279,7 @@
 #include <linux/uaccess.h>
 #include <asm/ioctls.h>
 #include <net/busy_poll.h>
+#include <net/net_rship.h>
 
 DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
 EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
@@ -884,6 +885,7 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
             INIT_LIST_HEAD(&skb->tcp_tsorted_anchor);
             skb_shinfo(skb)->tx_flags = 0;
             memset(TCP_SKB_CB(skb), 0, sizeof(struct tcp_skb_cb));
+            net_rship_skb_clear(skb);
             return skb;
         }
     }
@@ -1321,6 +1323,8 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
             if (!skb)
                 goto wait_for_space;
 
+            net_rship_record_sendmsginfo(skb, sk);
+
             process_backlog++;
             skb->ip_summed = CHECKSUM_PARTIAL;
 
@@ -2367,6 +2371,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
         if (used + offset < skb->len)
             continue;
 
+        net_rship_tcp_recvmsg(sk, skb);
+
         if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
             goto found_fin_ok;
         if (!(flags & MSG_PEEK))
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d9b50a3addee..762f2009d61d 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -39,6 +39,7 @@
 
 #include <net/tcp.h>
 #include <net/mptcp.h>
+#include <net/net_rship.h>
 
 #include <linux/compiler.h>
 #include <linux/gfp.h>
@@ -1196,6 +1197,8 @@ void tcp_wfree(struct sk_buff *skb)
      */
     WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));
 
+    net_rship_send_nic_node(sk, skb);
+
     /* If this softirq is serviced by ksoftirqd, we are likely under stress.
      * Wait until our queues (qdisc + devices) are drained.
      * This gives :
diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c
index d9b2fe1c451a..22be05e8dbb4 100644
--- a/tools/bpf/bpftool/prog.c
+++ b/tools/bpf/bpftool/prog.c
@@ -80,7 +80,10 @@ static const char * const attach_type_strings[] = {
     [BPF_SK_SKB_STREAM_VERDICT] = "stream_verdict",
     [BPF_SK_MSG_VERDICT]        = "msg_verdict",
     [BPF_FLOW_DISSECTOR]        = "flow_dissector",
-    [BPF_GNET_RESERVE0]         = "gnet_reserve0",
+    [BPF_GNET_TCP_RECVMSG]      = "gnet_tcp_recvmsg",
+    [BPF_GNET_SK_DST_SET]       = "gnet_sk_dst_set",
+    [BPF_GNET_RCV_NIC_NODE]     = "gnet_rcv_nic_node",
+    [BPF_GNET_SEND_NIC_NODE]    = "gnet_send_nic_node",
     [__MAX_BPF_ATTACH_TYPE]     = NULL,
 };
 
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index dc493193174f..254b5118921d 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -246,7 +246,10 @@ enum bpf_attach_type {
     BPF_XDP,
 #ifndef __GENKSYMS__
     BPF_SCHED,
-    BPF_GNET_RESERVE0,
+    BPF_GNET_TCP_RECVMSG,
+    BPF_GNET_SK_DST_SET,
+    BPF_GNET_RCV_NIC_NODE,
+    BPF_GNET_SEND_NIC_NODE,
 #endif
     __MAX_BPF_ATTACH_TYPE
 };
@@ -3922,6 +3925,12 @@ union bpf_attr {
  *        get resource statistics of *nid* and store in *ctx*.
  *    Return
  *        0 on success, or a negative error in case of failure.
+ *
+ * int bpf_sched_net_rship_submit(void *buf, size_t sz, u64 flags)
+ *    Description
+ *        update network's relationship to sched subsystem.
+ *    Return
+ *        0 on success, or a negative error in case of failure.
  */
 #define __BPF_FUNC_MAPPER(FN) \
     FN(unspec), \
@@ -4098,6 +4107,7 @@ union bpf_attr {
     FN(get_task_relationship_stats),\
     FN(sched_set_curr_preferred_node),\
     FN(get_node_stats), \
+    FN(sched_net_rship_submit), \
     /* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -4384,6 +4394,10 @@ struct bpf_sock {
     __u32 dst_ip6[4];
     __u32 state;
     __s32 rx_queue_mapping;
+    __s32 sk_send_tid;
+    __s32 sk_peer_tid;
+    __u64 rcv_tid_bytes;
+    __u64 rcv_numa_node_bytes;
 };
 
 struct bpf_tcp_sock {
@@ -5254,6 +5268,13 @@ enum {
 
 struct bpf_gnet_ctx {
     __bpf_md_ptr(struct bpf_sock *, sk);
+    int curr_tid;
+    int peer_tid;
+    int numa_node;
+    __u64 rxtx_bytes;
+    int rx_dev_idx;
+    int rx_dev_queue_idx;
+    __u64 rx_dev_netns_cookie;
 };
 
 #endif /* _UAPI__LINUX_BPF_H__ */
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 1a04ac5395bb..b7f71d2d7d53 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -8494,8 +8494,14 @@ static const struct bpf_sec_def section_defs[] = {
     BPF_PROG_SEC("struct_ops",      BPF_PROG_TYPE_STRUCT_OPS),
     BPF_EAPROG_SEC("sk_lookup/",    BPF_PROG_TYPE_SK_LOOKUP,
                                     BPF_SK_LOOKUP),
-    BPF_EAPROG_SEC("gnet/reserve0", BPF_PROG_TYPE_NET_GLOBAL,
-                                    BPF_GNET_RESERVE0),
+    BPF_EAPROG_SEC("gnet/tcp_recvmsg",  BPF_PROG_TYPE_NET_GLOBAL,
+                                        BPF_GNET_TCP_RECVMSG),
+    BPF_EAPROG_SEC("gnet/sk_dst_set",   BPF_PROG_TYPE_NET_GLOBAL,
+                                        BPF_GNET_SK_DST_SET),
+    BPF_EAPROG_SEC("gnet/rcv_nic_node", BPF_PROG_TYPE_NET_GLOBAL,
+                                        BPF_GNET_RCV_NIC_NODE),
+    BPF_EAPROG_SEC("gnet/send_nic_node", BPF_PROG_TYPE_NET_GLOBAL,
+                                        BPF_GNET_SEND_NIC_NODE),
 };
 
 #undef BPF_PROG_SEC_IMPL