hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK
CVE: NA
-------------------------------------------------
Signed-off-by: Wang Yufen <wangyufen@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Signed-off-by: Lu Wei <luwei32@huawei.com>
Reviewed-by: Wei Yongjun <weiyongjun1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Signed-off-by: Lu Wei <luwei32@huawei.com>
---
 arch/arm64/configs/openeuler_defconfig |   1 +
 arch/x86/configs/openeuler_defconfig   |   1 +
 include/linux/tcp.h                    |   8 +-
 include/net/inet_sock.h                |   3 +-
 include/net/sock.h                     |   1 +
 include/net/tcp.h                      |  40 ++
 net/ipv4/Kconfig                       |  10 +
 net/ipv4/Makefile                      |   1 +
 net/ipv4/syncookies.c                  |   2 +
 net/ipv4/sysctl_net_ipv4.c             |  42 ++
 net/ipv4/tcp.c                         |   5 +
 net/ipv4/tcp_comp.c                    | 912 +++++++++++++++++++++++++
 net/ipv4/tcp_input.c                   |  26 +
 net/ipv4/tcp_ipv4.c                    |   2 +
 net/ipv4/tcp_minisocks.c               |   3 +
 net/ipv4/tcp_output.c                  |  52 ++
 net/ipv6/syncookies.c                  |   2 +
 17 files changed, 1109 insertions(+), 2 deletions(-)
 create mode 100644 net/ipv4/tcp_comp.c
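[ Reviewer's note, not part of the original commit message: when the peer's
  port is listed in net.ipv4.tcp_compression_ports, this patch offers a TCP
  experimental option on the SYN (and echoes it on the SYN-ACK when both
  sides agree), then swaps sk->sk_prot to tcp_prot_override so that
  sendmsg/recvmsg compress and decompress the payload with zstd, with a
  stream parser reassembling compressed records on the receive side. The
  sketch below is user-space C, not kernel code; the only assumption beyond
  this patch is TCPOPT_EXP == 254 as in mainline. It shows the exact option
  word comp_options_write() emits. ]

	/* SYN/SYN-ACK option negotiated by this patch: kind 254 (TCPOPT_EXP),
	 * length 4 (TCPOLEN_EXP_COMP_BASE), 16-bit ExID 0x7954
	 * (TCPOPT_COMP_MAGIC) -- an RFC 6994-style experimental option.
	 */
	#include <arpa/inet.h>
	#include <assert.h>
	#include <string.h>

	int main(void)
	{
		const unsigned char wire[4] = { 0xfe, 0x04, 0x79, 0x54 };
		unsigned int word = htonl((254u << 24) | (4u << 16) | 0x7954u);

		assert(!memcmp(&word, wire, sizeof(wire)));
		return 0;
	}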
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig
index 33ba39711884..abc33d2c29a6 100644
--- a/arch/arm64/configs/openeuler_defconfig
+++ b/arch/arm64/configs/openeuler_defconfig
@@ -1222,6 +1222,7 @@ CONFIG_DEFAULT_CUBIC=y
 # CONFIG_DEFAULT_RENO is not set
 CONFIG_DEFAULT_TCP_CONG="cubic"
 CONFIG_TCP_MD5SIG=y
+CONFIG_TCP_COMP=y
 CONFIG_IPV6=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig
index 44040b835333..177a6fdcce58 100644
--- a/arch/x86/configs/openeuler_defconfig
+++ b/arch/x86/configs/openeuler_defconfig
@@ -1244,6 +1244,7 @@ CONFIG_DEFAULT_CUBIC=y
 # CONFIG_DEFAULT_RENO is not set
 CONFIG_DEFAULT_TCP_CONG="cubic"
 CONFIG_TCP_MD5SIG=y
+CONFIG_TCP_COMP=y
 CONFIG_IPV6=y
 CONFIG_IPV6_ROUTER_PREF=y
 CONFIG_IPV6_ROUTE_INFO=y
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3c5efeeb024f..d94a84c291f0 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -123,7 +123,10 @@ struct tcp_options_received {
 		snd_wscale : 4,	/* Window scaling received from sender	*/
 		rcv_wscale : 4;	/* Window scaling to send to receiver	*/
 	u8	saw_unknown:1,	/* Received unknown option		*/
-		unused:7;
+#if IS_ENABLED(CONFIG_TCP_COMP)
+		comp_ok:1,	/* COMP seen on SYN packet		*/
+#endif
+		unused:6;
 	u8	num_sacks;	/* Number of SACK blocks		*/
 	u16	user_mss;	/* mss requested by user in ioctl	*/
 	u16	mss_clamp;	/* Maximal mss, negotiated at connection setup */
@@ -136,6 +139,9 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt)
 #if IS_ENABLED(CONFIG_SMC)
 	rx_opt->smc_ok = 0;
 #endif
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	rx_opt->comp_ok = 0;
+#endif
 }
 /* This is the max number of SACKS that we'll generate and process. It's safe
diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
index 2de0e4d4a027..2b6da8a6282d 100644
--- a/include/net/inet_sock.h
+++ b/include/net/inet_sock.h
@@ -87,7 +87,8 @@ struct inet_request_sock {
 				ecn_ok     : 1,
 				acked      : 1,
 				no_srccheck: 1,
-				smc_ok     : 1;
+				smc_ok     : 1,
+				comp_ok    : 1;
 	u32                     ir_mark;
 	union {
 		struct ip_options_rcu __rcu	*ireq_opt;
diff --git a/include/net/sock.h b/include/net/sock.h
index 7753354d59c0..c86845136ec5 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -959,6 +959,7 @@ enum sock_flags {
 	SOCK_XDP, /* XDP is attached */
 	SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */
 	SOCK_RCVMARK, /* Receive SO_MARK ancillary data with packet */
+	SOCK_COMP, /* TCP payload compression negotiated */
 };
 
 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
diff --git a/include/net/tcp.h b/include/net/tcp.h
index e9d387fffe22..cb33a2c46b2f 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -195,6 +195,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
  */
 #define TCPOPT_FASTOPEN_MAGIC	0xF989
 #define TCPOPT_SMC_MAGIC	0xE2D4C3D9
+#define TCPOPT_COMP_MAGIC	0x7954
 
 /*
  * TCP option lengths
@@ -208,6 +209,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
 #define TCPOLEN_FASTOPEN_BASE  2
 #define TCPOLEN_EXP_FASTOPEN_BASE  4
 #define TCPOLEN_EXP_SMC_BASE   6
+#define TCPOLEN_EXP_COMP_BASE  4
 
 /* But this is what stacks really send out. */
 #define TCPOLEN_TSTAMP_ALIGNED	12
@@ -2531,4 +2533,42 @@ static inline u64 tcp_transmit_time(const struct sock *sk)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_TCP_COMP)
+extern struct static_key_false tcp_have_comp;
+
+extern unsigned long *sysctl_tcp_compression_ports;
+extern int sysctl_tcp_compression_local;
+
+bool tcp_syn_comp_enabled(struct sock *sk);
+bool tcp_synack_comp_enabled(struct sock *sk,
+			     const struct inet_request_sock *ireq);
+void tcp_init_compression(struct sock *sk);
+void tcp_cleanup_compression(struct sock *sk);
+int tcp_comp_init(void);
+#else
+static inline bool tcp_syn_comp_enabled(struct sock *sk)
+{
+	return false;
+}
+
+static inline bool tcp_synack_comp_enabled(struct sock *sk,
+					   const struct inet_request_sock *ireq)
+{
+	return false;
+}
+
+static inline void tcp_init_compression(struct sock *sk)
+{
+}
+
+static inline void tcp_cleanup_compression(struct sock *sk)
+{
+}
+
+static inline int tcp_comp_init(void)
+{
+	return 0;
+}
+#endif
+
 #endif	/* _TCP_H */
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 2dfb12230f08..a4405ea38338 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -751,3 +751,13 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
+
+config TCP_COMP
+	bool "TCP: Transport Layer Compression support"
+	depends on CRYPTO_ZSTD=y
+	select STREAM_PARSER
+	help
+	  Enable kernel payload compression support for the TCP protocol.
+	  This allows TCP payload compression to be handled in-kernel.
+
+	  If unsure, say Y.
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index b18ba8ef93ad..aa458d9f534a 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -65,6 +65,7 @@ obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_NET_SOCK_MSG) += tcp_bpf.o
 obj-$(CONFIG_BPF_SYSCALL) += udp_bpf.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
+obj-$(CONFIG_TCP_COMP) += tcp_comp.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
 		      xfrm4_output.o xfrm4_protocol.o
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 3b4dafefb4b0..42c577bfc26b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -390,6 +390,8 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
 
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
+	if (IS_ENABLED(CONFIG_TCP_COMP))
+		ireq->comp_ok = 0;
 
 	ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
 
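[ Reviewer's note: as with smc_ok just above, comp_ok must be cleared on the
  syncookie path — a cookie-validated connection reconstructs the request
  sock without the original SYN's option state, so compression cannot be
  assumed to have been negotiated. The IPv6 syncookie code at the end of the
  patch gets the same treatment. ]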
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index b17eb28a9690..f212133b05e9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -469,6 +469,30 @@ static int proc_fib_multipath_hash_fields(struct ctl_table *table, int write,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_TCP_COMP)
+static int proc_tcp_compression_ports(struct ctl_table *table, int write,
+				      void *buffer, size_t *lenp,
+				      loff_t *ppos)
+{
+	unsigned long *bitmap = *(unsigned long **)table->data;
+	unsigned long bitmap_len = table->maxlen;
+	int ret;
+
+	ret = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
+	if (write && ret == 0) {
+		if (bitmap_empty(bitmap, bitmap_len)) {
+			if (static_key_enabled(&tcp_have_comp))
+				static_branch_disable(&tcp_have_comp);
+		} else {
+			if (!static_key_enabled(&tcp_have_comp))
+				static_branch_enable(&tcp_have_comp);
+		}
+	}
+
+	return ret;
+}
+#endif
+
 static struct ctl_table ipv4_table[] = {
 	{
 		.procname	= "tcp_max_orphans",
@@ -587,6 +611,24 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	{
+		.procname	= "tcp_compression_ports",
+		.data		= &sysctl_tcp_compression_ports,
+		.maxlen		= 65536,
+		.mode		= 0644,
+		.proc_handler	= proc_tcp_compression_ports,
+	},
+	{
+		.procname	= "tcp_compression_local",
+		.data		= &sysctl_tcp_compression_local,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+#endif
 	{ }
 };
 
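[ Reviewer's note: a minimal user-space sketch for exercising the two new
  knobs (assumes the patch is applied and CONFIG_TCP_COMP=y; the port-list
  syntax is what proc_do_large_bitmap() accepts, i.e. comma-separated ports
  and from-to ranges). Writing a non-empty port list also flips the
  tcp_have_comp static key on, which gates all of the fast-path hooks;
  tcp_compression_local=1 additionally allows compression when both
  endpoints are local or loopback, which tcp_comp_enabled() otherwise
  refuses. ]

	#include <stdio.h>

	/* hypothetical helper, not part of the patch */
	static int write_sysctl(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);
	}

	int main(void)
	{
		/* compress connections using ports 9000-9010 */
		write_sysctl("/proc/sys/net/ipv4/tcp_compression_ports",
			     "9000-9010");
		/* also compress local/loopback traffic */
		write_sysctl("/proc/sys/net/ipv4/tcp_compression_local", "1");
		return 0;
	}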
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 3d3a24f79573..2703be8a7316 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -302,6 +302,10 @@ DEFINE_STATIC_KEY_FALSE(tcp_have_smc);
 EXPORT_SYMBOL(tcp_have_smc);
 #endif
 
+#if IS_ENABLED(CONFIG_TCP_COMP)
+DEFINE_STATIC_KEY_FALSE(tcp_have_comp);
+#endif
+
 /*
  * Current number of TCP sockets.
  */
@@ -4707,5 +4711,6 @@ void __init tcp_init(void)
 	tcp_metrics_init();
 	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
 	tcp_tasklet_init();
+	tcp_comp_init();
 	mptcp_init();
 }
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c
new file mode 100644
index 000000000000..53157a413b58
--- /dev/null
+++ b/net/ipv4/tcp_comp.c
@@ -0,0 +1,912 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * TCP compression support
+ *
+ * Copyright(c) 2021 Huawei Technologies Co., Ltd
+ */
+
+#include <linux/skmsg.h>
+#include <linux/zstd.h>
+
+#define TCP_COMP_MAX_PADDING	64
+#define TCP_COMP_DATA_SIZE	65536
+#define TCP_COMP_SCRATCH_SIZE	(TCP_COMP_DATA_SIZE - 1)
+#define TCP_COMP_MAX_CSIZE	(TCP_COMP_SCRATCH_SIZE + TCP_COMP_MAX_PADDING)
+#define TCP_COMP_ALLOC_ORDER	get_order(TCP_COMP_DATA_SIZE)
+#define TCP_COMP_MAX_WINDOWLOG	17
+#define TCP_COMP_MAX_INPUT	(1 << TCP_COMP_MAX_WINDOWLOG)
+
+#define TCP_COMP_SEND_PENDING	1
+#define ZSTD_COMP_DEFAULT_LEVEL	1
+
+static unsigned long tcp_compression_ports[65536 / 8];
+
+unsigned long *sysctl_tcp_compression_ports = tcp_compression_ports;
+int sysctl_tcp_compression_local __read_mostly;
+
+static struct proto tcp_prot_override;
+
+struct tcp_comp_context_tx {
+	ZSTD_CStream *cstream;
+	void *cworkspace;
+	void *plaintext_data;
+	void *compressed_data;
+	struct sk_msg msg;
+	bool in_tcp_sendpages;
+};
+
+struct tcp_comp_context_rx {
+	ZSTD_DStream *dstream;
+	void *dworkspace;
+	void *plaintext_data;
+
+	struct strparser strp;
+	void (*saved_data_ready)(struct sock *sk);
+	struct sk_buff *pkt;
+	struct sk_buff *dpkt;
+};
+
+struct tcp_comp_context {
+	struct rcu_head rcu;
+
+	struct proto *sk_proto;
+	void (*sk_write_space)(struct sock *sk);
+
+	struct tcp_comp_context_tx tx;
+	struct tcp_comp_context_rx rx;
+
+	unsigned long flags;
+};
+
+static bool tcp_comp_is_write_pending(struct tcp_comp_context *ctx)
+{
+	return test_bit(TCP_COMP_SEND_PENDING, &ctx->flags);
+}
+
+static void tcp_comp_err_abort(struct sock *sk, int err)
+{
+	sk->sk_err = err;
+	sk->sk_error_report(sk);
+}
+
+static bool tcp_comp_enabled(__be32 saddr, __be32 daddr, int port)
+{
+	if (!sysctl_tcp_compression_local &&
+	    (saddr == daddr || ipv4_is_loopback(daddr)))
+		return false;
+
+	return test_bit(port, sysctl_tcp_compression_ports);
+}
+
+bool tcp_syn_comp_enabled(struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	return tcp_comp_enabled(inet->inet_saddr, inet->inet_daddr,
+				ntohs(inet->inet_dport));
+}
+
+bool tcp_synack_comp_enabled(struct sock *sk,
+			     const struct inet_request_sock *ireq)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	if (!ireq->comp_ok)
+		return false;
+
+	return tcp_comp_enabled(ireq->ir_loc_addr, ireq->ir_rmt_addr,
+				ntohs(inet->inet_sport));
+}
+
+static struct tcp_comp_context *comp_get_ctx(const struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+
+	return (__force void *)icsk->icsk_ulp_data;
+}
+
+static int tcp_comp_tx_context_init(struct tcp_comp_context *ctx)
+{
+	ZSTD_parameters params;
+	int csize;
+
+	params = ZSTD_getParams(ZSTD_COMP_DEFAULT_LEVEL, PAGE_SIZE, 0);
+	csize = zstd_cstream_workspace_bound(&params.cParams);
+	if (csize <= 0)
+		return -EINVAL;
+
+	ctx->tx.cworkspace = kmalloc(csize, GFP_KERNEL);
+	if (!ctx->tx.cworkspace)
+		return -ENOMEM;
+
+	ctx->tx.cstream = zstd_init_cstream(&params, 0, ctx->tx.cworkspace,
+					    csize);
+	if (!ctx->tx.cstream)
+		goto err_cstream;
+
+	ctx->tx.plaintext_data = kvmalloc(TCP_COMP_SCRATCH_SIZE, GFP_KERNEL);
+	if (!ctx->tx.plaintext_data)
+		goto err_cstream;
+
+	ctx->tx.compressed_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL);
+	if (!ctx->tx.compressed_data)
+		goto err_compressed;
+
+	return 0;
+
+err_compressed:
+	kvfree(ctx->tx.plaintext_data);
+	ctx->tx.plaintext_data = NULL;
+err_cstream:
+	kfree(ctx->tx.cworkspace);
+	ctx->tx.cworkspace = NULL;
+
+	return -ENOMEM;
+}
+
+static void *tcp_comp_get_tx_stream(struct sock *sk)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (!ctx->tx.plaintext_data)
+		tcp_comp_tx_context_init(ctx);
+
+	return ctx->tx.plaintext_data;
+}
+
+static int alloc_compressed_msg(struct sock *sk, int len)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct sk_msg *msg = &ctx->tx.msg;
+
+	sk_msg_init(msg);
+
+	return sk_msg_alloc(sk, msg, len, 0);
+}
+
+static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, int copy)
+{
+	void *dest;
+	int rc;
+
+	dest = tcp_comp_get_tx_stream(sk);
+	if (!dest)
+		return -ENOSPC;
+
+	if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY)
+		rc = copy_from_iter_nocache(dest, copy, from);
+	else
+		rc = copy_from_iter(dest, copy, from);
+
+	if (rc != copy)
+		rc = -EFAULT;
+
+	return rc;
+}
+
+static int memcopy_to_msg(struct sock *sk, int bytes)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct sk_msg *msg = &ctx->tx.msg;
+	int i = msg->sg.curr;
+	struct scatterlist *sge;
+	u32 copy, buf_size;
+	void *from, *to;
+
+	from = ctx->tx.compressed_data;
+	do {
+		sge = sk_msg_elem(msg, i);
+		/* This is possible if a trim operation shrunk the buffer */
+		if (msg->sg.copybreak >= sge->length) {
+			msg->sg.copybreak = 0;
+			sk_msg_iter_var_next(i);
+			if (i == msg->sg.end)
+				break;
+			sge = sk_msg_elem(msg, i);
+		}
+		buf_size = sge->length - msg->sg.copybreak;
+		copy = (buf_size > bytes) ? bytes : buf_size;
+		to = sg_virt(sge) + msg->sg.copybreak;
+		msg->sg.copybreak += copy;
+		memcpy(to, from, copy);
+		bytes -= copy;
+		from += copy;
+		if (!bytes)
+			break;
+		msg->sg.copybreak = 0;
+		sk_msg_iter_var_next(i);
+	} while (i != msg->sg.end);
+
+	msg->sg.curr = i;
+	return bytes;
+}
+
+static int tcp_comp_compress_to_msg(struct sock *sk, int bytes)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	ZSTD_outBuffer outbuf;
+	ZSTD_inBuffer inbuf;
+	size_t ret;
+
+	inbuf.src = ctx->tx.plaintext_data;
+	outbuf.dst = ctx->tx.compressed_data;
+	inbuf.size = bytes;
+	outbuf.size = TCP_COMP_MAX_CSIZE;
+	inbuf.pos = 0;
+	outbuf.pos = 0;
+
+	ret = ZSTD_compressStream(ctx->tx.cstream, &outbuf, &inbuf);
+	if (ZSTD_isError(ret))
+		return -EIO;
+
+	ret = ZSTD_flushStream(ctx->tx.cstream, &outbuf);
+	if (ZSTD_isError(ret))
+		return -EIO;
+
+	if (inbuf.pos != inbuf.size)
+		return -EIO;
+
+	if (memcopy_to_msg(sk, outbuf.pos))
+		return -EIO;
+
+	sk_msg_trim(sk, &ctx->tx.msg, outbuf.pos);
+
+	return 0;
+}
+
+static int tcp_comp_push_msg(struct sock *sk, struct sk_msg *msg, int flags)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct msghdr mh;
+	struct bio_vec bvec;
+	struct scatterlist *sg;
+	int ret, offset;
+	struct page *p;
+	size_t size;
+
+	ctx->tx.in_tcp_sendpages = true;
+	while (1) {
+		sg = sk_msg_elem(msg, msg->sg.start);
+		offset = sg->offset;
+		size = sg->length;
+		p = sg_page(sg);
+retry:
+		memset(&mh, 0, sizeof(struct msghdr));
+		memset(&bvec, 0, sizeof(struct bio_vec));
+
+		mh.msg_flags = flags | MSG_SPLICE_PAGES;
+		bvec_set_page(&bvec, p, size, offset);
+		iov_iter_bvec(&mh.msg_iter, ITER_SOURCE, &bvec, 1, size);
+
+		ret = tcp_sendmsg_locked(sk, &mh, size);
+		if (ret != size) {
+			if (ret > 0) {
+				sk_mem_uncharge(sk, ret);
+				sg->offset += ret;
+				sg->length -= ret;
+				size -= ret;
+				offset += ret;
+				goto retry;
+			}
+			ctx->tx.in_tcp_sendpages = false;
+			return ret;
+		}
+
+		sk_mem_uncharge(sk, ret);
+		msg->sg.size -= size;
+		put_page(p);
+		sk_msg_iter_next(msg, start);
+		if (msg->sg.start == msg->sg.end)
+			break;
+	}
+
+	clear_bit(TCP_COMP_SEND_PENDING, &ctx->flags);
+	ctx->tx.in_tcp_sendpages = false;
+
+	return 0;
+}
+
+static int tcp_comp_push(struct sock *sk, int bytes, int flags)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	int ret;
+
+	ret = tcp_comp_compress_to_msg(sk, bytes);
+	if (ret < 0) {
+		pr_debug("%s: failed to compress sg\n", __func__);
+		return ret;
+	}
+
+	set_bit(TCP_COMP_SEND_PENDING, &ctx->flags);
+
+	ret = tcp_comp_push_msg(sk, &ctx->tx.msg, flags);
+	if (ret) {
+		pr_debug("%s: failed to tcp_comp_push_sg\n", __func__);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int wait_on_pending_writer(struct sock *sk, long *timeo)
+{
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+	int ret = 0;
+
+	add_wait_queue(sk_sleep(sk), &wait);
+	while (1) {
+		if (!*timeo) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = sock_intr_errno(*timeo);
+			break;
+		}
+
+		if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait))
+			break;
+	}
+	remove_wait_queue(sk_sleep(sk), &wait);
+
+	return ret;
+}
+
+static int tcp_comp_push_pending_msg(struct sock *sk, int flags)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct sk_msg *msg = &ctx->tx.msg;
+
+	if (msg->sg.start == msg->sg.end)
+		return 0;
+
+	return tcp_comp_push_msg(sk, msg, flags);
+}
+
+static int tcp_comp_complete_pending_work(struct sock *sk, int flags,
+					  long *timeo)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	int ret = 0;
+
+	if (unlikely(sk->sk_write_pending))
+		ret = wait_on_pending_writer(sk, timeo);
+
+	if (!ret && tcp_comp_is_write_pending(ctx))
+		ret = tcp_comp_push_pending_msg(sk, flags);
+
+	return ret;
+}
+
+static int tcp_comp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	int copied = 0, err = 0;
+	size_t try_to_copy;
+	int required_size;
+	long timeo;
+
+	lock_sock(sk);
+
+	timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+
+	err = tcp_comp_complete_pending_work(sk, msg->msg_flags, &timeo);
+	if (err)
+		goto out_err;
+
+	while (msg_data_left(msg)) {
+		if (sk->sk_err) {
+			err = -sk->sk_err;
+			goto out_err;
+		}
+
+		try_to_copy = msg_data_left(msg);
+		if (try_to_copy > TCP_COMP_SCRATCH_SIZE)
+			try_to_copy = TCP_COMP_SCRATCH_SIZE;
+		required_size = try_to_copy + TCP_COMP_MAX_PADDING;
+
+		if (!sk_stream_memory_free(sk))
+			goto wait_for_sndbuf;
+
+alloc_compressed:
+		err = alloc_compressed_msg(sk, required_size);
+		if (err) {
+			if (err != -ENOSPC)
+				goto wait_for_memory;
+			goto out_err;
+		}
+
+		err = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy);
+		if (err < 0)
+			goto out_err;
+
+		copied += try_to_copy;
+
+		err = tcp_comp_push(sk, try_to_copy, msg->msg_flags);
+		if (err < 0) {
+			if (err == -ENOMEM)
+				goto wait_for_memory;
+			if (err != -EAGAIN)
+				tcp_comp_err_abort(sk, EBADMSG);
+			goto out_err;
+		}
+
+		continue;
+wait_for_sndbuf:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+wait_for_memory:
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err)
+			goto out_err;
+		if (ctx->tx.msg.sg.size < required_size)
+			goto alloc_compressed;
+	}
+
+out_err:
+	err = sk_stream_error(sk, msg->msg_flags, err);
+
+	release_sock(sk);
+
+	return copied ? copied : err;
+}
+
+static struct sk_buff *comp_wait_data(struct sock *sk, int flags,
+				      long timeo, int *err)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct sk_buff *skb;
+	DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+	while (!(skb = ctx->rx.pkt)) {
+		if (sk->sk_err) {
+			*err = sock_error(sk);
+			return NULL;
+		}
+
+		if (!skb_queue_empty(&sk->sk_receive_queue)) {
+			__strp_unpause(&ctx->rx.strp);
+			if (ctx->rx.pkt)
+				return ctx->rx.pkt;
+		}
+
+		if (sk->sk_shutdown & RCV_SHUTDOWN)
+			return NULL;
+
+		if (sock_flag(sk, SOCK_DONE))
+			return NULL;
+
+		if ((flags & MSG_DONTWAIT) || !timeo) {
+			*err = -EAGAIN;
+			return NULL;
+		}
+
+		add_wait_queue(sk_sleep(sk), &wait);
+		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		sk_wait_event(sk, &timeo, ctx->rx.pkt != skb, &wait);
+		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
+		remove_wait_queue(sk_sleep(sk), &wait);
+
+		/* Handle signals */
+		if (signal_pending(current)) {
+			*err = sock_intr_errno(timeo);
+			return NULL;
+		}
+	}
+
+	return skb;
+}
+
+static bool comp_advance_skb(struct sock *sk, struct sk_buff *skb,
+			     unsigned int len)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct strp_msg *rxm = strp_msg(skb);
+
+	if (len < rxm->full_len) {
+		rxm->offset += len;
+		rxm->full_len -= len;
+		return false;
+	}
+
+	/* Finished with message */
+	ctx->rx.pkt = NULL;
+	kfree_skb(skb);
+	__strp_unpause(&ctx->rx.strp);
+
+	return true;
+}
+
+static bool comp_advance_dskb(struct sock *sk, struct sk_buff *skb,
+			      unsigned int len)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct strp_msg *rxm = strp_msg(skb);
+
+	if (len < rxm->full_len) {
+		rxm->offset += len;
+		rxm->full_len -= len;
+		return false;
+	}
+
+	/* Finished with message */
+	ctx->rx.dpkt = NULL;
+	kfree_skb(skb);
+	return true;
+}
+
+static int tcp_comp_rx_context_init(struct tcp_comp_context *ctx)
+{
+	int dsize;
+
+	dsize = zstd_dstream_workspace_bound(TCP_COMP_MAX_INPUT);
+	if (dsize <= 0)
+		return -EINVAL;
+
+	ctx->rx.dworkspace = kmalloc(dsize, GFP_KERNEL);
+	if (!ctx->rx.dworkspace)
+		return -ENOMEM;
+
+	ctx->rx.dstream = zstd_init_dstream(TCP_COMP_MAX_INPUT,
+					    ctx->rx.dworkspace, dsize);
+	if (!ctx->rx.dstream)
+		goto err_dstream;
+
+	ctx->rx.plaintext_data = kvmalloc(TCP_COMP_MAX_CSIZE * 32, GFP_KERNEL);
+	if (!ctx->rx.plaintext_data)
+		goto err_dstream;
+
+	return 0;
+
+err_dstream:
+	kfree(ctx->rx.dworkspace);
+	ctx->rx.dworkspace = NULL;
+
+	return -ENOMEM;
+}
+
+static void *tcp_comp_get_rx_stream(struct sock *sk)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (!ctx->rx.plaintext_data)
+		tcp_comp_rx_context_init(ctx);
+
+	return ctx->rx.plaintext_data;
+}
+
+static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb, int flags)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct strp_msg *rxm = strp_msg(skb);
+	size_t ret, compressed_len = 0;
+	int nr_frags_over = 0;
+	ZSTD_outBuffer outbuf;
+	ZSTD_inBuffer inbuf;
+	struct sk_buff *nskb;
+	int len, plen;
+	void *to;
+
+	to = tcp_comp_get_rx_stream(sk);
+	if (!to)
+		return -ENOSPC;
+
+	if (skb_linearize_cow(skb))
+		return -ENOMEM;
+
+	nskb = skb_copy(skb, GFP_KERNEL);
+	if (!nskb)
+		return -ENOMEM;
+
+	while (compressed_len < (skb->len - rxm->offset)) {
+		if (skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)
+			break;
+
+		len = 0;
+		plen = skb->len - rxm->offset - compressed_len;
+		if (plen > TCP_COMP_MAX_CSIZE)
+			plen = TCP_COMP_MAX_CSIZE;
+
+		inbuf.src = (char *)skb->data + rxm->offset + compressed_len;
+		inbuf.pos = 0;
+		inbuf.size = plen;
+
+		outbuf.dst = ctx->rx.plaintext_data;
+		outbuf.pos = 0;
+		outbuf.size = MAX_SKB_FRAGS * TCP_COMP_DATA_SIZE;
+		outbuf.size -= skb_shinfo(nskb)->nr_frags * TCP_COMP_DATA_SIZE;
+
+		ret = ZSTD_decompressStream(ctx->rx.dstream, &outbuf, &inbuf);
+		if (ZSTD_isError(ret)) {
+			kfree_skb(nskb);
+			return -EIO;
+		}
+
+		if (!compressed_len) {
+			len = outbuf.pos - skb->len;
+			if (len > skb_tailroom(nskb))
+				len = skb_tailroom(nskb);
+
+			__skb_put(nskb, len);
+
+			len += skb->len;
+			skb_copy_to_linear_data(nskb, to, len);
+		}
+
+		while ((to += len, outbuf.pos -= len) > 0) {
+			struct page *pages;
+			skb_frag_t *frag;
+
+			if (skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS) {
+				nr_frags_over = 1;
+				break;
+			}
+
+			frag = skb_shinfo(nskb)->frags +
+			       skb_shinfo(nskb)->nr_frags;
+			pages = alloc_pages(__GFP_NOWARN | GFP_KERNEL | __GFP_COMP,
+					    TCP_COMP_ALLOC_ORDER);
+			if (!pages) {
+				kfree_skb(nskb);
+				return -ENOMEM;
+			}
+
+			frag->bv_page = pages;
+			len = PAGE_SIZE << TCP_COMP_ALLOC_ORDER;
+			if (outbuf.pos < len)
+				len = outbuf.pos;
+
+			frag->bv_offset = 0;
+			skb_frag_size_set(frag, len);
+			memcpy(skb_frag_address(frag), to, len);
+
+			nskb->truesize += len;
+			nskb->data_len += len;
+			nskb->len += len;
+			skb_shinfo(nskb)->nr_frags++;
+		}
+
+		if (nr_frags_over)
+			break;
+
+		compressed_len += inbuf.pos;
+	}
+
+	ctx->rx.dpkt = nskb;
+	rxm = strp_msg(nskb);
+	rxm->full_len = nskb->len;
+	rxm->offset = 0;
+	comp_advance_skb(sk, skb, compressed_len);
+
+	return 0;
+}
+
+static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+			    int flags, int *addr_len)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+	struct strp_msg *rxm;
+	struct sk_buff *skb;
+	ssize_t copied = 0;
+	int target, err = 0;
+	long timeo;
+
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR);
+
+	lock_sock(sk);
+
+	target = sock_rcvlowat(sk, flags & MSG_WAITALL, len);
+	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+	do {
+		int chunk = 0;
+
+		if (!ctx->rx.dpkt) {
+			skb = comp_wait_data(sk, flags, timeo, &err);
+			if (!skb)
+				goto recv_end;
+
+			err = tcp_comp_decompress(sk, skb, flags);
+			if (err < 0)
+				goto recv_end;
+		}
+
+		skb = ctx->rx.dpkt;
+		rxm = strp_msg(skb);
+		chunk = min_t(unsigned int, rxm->full_len, len);
+		err = skb_copy_datagram_msg(skb, rxm->offset, msg,
+					    chunk);
+		if (err < 0)
+			goto recv_end;
+
+		copied += chunk;
+		len -= chunk;
+		if (likely(!(flags & MSG_PEEK)))
+			comp_advance_dskb(sk, skb, chunk);
+		else
+			break;
+
+		if (copied >= target && !ctx->rx.dpkt)
+			break;
+	} while (len > 0);
+
+recv_end:
+	release_sock(sk);
+	return copied ? : err;
+}
+
+bool comp_stream_read(struct sock *sk)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (!ctx)
+		return false;
+
+	if (ctx->rx.pkt || ctx->rx.dpkt)
+		return true;
+
+	return false;
+}
+
+static void comp_data_ready(struct sock *sk)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	strp_data_ready(&ctx->rx.strp);
+}
+
+static void comp_queue(struct strparser *strp, struct sk_buff *skb)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(strp->sk);
+
+	ctx->rx.pkt = skb;
+	strp_pause(strp);
+	ctx->rx.saved_data_ready(strp->sk);
+}
+
+static int comp_read_size(struct strparser *strp, struct sk_buff *skb)
+{
+	struct strp_msg *rxm = strp_msg(skb);
+
+	if (rxm->offset > skb->len)
+		return 0;
+
+	return skb->len - rxm->offset;
+}
+
+void comp_setup_strp(struct sock *sk, struct tcp_comp_context *ctx)
+{
+	struct strp_callbacks cb;
+
+	memset(&cb, 0, sizeof(cb));
+	cb.rcv_msg = comp_queue;
+	cb.parse_msg = comp_read_size;
+	strp_init(&ctx->rx.strp, sk, &cb);
+
+	write_lock_bh(&sk->sk_callback_lock);
+	ctx->rx.saved_data_ready = sk->sk_data_ready;
+	sk->sk_data_ready = comp_data_ready;
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	strp_check_rcv(&ctx->rx.strp);
+}
+
+static void tcp_comp_write_space(struct sock *sk)
+{
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (ctx->tx.in_tcp_sendpages) {
+		ctx->sk_write_space(sk);
+		return;
+	}
+
+	if (!sk->sk_write_pending && tcp_comp_is_write_pending(ctx)) {
+		gfp_t sk_allocation = sk->sk_allocation;
+		int rc;
+
+		sk->sk_allocation = GFP_ATOMIC;
+		rc = tcp_comp_push_pending_msg(sk, MSG_DONTWAIT | MSG_NOSIGNAL);
+		sk->sk_allocation = sk_allocation;
+
+		if (rc < 0)
+			return;
+	}
+
+	ctx->sk_write_space(sk);
+}
+
+void tcp_init_compression(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_comp_context *ctx = NULL;
+	struct sk_msg *msg = NULL;
+	struct tcp_sock *tp = tcp_sk(sk);
+
+	if (!tp->rx_opt.comp_ok)
+		return;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_ATOMIC);
+	if (!ctx)
+		return;
+
+	msg = &ctx->tx.msg;
+	sk_msg_init(msg);
+
+	ctx->sk_write_space = sk->sk_write_space;
+	ctx->sk_proto = sk->sk_prot;
+	WRITE_ONCE(sk->sk_prot, &tcp_prot_override);
+	sk->sk_write_space = tcp_comp_write_space;
+
+	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
+
+	sock_set_flag(sk, SOCK_COMP);
+	comp_setup_strp(sk, ctx);
+}
+
+static void tcp_comp_context_tx_free(struct tcp_comp_context *ctx)
+{
+	kfree(ctx->tx.cworkspace);
+	ctx->tx.cworkspace = NULL;
+
+	kvfree(ctx->tx.plaintext_data);
+	ctx->tx.plaintext_data = NULL;
+
+	kvfree(ctx->tx.compressed_data);
+	ctx->tx.compressed_data = NULL;
+}
+
+static void tcp_comp_context_rx_free(struct tcp_comp_context *ctx)
+{
+	kfree(ctx->rx.dworkspace);
+	ctx->rx.dworkspace = NULL;
+
+	kvfree(ctx->rx.plaintext_data);
+	ctx->rx.plaintext_data = NULL;
+}
+
+static void tcp_comp_context_free(struct rcu_head *head)
+{
+	struct tcp_comp_context *ctx;
+
+	ctx = container_of(head, struct tcp_comp_context, rcu);
+
+	tcp_comp_context_tx_free(ctx);
+	tcp_comp_context_rx_free(ctx);
+	strp_done(&ctx->rx.strp);
+	kfree(ctx);
+}
+
+void tcp_cleanup_compression(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_comp_context *ctx = comp_get_ctx(sk);
+
+	if (!ctx || !sock_flag(sk, SOCK_COMP))
+		return;
+
+	if (ctx->rx.pkt) {
+		kfree_skb(ctx->rx.pkt);
+		ctx->rx.pkt = NULL;
+	}
+
+	if (ctx->rx.dpkt) {
+		kfree_skb(ctx->rx.dpkt);
+		ctx->rx.dpkt = NULL;
+	}
+	strp_stop(&ctx->rx.strp);
+
+	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
+	call_rcu(&ctx->rcu, tcp_comp_context_free);
+}
+
+int tcp_comp_init(void)
+{
+	tcp_prot_override = tcp_prot;
+	tcp_prot_override.sendmsg = tcp_comp_sendmsg;
+	tcp_prot_override.recvmsg = tcp_comp_recvmsg;
+	tcp_prot_override.sock_is_readable = comp_stream_read;
+
+	return 0;
+}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fd5c13c1fbc8..a5adf4822663 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4012,6 +4012,24 @@ static bool smc_parse_options(const struct tcphdr *th,
 	return false;
 }
 
+static bool tcp_parse_comp_option(const struct tcphdr *th,
+				  struct tcp_options_received *opt_rx,
+				  const unsigned char *ptr,
+				  int opsize)
+{
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	if (static_branch_unlikely(&tcp_have_comp)) {
+		if (th->syn && !(opsize & 1) &&
+		    opsize >= TCPOLEN_EXP_COMP_BASE &&
+		    get_unaligned_be16(ptr) == TCPOPT_COMP_MAGIC) {
+			opt_rx->comp_ok = 1;
+			return true;
+		}
+	}
+#endif
+	return false;
+}
+
 /* Try to parse the MSS option from the TCP header. Return 0 on failure, clamped
  * value on success.
  */
@@ -4171,6 +4189,10 @@ void tcp_parse_options(const struct net *net,
 				if (smc_parse_options(th, opt_rx, ptr, opsize))
 					break;
 
+				if (tcp_parse_comp_option(th, opt_rx, ptr,
+							  opsize))
+					break;
+
 				opt_rx->saw_unknown = 1;
 				break;
 
@@ -6097,6 +6119,7 @@ void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb)
 	/* Initialize congestion control unless BPF initialized it already: */
 	if (!icsk->icsk_ca_initialized)
 		tcp_init_congestion_control(sk);
+	tcp_init_compression(sk);
 	tcp_init_buffer_space(sk);
 }
 
@@ -6835,6 +6858,9 @@ static void tcp_openreq_init(struct request_sock *req,
 	ireq->smc_ok = rx_opt->smc_ok && !(tcp_sk(sk)->smc_hs_congested &&
 			tcp_sk(sk)->smc_hs_congested(sk));
 #endif
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	ireq->comp_ok = rx_opt->comp_ok;
+#endif
 }
 
 struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4167e8a48b60..9cbf23c1c039 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2304,6 +2304,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
 
 	tcp_cleanup_congestion_control(sk);
 
+	tcp_cleanup_compression(sk);
+
 	tcp_cleanup_ulp(sk);
 
 	/* Cleanup up the write buffer. */
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index b98d476f1594..d67a84c114ce 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -543,6 +543,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 	newtp->rcv_ssthresh = req->rsk_rcv_wnd;
 	newtp->rcv_wnd = req->rsk_rcv_wnd;
 	newtp->rx_opt.wscale_ok = ireq->wscale_ok;
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	newtp->rx_opt.comp_ok = ireq->comp_ok;
+#endif
 	if (newtp->rx_opt.wscale_ok) {
 		newtp->rx_opt.snd_wscale = ireq->snd_wscale;
 		newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 1917c62ad3bf..eee34f6c7643 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -422,6 +422,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
 #define OPTION_FAST_OPEN_COOKIE	BIT(8)
 #define OPTION_SMC		BIT(9)
 #define OPTION_MPTCP		BIT(10)
+#define OPTION_COMP		BIT(11)
 
 static void smc_options_write(__be32 *ptr, u16 *options)
 {
@@ -438,6 +439,19 @@
 #endif
 }
 
+static void comp_options_write(__be32 *ptr, u16 *options)
+{
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	if (static_branch_unlikely(&tcp_have_comp)) {
+		if (unlikely(OPTION_COMP & *options)) {
+			*ptr++ = htonl((TCPOPT_EXP << 24) |
+				       (TCPOLEN_EXP_COMP_BASE << 16) |
+				       (TCPOPT_COMP_MAGIC));
+		}
+	}
+#endif
+}
+
 struct tcp_out_options {
 	u16 options;		/* bit field of OPTION_* */
 	u16 mss;		/* 0 to disable */
@@ -711,6 +725,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
 	smc_options_write(ptr, &options);
 
 	mptcp_options_write(th, ptr, tp, opts);
+
+	comp_options_write(ptr, &options);
 }
 
 static void smc_set_option(const struct tcp_sock *tp,
@@ -729,6 +745,39 @@
 #endif
 }
 
+static void comp_set_option(struct sock *sk,
+			    struct tcp_out_options *opts,
+			    unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	if (static_branch_unlikely(&tcp_have_comp)) {
+		if (tcp_syn_comp_enabled(sk)) {
+			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
+				opts->options |= OPTION_COMP;
+				*remaining -= TCPOLEN_EXP_COMP_BASE;
+			}
+		}
+	}
+#endif
+}
+
+static void comp_set_option_cond(struct sock *sk,
+				 const struct inet_request_sock *ireq,
+				 struct tcp_out_options *opts,
+				 unsigned int *remaining)
+{
+#if IS_ENABLED(CONFIG_TCP_COMP)
+	if (static_branch_unlikely(&tcp_have_comp)) {
+		if (tcp_synack_comp_enabled(sk, ireq)) {
+			if (*remaining >= TCPOLEN_EXP_COMP_BASE) {
+				opts->options |= OPTION_COMP;
+				*remaining -= TCPOLEN_EXP_COMP_BASE;
+			}
+		}
+	}
+#endif
+}
+
 static void smc_set_option_cond(const struct tcp_sock *tp,
 				const struct inet_request_sock *ireq,
 				struct tcp_out_options *opts,
@@ -830,6 +879,7 @@
 	}
 
 	smc_set_option(tp, opts, &remaining);
+	comp_set_option(sk, opts, &remaining);
 
 	if (sk_is_mptcp(sk)) {
 		unsigned int size;
@@ -910,6 +960,8 @@ static unsigned int tcp_synack_options(const struct sock *sk,
 
 	smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
 
+	comp_set_option_cond((struct sock *)sk, ireq, opts, &remaining);
+
 	bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
 			      synack_type, opts, &remaining);
 
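[ Reviewer's note: the two helpers mirror the smc_set_option()/
  smc_set_option_cond() split. comp_set_option() offers the option on active
  SYNs from local policy alone (tcp_syn_comp_enabled()), while
  comp_set_option_cond() echoes it on the SYN-ACK only when the peer offered
  it (ireq->comp_ok) and local policy agrees; in both cases the four option
  bytes are reserved out of "remaining" so the header never overruns
  MAX_TCP_OPTION_SPACE. ]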
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 8698b49dfc8d..d17389f1afe0 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -217,6 +217,8 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
 	treq->txhash = net_tx_rndhash();
 	if (IS_ENABLED(CONFIG_SMC))
 		ireq->smc_ok = 0;
+	if (IS_ENABLED(CONFIG_TCP_COMP))
+		ireq->comp_ok = 0;
 
 	/*
 	 * We need to lookup the dst_entry to get the correct window size.
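[ Reviewer's note: compression is transparent to applications, so a quick
  functional check is simply that data survives the compress/decompress
  round trip on a negotiated connection. A minimal single-process loopback
  sketch in user-space C, assuming the sysctls from the earlier note are set
  (port 9000 listed and tcp_compression_local=1): ]

	#include <arpa/inet.h>
	#include <assert.h>
	#include <netinet/in.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <unistd.h>

	int main(void)
	{
		struct sockaddr_in a = { 0 };
		char buf[8] = { 0 };
		int l, c, s;

		a.sin_family = AF_INET;
		a.sin_port = htons(9000);
		a.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

		l = socket(AF_INET, SOCK_STREAM, 0);
		assert(l >= 0);
		assert(!bind(l, (struct sockaddr *)&a, sizeof(a)));
		assert(!listen(l, 1));

		/* loopback connect completes against the listen backlog */
		c = socket(AF_INET, SOCK_STREAM, 0);
		assert(!connect(c, (struct sockaddr *)&a, sizeof(a)));
		s = accept(l, NULL, NULL);
		assert(s >= 0);

		assert(write(c, "ping", 4) == 4);
		assert(read(s, buf, sizeof(buf)) == 4);
		assert(!memcmp(buf, "ping", 4));

		close(c);
		close(s);
		close(l);
		return 0;
	}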