From: Wei Yongjun weiyongjun1@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK
CVE: NA
-------------------------------------------------
This patch implements software-level compression for sending TCP messages. All of the TCP payload will be compressed before xmit.
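For illustration, the per-record compress-then-flush pattern used on the TX path corresponds to the following minimal userspace sketch built on the zstd streaming API; the compress_record() helper and the buffer sizes are illustrative only (they mirror TCP_COMP_SCRATCH_SIZE / TCP_COMP_MAX_PADDING) and are not part of this patch:

#include <stdio.h>
#include <string.h>
#include <zstd.h>

/*
 * Compress one record and flush it, as tcp_comp_compress_to_msg() does:
 * each chunk handed to sendmsg() is compressed and flushed on its own,
 * so the peer can decompress it as soon as it arrives.
 */
static int compress_record(ZSTD_CStream *cs, const void *src, size_t len,
			   void *dst, size_t dst_cap, size_t *out_len)
{
	ZSTD_inBuffer in = { .src = src, .size = len, .pos = 0 };
	ZSTD_outBuffer out = { .dst = dst, .size = dst_cap, .pos = 0 };
	size_t left;

	if (ZSTD_isError(ZSTD_compressStream(cs, &out, &in)))
		return -1;
	left = ZSTD_flushStream(cs, &out);	/* push buffered bytes out now */
	if (ZSTD_isError(left) || left != 0 || in.pos != in.size)
		return -1;			/* input must be fully consumed */

	*out_len = out.pos;
	return 0;
}

int main(void)
{
	static char plain[65400];		/* cf. TCP_COMP_SCRATCH_SIZE */
	static char comp[65400 + 64];		/* plus TCP_COMP_MAX_PADDING */
	ZSTD_CStream *cs = ZSTD_createCStream();
	size_t clen;

	ZSTD_initCStream(cs, 1);		/* level 1, as in the kernel code */
	memset(plain, 'a', sizeof(plain));

	if (!compress_record(cs, plain, sizeof(plain), comp, sizeof(comp), &clen))
		printf("compressed %zu -> %zu bytes\n", sizeof(plain), clen);

	ZSTD_freeCStream(cs);
	return 0;
}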
Signed-off-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wang Yufen wangyufen@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/Kconfig | 1 + net/ipv4/tcp_comp.c | 415 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 412 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 59ffbf80f7f2..22c554d3a9ab 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -745,6 +745,7 @@ config TCP_MD5SIG
config TCP_COMP bool "TCP: Transport Layer Compression support" + depends on ZSTD_COMPRESS=y help Enable kernel payload compression support for TCP protocol. This allows payload compression handling of the TCP protocol to be done in-kernel. diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index fb90be4d9e9c..e85803da3924 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -5,7 +5,15 @@ * Copyright(c) 2021 Huawei Technologies Co., Ltd */
-#include <net/tcp.h> +#include <linux/skmsg.h> +#include <linux/zstd.h> + +#define TCP_COMP_MAX_PADDING 64 +#define TCP_COMP_SCRATCH_SIZE 65400 +#define TCP_COMP_MAX_CSIZE (TCP_COMP_SCRATCH_SIZE + TCP_COMP_MAX_PADDING) + +#define TCP_COMP_SEND_PENDING 1 +#define ZSTD_COMP_DEFAULT_LEVEL 1
static unsigned long tcp_compression_ports[65536 / 8];
@@ -14,11 +22,37 @@ int sysctl_tcp_compression_local __read_mostly;
static struct proto tcp_prot_override;
+struct tcp_comp_context_tx { + ZSTD_CStream *cstream; + void *cworkspace; + void *plaintext_data; + void *compressed_data; + struct sk_msg msg; + bool in_tcp_sendpages; +}; + struct tcp_comp_context { - struct proto *sk_proto; struct rcu_head rcu; + + struct proto *sk_proto; + void (*sk_write_space)(struct sock *sk); + + struct tcp_comp_context_tx tx; + + unsigned long flags; };
+static bool tcp_comp_is_write_pending(struct tcp_comp_context *ctx) +{ + return test_bit(TCP_COMP_SEND_PENDING, &ctx->flags); +} + +static void tcp_comp_err_abort(struct sock *sk, int err) +{ + sk->sk_err = err; + sk->sk_error_report(sk); +} + static bool tcp_comp_enabled(__be32 saddr, __be32 daddr, int port) { if (!sysctl_tcp_compression_local && @@ -55,11 +89,341 @@ static struct tcp_comp_context *comp_get_ctx(const struct sock *sk) return (__force void *)icsk->icsk_ulp_data; }
-static int tcp_comp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +static int tcp_comp_tx_context_init(struct tcp_comp_context *ctx) +{ + ZSTD_parameters params; + int csize; + + params = ZSTD_getParams(ZSTD_COMP_DEFAULT_LEVEL, PAGE_SIZE, 0); + csize = ZSTD_CStreamWorkspaceBound(params.cParams); + if (csize <= 0) + return -EINVAL; + + ctx->tx.cworkspace = kmalloc(csize, GFP_KERNEL); + if (!ctx->tx.cworkspace) + return -ENOMEM; + + ctx->tx.cstream = ZSTD_initCStream(params, 0, ctx->tx.cworkspace, + csize); + if (!ctx->tx.cstream) + goto err_cstream; + + ctx->tx.plaintext_data = kvmalloc(TCP_COMP_SCRATCH_SIZE, GFP_KERNEL); + if (!ctx->tx.plaintext_data) + goto err_cstream; + + ctx->tx.compressed_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL); + if (!ctx->tx.compressed_data) + goto err_compressed; + + return 0; + +err_compressed: + kvfree(ctx->tx.plaintext_data); + ctx->tx.plaintext_data = NULL; +err_cstream: + kfree(ctx->tx.cworkspace); + ctx->tx.cworkspace = NULL; + + return -ENOMEM; +} + +static void *tcp_comp_get_tx_stream(struct sock *sk) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + + if (!ctx->tx.plaintext_data) + tcp_comp_tx_context_init(ctx); + + return ctx->tx.plaintext_data; +} + +static int alloc_compressed_msg(struct sock *sk, int len) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct sk_msg *msg = &ctx->tx.msg; + + sk_msg_init(msg); + + return sk_msg_alloc(sk, msg, len, 0); +} + +static int memcopy_from_iter(struct sock *sk, struct iov_iter *from, int copy) +{ + void *dest; + int rc; + + dest = tcp_comp_get_tx_stream(sk); + if (!dest) + return -ENOSPC; + + if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) + rc = copy_from_iter_nocache(dest, copy, from); + else + rc = copy_from_iter(dest, copy, from); + + if (rc != copy) + rc = -EFAULT; + + return rc; +} + +static int memcopy_to_msg(struct sock *sk, int bytes) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct sk_msg *msg = &ctx->tx.msg; + int i = msg->sg.curr; + struct scatterlist *sge; + u32 copy, buf_size; + void *from, *to; + + from = ctx->tx.compressed_data; + do { + sge = sk_msg_elem(msg, i); + /* This is possible if a trim operation shrunk the buffer */ + if (msg->sg.copybreak >= sge->length) { + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + if (i == msg->sg.end) + break; + sge = sk_msg_elem(msg, i); + } + buf_size = sge->length - msg->sg.copybreak; + copy = (buf_size > bytes) ? 
bytes : buf_size; + to = sg_virt(sge) + msg->sg.copybreak; + msg->sg.copybreak += copy; + memcpy(to, from, copy); + bytes -= copy; + from += copy; + if (!bytes) + break; + msg->sg.copybreak = 0; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + msg->sg.curr = i; + return bytes; +} + +static int tcp_comp_compress_to_msg(struct sock *sk, int bytes) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + ZSTD_outBuffer outbuf; + ZSTD_inBuffer inbuf; + size_t ret; + + inbuf.src = ctx->tx.plaintext_data; + outbuf.dst = ctx->tx.compressed_data; + inbuf.size = bytes; + outbuf.size = TCP_COMP_MAX_CSIZE; + inbuf.pos = 0; + outbuf.pos = 0; + + ret = ZSTD_compressStream(ctx->tx.cstream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) + return -EIO; + + ret = ZSTD_flushStream(ctx->tx.cstream, &outbuf); + if (ZSTD_isError(ret)) + return -EIO; + + if (inbuf.pos != inbuf.size) + return -EIO; + + if (memcopy_to_msg(sk, outbuf.pos)) + return -EIO; + + sk_msg_trim(sk, &ctx->tx.msg, outbuf.pos); + + return 0; +} + +static int tcp_comp_push_msg(struct sock *sk, struct sk_msg *msg, int flags) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct scatterlist *sg; + int ret, offset; + struct page *p; + size_t size; + + ctx->tx.in_tcp_sendpages = true; + while (1) { + sg = sk_msg_elem(msg, msg->sg.start); + offset = sg->offset; + size = sg->length; + p = sg_page(sg); +retry: + ret = do_tcp_sendpages(sk, p, offset, size, flags); + if (ret != size) { + if (ret > 0) { + sk_mem_uncharge(sk, ret); + sg->offset += ret; + sg->length -= ret; + size -= ret; + offset += ret; + goto retry; + } + ctx->tx.in_tcp_sendpages = false; + return ret; + } + + sk_mem_uncharge(sk, ret); + msg->sg.size -= size; + put_page(p); + sk_msg_iter_next(msg, start); + if (msg->sg.start == msg->sg.end) + break; + } + + clear_bit(TCP_COMP_SEND_PENDING, &ctx->flags); + ctx->tx.in_tcp_sendpages = false; + + return 0; +} + +static int tcp_comp_push(struct sock *sk, int bytes, int flags) { struct tcp_comp_context *ctx = comp_get_ctx(sk); + int ret;
- return ctx->sk_proto->sendmsg(sk, msg, size); + ret = tcp_comp_compress_to_msg(sk, bytes); + if (ret < 0) { + pr_debug("%s: failed to compress sg\n", __func__); + return ret; + } + + set_bit(TCP_COMP_SEND_PENDING, &ctx->flags); + + ret = tcp_comp_push_msg(sk, &ctx->tx.msg, flags); + if (ret) { + pr_debug("%s: failed to tcp_comp_push_sg\n", __func__); + return ret; + } + + return 0; +} + +static int wait_on_pending_writer(struct sock *sk, long *timeo) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + add_wait_queue(sk_sleep(sk), &wait); + while (1) { + if (!*timeo) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = sock_intr_errno(*timeo); + break; + } + + if (sk_wait_event(sk, timeo, !sk->sk_write_pending, &wait)) + break; + } + remove_wait_queue(sk_sleep(sk), &wait); + + return ret; +} + +static int tcp_comp_push_pending_msg(struct sock *sk, int flags) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct sk_msg *msg = &ctx->tx.msg; + + if (msg->sg.start == msg->sg.end) + return 0; + + return tcp_comp_push_msg(sk, msg, flags); +} + +static int tcp_comp_complete_pending_work(struct sock *sk, int flags, + long *timeo) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + int ret = 0; + + if (unlikely(sk->sk_write_pending)) + ret = wait_on_pending_writer(sk, timeo); + + if (!ret && tcp_comp_is_write_pending(ctx)) + ret = tcp_comp_push_pending_msg(sk, flags); + + return ret; +} + +static int tcp_comp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + int copied = 0, err = 0; + size_t try_to_copy; + int required_size; + long timeo; + + lock_sock(sk); + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + + err = tcp_comp_complete_pending_work(sk, msg->msg_flags, &timeo); + if (err) + goto out_err; + + while (msg_data_left(msg)) { + if (sk->sk_err) { + err = -sk->sk_err; + goto out_err; + } + + try_to_copy = msg_data_left(msg); + if (try_to_copy > TCP_COMP_SCRATCH_SIZE) + try_to_copy = TCP_COMP_SCRATCH_SIZE; + required_size = try_to_copy + TCP_COMP_MAX_PADDING; + + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + +alloc_compressed: + err = alloc_compressed_msg(sk, required_size); + if (err) { + if (err != -ENOSPC) + goto wait_for_memory; + goto out_err; + } + + err = memcopy_from_iter(sk, &msg->msg_iter, try_to_copy); + if (err < 0) + goto out_err; + + copied += try_to_copy; + + err = tcp_comp_push(sk, try_to_copy, msg->msg_flags); + if (err < 0) { + if (err == -ENOMEM) + goto wait_for_memory; + if (err != -EAGAIN) + tcp_comp_err_abort(sk, EBADMSG); + goto out_err; + } + + continue; +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + err = sk_stream_wait_memory(sk, &timeo); + if (err) + goto out_err; + if (ctx->tx.msg.sg.size < required_size) + goto alloc_compressed; + } + +out_err: + err = sk_stream_error(sk, msg->msg_flags, err); + + release_sock(sk); + + return copied ? copied : err; }
static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, @@ -70,10 +434,35 @@ static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, return ctx->sk_proto->recvmsg(sk, msg, len, nonblock, flags, addr_len); }
+static void tcp_comp_write_space(struct sock *sk) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + + if (ctx->tx.in_tcp_sendpages) { + ctx->sk_write_space(sk); + return; + } + + if (!sk->sk_write_pending && tcp_comp_is_write_pending(ctx)) { + gfp_t sk_allocation = sk->sk_allocation; + int rc; + + sk->sk_allocation = GFP_ATOMIC; + rc = tcp_comp_push_pending_msg(sk, MSG_DONTWAIT | MSG_NOSIGNAL); + sk->sk_allocation = sk_allocation; + + if (rc < 0) + return; + } + + ctx->sk_write_space(sk); +} + void tcp_init_compression(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_comp_context *ctx = NULL; + struct sk_msg *msg = NULL; struct tcp_sock *tp = tcp_sk(sk);
if (!tp->rx_opt.comp_ok) @@ -83,20 +472,38 @@ void tcp_init_compression(struct sock *sk) if (!ctx) return;
+ msg = &ctx->tx.msg; + sk_msg_init(msg); + + ctx->sk_write_space = sk->sk_write_space; ctx->sk_proto = sk->sk_prot; WRITE_ONCE(sk->sk_prot, &tcp_prot_override); + sk->sk_write_space = tcp_comp_write_space;
rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
sock_set_flag(sk, SOCK_COMP); }
+static void tcp_comp_context_tx_free(struct tcp_comp_context *ctx) +{ + kfree(ctx->tx.cworkspace); + ctx->tx.cworkspace = NULL; + + kvfree(ctx->tx.plaintext_data); + ctx->tx.plaintext_data = NULL; + + kvfree(ctx->tx.compressed_data); + ctx->tx.compressed_data = NULL; +} + static void tcp_comp_context_free(struct rcu_head *head) { struct tcp_comp_context *ctx;
ctx = container_of(head, struct tcp_comp_context, rcu);
+ tcp_comp_context_tx_free(ctx); kfree(ctx); }
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK
CVE: NA
-------------------------------------------------
This patch implements the receive side of software-level TCP payload compression. The compressed TCP payload is decompressed after it is received.
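For reference, the streaming decompression loop that the receive path relies on can be sketched in userspace as follows; decompress_chunk() is an illustrative helper, not part of this patch, and a real receiver would hand the produced plaintext to the user buffer instead of printing its size:

#include <stdio.h>
#include <string.h>
#include <zstd.h>

/*
 * Feed a buffer of compressed bytes through ZSTD_decompressStream(), as
 * tcp_comp_decompress() does: inbuf.pos tracks how much of the received
 * data has been consumed, and every iteration drains what was produced.
 */
static int decompress_chunk(ZSTD_DStream *ds, const void *src, size_t src_len,
			    void *dst, size_t dst_cap)
{
	ZSTD_inBuffer in = { .src = src, .size = src_len, .pos = 0 };

	while (in.pos < in.size) {
		ZSTD_outBuffer out = { .dst = dst, .size = dst_cap, .pos = 0 };

		if (ZSTD_isError(ZSTD_decompressStream(ds, &out, &in)))
			return -1;
		printf("produced %zu plaintext bytes\n", out.pos);
	}
	return 0;
}

int main(void)
{
	char plain[4096], out[4096];
	char comp[ZSTD_COMPRESSBOUND(4096)];
	ZSTD_DStream *ds = ZSTD_createDStream();
	size_t clen;

	memset(plain, 'x', sizeof(plain));
	clen = ZSTD_compress(comp, sizeof(comp), plain, sizeof(plain), 1);
	if (ZSTD_isError(clen))
		return 1;

	ZSTD_initDStream(ds);
	decompress_chunk(ds, comp, clen, out, sizeof(out));
	ZSTD_freeDStream(ds);
	return 0;
}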
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/Kconfig | 3 +- net/ipv4/tcp_comp.c | 377 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 377 insertions(+), 3 deletions(-)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 22c554d3a9ab..0ce3f61658b7 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -745,7 +745,8 @@ config TCP_MD5SIG
config TCP_COMP bool "TCP: Transport Layer Compression support" - depends on ZSTD_COMPRESS=y + depends on CRYPTO_ZSTD=y + select STREAM_PARSER help Enable kernel payload compression support for TCP protocol. This allows payload compression handling of the TCP protocol to be done in-kernel. diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index e85803da3924..1daa6d7ad5e1 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -9,8 +9,11 @@ #include <linux/zstd.h>
#define TCP_COMP_MAX_PADDING 64 -#define TCP_COMP_SCRATCH_SIZE 65400 +#define TCP_COMP_SCRATCH_SIZE 65535 #define TCP_COMP_MAX_CSIZE (TCP_COMP_SCRATCH_SIZE + TCP_COMP_MAX_PADDING) +#define TCP_COMP_ALLOC_ORDER get_order(65536) +#define TCP_COMP_MAX_WINDOWLOG 17 +#define TCP_COMP_MAX_INPUT (1 << TCP_COMP_MAX_WINDOWLOG)
#define TCP_COMP_SEND_PENDING 1 #define ZSTD_COMP_DEFAULT_LEVEL 1 @@ -31,6 +34,20 @@ struct tcp_comp_context_tx { bool in_tcp_sendpages; };
+struct tcp_comp_context_rx { + ZSTD_DStream *dstream; + void *dworkspace; + void *plaintext_data; + void *compressed_data; + void *remaining_data; + + size_t data_offset; + struct strparser strp; + void (*saved_data_ready)(struct sock *sk); + struct sk_buff *pkt; + bool decompressed; +}; + struct tcp_comp_context { struct rcu_head rcu;
@@ -38,6 +55,7 @@ struct tcp_comp_context { void (*sk_write_space)(struct sock *sk);
struct tcp_comp_context_tx tx; + struct tcp_comp_context_rx rx;
unsigned long flags; }; @@ -426,12 +444,344 @@ static int tcp_comp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return copied ? copied : err; }
+static struct sk_buff *comp_wait_data(struct sock *sk, int flags, + long timeo, int *err) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + + while (!(skb = ctx->rx.pkt)) { + if (sk->sk_err) { + *err = sock_error(sk); + return NULL; + } + + if (!skb_queue_empty(&sk->sk_receive_queue)) { + __strp_unpause(&ctx->rx.strp); + if (ctx->rx.pkt) + return ctx->rx.pkt; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return NULL; + + if (sock_flag(sk, SOCK_DONE)) + return NULL; + + if ((flags & MSG_DONTWAIT) || !timeo) { + *err = -EAGAIN; + return NULL; + } + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + sk_wait_event(sk, &timeo, ctx->rx.pkt != skb, &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + + /* Handle signals */ + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + return NULL; + } + } + + return skb; +} + +static bool comp_advance_skb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + return false; + } + + /* Finished with message */ + ctx->rx.pkt = NULL; + kfree_skb(skb); + __strp_unpause(&ctx->rx.strp); + + return true; +} + +static int tcp_comp_rx_context_init(struct tcp_comp_context *ctx) +{ + int dsize; + + dsize = ZSTD_DStreamWorkspaceBound(TCP_COMP_MAX_INPUT); + if (dsize <= 0) + return -EINVAL; + + ctx->rx.dworkspace = kmalloc(dsize, GFP_KERNEL); + if (!ctx->rx.dworkspace) + return -ENOMEM; + + ctx->rx.dstream = ZSTD_initDStream(TCP_COMP_MAX_INPUT, + ctx->rx.dworkspace, dsize); + if (!ctx->rx.dstream) + goto err_dstream; + + ctx->rx.plaintext_data = kvmalloc(TCP_COMP_MAX_CSIZE * 32, GFP_KERNEL); + if (!ctx->rx.plaintext_data) + goto err_dstream; + + ctx->rx.compressed_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL); + if (!ctx->rx.compressed_data) + goto err_compressed; + + ctx->rx.remaining_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL); + if (!ctx->rx.remaining_data) + goto err_remaining; + + ctx->rx.data_offset = 0; + + return 0; + +err_remaining: + kvfree(ctx->rx.compressed_data); + ctx->rx.compressed_data = NULL; +err_compressed: + kvfree(ctx->rx.plaintext_data); + ctx->rx.plaintext_data = NULL; +err_dstream: + kfree(ctx->rx.dworkspace); + ctx->rx.dworkspace = NULL; + + return -ENOMEM; +} + +static void *tcp_comp_get_rx_stream(struct sock *sk) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + + if (!ctx->rx.plaintext_data) + tcp_comp_rx_context_init(ctx); + + return ctx->rx.plaintext_data; +} + +static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + const int plen = skb->len; + struct strp_msg *rxm; + ZSTD_outBuffer outbuf; + ZSTD_inBuffer inbuf; + int len; + void *to; + + to = tcp_comp_get_rx_stream(sk); + if (!to) + return -ENOSPC; + + if (skb_linearize_cow(skb)) + return -ENOMEM; + + if (plen + ctx->rx.data_offset > TCP_COMP_MAX_CSIZE) + return -ENOMEM; + + if (ctx->rx.data_offset) + memcpy(ctx->rx.compressed_data, ctx->rx.remaining_data, + ctx->rx.data_offset); + + memcpy((char *)ctx->rx.compressed_data + ctx->rx.data_offset, + skb->data, plen); + + inbuf.src = ctx->rx.compressed_data; + inbuf.pos = 0; + inbuf.size = plen + ctx->rx.data_offset; + ctx->rx.data_offset = 0; + + outbuf.dst = ctx->rx.plaintext_data; + outbuf.pos = 0; + outbuf.size = 
TCP_COMP_MAX_CSIZE * 32; + + while (1) { + size_t ret; + + to = outbuf.dst; + + ret = ZSTD_decompressStream(ctx->rx.dstream, &outbuf, &inbuf); + if (ZSTD_isError(ret)) + return -EIO; + + len = outbuf.pos - plen; + if (len > skb_tailroom(skb)) + len = skb_tailroom(skb); + + __skb_put(skb, len); + rxm = strp_msg(skb); + rxm->full_len += len; + + len += plen; + skb_copy_to_linear_data(skb, to, len); + + while ((to += len, outbuf.pos -= len) > 0) { + struct page *pages; + skb_frag_t *frag; + + if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) + return -EMSGSIZE; + + frag = skb_shinfo(skb)->frags + + skb_shinfo(skb)->nr_frags; + pages = alloc_pages(__GFP_NOWARN | GFP_KERNEL | __GFP_COMP, + TCP_COMP_ALLOC_ORDER); + + if (!pages) + return -ENOMEM; + + __skb_frag_set_page(frag, pages); + len = PAGE_SIZE << TCP_COMP_ALLOC_ORDER; + if (outbuf.pos < len) + len = outbuf.pos; + + frag->bv_offset = 0; + skb_frag_size_set(frag, len); + memcpy(skb_frag_address(frag), to, len); + + skb->truesize += len; + skb->data_len += len; + skb->len += len; + rxm->full_len += len; + skb_shinfo(skb)->nr_frags++; + } + + if (ret == 0) + break; + + if (inbuf.pos >= plen || !inbuf.pos) { + if (inbuf.pos < inbuf.size) { + memcpy((char *)ctx->rx.remaining_data, + (char *)inbuf.src + inbuf.pos, + inbuf.size - inbuf.pos); + ctx->rx.data_offset = inbuf.size - inbuf.pos; + } + break; + } + } + return 0; +} + static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct strp_msg *rxm; + struct sk_buff *skb; + ssize_t copied = 0; + int target, err = 0; + long timeo; + + flags |= nonblock; + + if (unlikely(flags & MSG_ERRQUEUE)) + return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); + + lock_sock(sk); + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + timeo = sock_rcvtimeo(sk, flags & MSG_WAITALL); + + do { + int chunk = 0; + + skb = comp_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end; + + if (!ctx->rx.decompressed) { + err = tcp_comp_decompress(sk, skb); + if (err < 0) { + if (err != -ENOSPC) + tcp_comp_err_abort(sk, EBADMSG); + goto recv_end; + } + ctx->rx.decompressed = true; + } + rxm = strp_msg(skb); + + chunk = min_t(unsigned int, rxm->full_len, len); + + err = skb_copy_datagram_msg(skb, rxm->offset, msg, + chunk); + if (err < 0) + goto recv_end; + + copied += chunk; + len -= chunk; + if (likely(!(flags & MSG_PEEK))) + comp_advance_skb(sk, skb, chunk); + else + break; + + if (copied >= target && !ctx->rx.pkt) + break; + } while (len > 0); + +recv_end: + release_sock(sk); + return copied ? : err; +} + +bool comp_stream_read(const struct sock *sk) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + + if (ctx->rx.pkt) + return true; + + return false; +} + +static void comp_data_ready(struct sock *sk) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + + strp_data_ready(&ctx->rx.strp); +} + +static void comp_queue(struct strparser *strp, struct sk_buff *skb) +{ + struct tcp_comp_context *ctx = comp_get_ctx(strp->sk); + + ctx->rx.decompressed = false; + ctx->rx.pkt = skb; + strp_pause(strp); + ctx->rx.saved_data_ready(strp->sk); +} + +static int comp_read_size(struct strparser *strp, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + + if (rxm->offset > skb->len) + return 0;
- return ctx->sk_proto->recvmsg(sk, msg, len, nonblock, flags, addr_len); + return skb->len; +} + +void comp_setup_strp(struct sock *sk, struct tcp_comp_context *ctx) +{ + struct strp_callbacks cb; + + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = comp_queue; + cb.parse_msg = comp_read_size; + strp_init(&ctx->rx.strp, sk, &cb); + + write_lock_bh(&sk->sk_callback_lock); + ctx->rx.saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = comp_data_ready; + write_unlock_bh(&sk->sk_callback_lock); + + strp_check_rcv(&ctx->rx.strp); }
static void tcp_comp_write_space(struct sock *sk) @@ -483,6 +833,7 @@ void tcp_init_compression(struct sock *sk) rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
sock_set_flag(sk, SOCK_COMP); + comp_setup_strp(sk, ctx); }
static void tcp_comp_context_tx_free(struct tcp_comp_context *ctx) @@ -497,6 +848,21 @@ static void tcp_comp_context_tx_free(struct tcp_comp_context *ctx) ctx->tx.compressed_data = NULL; }
+static void tcp_comp_context_rx_free(struct tcp_comp_context *ctx) +{ + kfree(ctx->rx.dworkspace); + ctx->rx.dworkspace = NULL; + + kvfree(ctx->rx.plaintext_data); + ctx->rx.plaintext_data = NULL; + + kvfree(ctx->rx.compressed_data); + ctx->rx.compressed_data = NULL; + + kvfree(ctx->rx.remaining_data); + ctx->rx.remaining_data = NULL; +} + static void tcp_comp_context_free(struct rcu_head *head) { struct tcp_comp_context *ctx; @@ -504,6 +870,7 @@ static void tcp_comp_context_free(struct rcu_head *head) ctx = container_of(head, struct tcp_comp_context, rcu);
tcp_comp_context_tx_free(ctx); + tcp_comp_context_rx_free(ctx); kfree(ctx); }
@@ -515,6 +882,11 @@ void tcp_cleanup_compression(struct sock *sk) if (!ctx || !sock_flag(sk, SOCK_COMP)) return;
+ if (ctx->rx.pkt) { + kfree_skb(ctx->rx.pkt); + ctx->rx.pkt = NULL; + } + rcu_assign_pointer(icsk->icsk_ulp_data, NULL); call_rcu(&ctx->rcu, tcp_comp_context_free); } @@ -524,6 +896,7 @@ int tcp_comp_init(void) tcp_prot_override = tcp_prot; tcp_prot_override.sendmsg = tcp_comp_sendmsg; tcp_prot_override.recvmsg = tcp_comp_recvmsg; + tcp_prot_override.stream_memory_read = comp_stream_read;
return 0; }
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4PNEK
CVE: NA
-------------------------------------------------
Enable config options for TCP compression.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 5 +++-- arch/x86/configs/openeuler_defconfig | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 771eb45cb362..d8eda75eb784 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1153,6 +1153,7 @@ CONFIG_DEFAULT_CUBIC=y # CONFIG_DEFAULT_RENO is not set CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_TCP_MD5SIG=y +CONFIG_TCP_COMP=y CONFIG_IPV6=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y @@ -6636,7 +6637,7 @@ CONFIG_CRYPTO_LZO=y # CONFIG_CRYPTO_842 is not set CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m -CONFIG_CRYPTO_ZSTD=m +CONFIG_CRYPTO_ZSTD=y
# # Random Number Generation @@ -6775,7 +6776,7 @@ CONFIG_LZO_DECOMPRESS=y CONFIG_LZ4_COMPRESS=m CONFIG_LZ4HC_COMPRESS=m CONFIG_LZ4_DECOMPRESS=y -CONFIG_ZSTD_COMPRESS=m +CONFIG_ZSTD_COMPRESS=y CONFIG_ZSTD_DECOMPRESS=y CONFIG_XZ_DEC=y CONFIG_XZ_DEC_X86=y diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index fc7be06e8054..443299650f6a 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1121,6 +1121,7 @@ CONFIG_DEFAULT_CUBIC=y # CONFIG_DEFAULT_RENO is not set CONFIG_DEFAULT_TCP_CONG="cubic" CONFIG_TCP_MD5SIG=y +CONFIG_TCP_COMP=y CONFIG_IPV6=y CONFIG_IPV6_ROUTER_PREF=y CONFIG_IPV6_ROUTE_INFO=y @@ -8019,7 +8020,7 @@ CONFIG_CRYPTO_LZO=y # CONFIG_CRYPTO_842 is not set # CONFIG_CRYPTO_LZ4 is not set # CONFIG_CRYPTO_LZ4HC is not set -# CONFIG_CRYPTO_ZSTD is not set +CONFIG_CRYPTO_ZSTD=y
# # Random Number Generation @@ -8152,6 +8153,8 @@ CONFIG_ZLIB_DEFLATE=y CONFIG_LZO_COMPRESS=y CONFIG_LZO_DECOMPRESS=y CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y CONFIG_ZSTD_DECOMPRESS=y CONFIG_XZ_DEC=y CONFIG_XZ_DEC_X86=y
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I45UYC?from=project-issue
CVE: NA
-------------------------------------------------
In comp_stream_read(), ctx might be NULL; add a NULL check to avoid an oops. Also delete the unnecessary tcp_comp_err_abort() call.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp_comp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index 1daa6d7ad5e1..6d6b248d02b5 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -699,8 +699,6 @@ static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (!ctx->rx.decompressed) { err = tcp_comp_decompress(sk, skb); if (err < 0) { - if (err != -ENOSPC) - tcp_comp_err_abort(sk, EBADMSG); goto recv_end; } ctx->rx.decompressed = true; @@ -734,6 +732,9 @@ bool comp_stream_read(const struct sock *sk) { struct tcp_comp_context *ctx = comp_get_ctx(sk);
+ if (!ctx) + return false; + if (ctx->rx.pkt) return true;
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I47SV5?from=project-issue
CVE: NA
-------------------------------------------------
In comp_read_size(), rxm->offset should be subtracted from skb->len. Also use strp_done() to release resources when destroying the sock.
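For context, the strparser contract this fix relies on looks roughly like the sketch below (schematic only, not the patched code): parse_msg() must report the length of the next complete message measured from rxm->offset, and strp_done() must only run once the parser has been stopped.

#include <net/strparser.h>

/*
 * parse_msg callback: the parser has already accounted for rxm->offset
 * bytes at the head of the skb, so the message length to report is
 * skb->len - rxm->offset, not skb->len.
 */
static int example_parse_msg(struct strparser *strp, struct sk_buff *skb)
{
	struct strp_msg *rxm = strp_msg(skb);

	if (rxm->offset > skb->len)
		return 0;			/* need more data */

	return skb->len - rxm->offset;
}

/* Teardown: stop the parser first, then release its resources. */
static void example_strp_teardown(struct strparser *strp)
{
	strp_stop(strp);
	strp_done(strp);
}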
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp_comp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index 6d6b248d02b5..7d40c2f3981b 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -765,7 +765,7 @@ static int comp_read_size(struct strparser *strp, struct sk_buff *skb) if (rxm->offset > skb->len) return 0;
- return skb->len; + return skb->len - rxm->offset; }
void comp_setup_strp(struct sock *sk, struct tcp_comp_context *ctx) @@ -872,6 +872,7 @@ static void tcp_comp_context_free(struct rcu_head *head)
tcp_comp_context_tx_free(ctx); tcp_comp_context_rx_free(ctx); + strp_done(&ctx->rx.strp); kfree(ctx); }
@@ -887,6 +888,7 @@ void tcp_cleanup_compression(struct sock *sk) kfree_skb(ctx->rx.pkt); ctx->rx.pkt = NULL; } + strp_stop(&ctx->rx.strp);
rcu_assign_pointer(icsk->icsk_ulp_data, NULL); call_rcu(&ctx->rcu, tcp_comp_context_free);
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I48H9Z?from=project-issue
CVE: NA
-------------------------------------------------
This patch fixes possible ZSTD_decompressStream() failures. When decompressing skb->data, the first rxm->offset bytes should be skipped.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp_comp.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index 7d40c2f3981b..1a907d9a51e0 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -569,8 +569,8 @@ static void *tcp_comp_get_rx_stream(struct sock *sk) static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) { struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct strp_msg *rxm = strp_msg(skb); const int plen = skb->len; - struct strp_msg *rxm; ZSTD_outBuffer outbuf; ZSTD_inBuffer inbuf; int len; @@ -591,11 +591,11 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) ctx->rx.data_offset);
memcpy((char *)ctx->rx.compressed_data + ctx->rx.data_offset, - skb->data, plen); + (char *)skb->data + rxm->offset, plen - rxm->offset);
inbuf.src = ctx->rx.compressed_data; inbuf.pos = 0; - inbuf.size = plen + ctx->rx.data_offset; + inbuf.size = plen - rxm->offset + ctx->rx.data_offset; ctx->rx.data_offset = 0;
outbuf.dst = ctx->rx.plaintext_data; @@ -606,7 +606,6 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) size_t ret;
to = outbuf.dst; - ret = ZSTD_decompressStream(ctx->rx.dstream, &outbuf, &inbuf); if (ZSTD_isError(ret)) return -EIO; @@ -616,8 +615,8 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) len = skb_tailroom(skb);
__skb_put(skb, len); - rxm = strp_msg(skb); - rxm->full_len += len; + rxm->full_len += (len + rxm->offset); + rxm->offset = 0;
len += plen; skb_copy_to_linear_data(skb, to, len);
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I48H9Z?from=project-issue
CVE: NA
-------------------------------------------------
In order to separate the compressed data from the decompressed data, this patch adds dpkt to tcp_comp_context_rx; dpkt is used to save the decompressed skb.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp_comp.c | 94 ++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 30 deletions(-)
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index 1a907d9a51e0..67b09c1b4669 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -45,7 +45,7 @@ struct tcp_comp_context_rx { struct strparser strp; void (*saved_data_ready)(struct sock *sk); struct sk_buff *pkt; - bool decompressed; + struct sk_buff *dpkt; };
struct tcp_comp_context { @@ -510,6 +510,24 @@ static bool comp_advance_skb(struct sock *sk, struct sk_buff *skb, return true; }
+static bool comp_advance_dskb(struct sock *sk, struct sk_buff *skb, + unsigned int len) +{ + struct tcp_comp_context *ctx = comp_get_ctx(sk); + struct strp_msg *rxm = strp_msg(skb); + + if (len < rxm->full_len) { + rxm->offset += len; + rxm->full_len -= len; + return false; + } + + /* Finished with message */ + ctx->rx.dpkt = NULL; + kfree_skb(skb); + return true; +} + static int tcp_comp_rx_context_init(struct tcp_comp_context *ctx) { int dsize; @@ -566,13 +584,14 @@ static void *tcp_comp_get_rx_stream(struct sock *sk) return ctx->rx.plaintext_data; }
-static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) +static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb, int flags) { struct tcp_comp_context *ctx = comp_get_ctx(sk); struct strp_msg *rxm = strp_msg(skb); const int plen = skb->len; ZSTD_outBuffer outbuf; ZSTD_inBuffer inbuf; + struct sk_buff *nskb; int len; void *to;
@@ -586,6 +605,10 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) if (plen + ctx->rx.data_offset > TCP_COMP_MAX_CSIZE) return -ENOMEM;
+ nskb = skb_copy(skb, GFP_KERNEL); + if (!nskb) + return -ENOMEM; + if (ctx->rx.data_offset) memcpy(ctx->rx.compressed_data, ctx->rx.remaining_data, ctx->rx.data_offset); @@ -607,34 +630,38 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb)
to = outbuf.dst; ret = ZSTD_decompressStream(ctx->rx.dstream, &outbuf, &inbuf); - if (ZSTD_isError(ret)) + if (ZSTD_isError(ret)) { + kfree_skb(nskb); return -EIO; + }
len = outbuf.pos - plen; - if (len > skb_tailroom(skb)) - len = skb_tailroom(skb); + if (len > skb_tailroom(nskb)) + len = skb_tailroom(nskb);
- __skb_put(skb, len); - rxm->full_len += (len + rxm->offset); - rxm->offset = 0; + __skb_put(nskb, len);
len += plen; - skb_copy_to_linear_data(skb, to, len); + skb_copy_to_linear_data(nskb, to, len);
while ((to += len, outbuf.pos -= len) > 0) { struct page *pages; skb_frag_t *frag;
- if (WARN_ON(skb_shinfo(skb)->nr_frags >= MAX_SKB_FRAGS)) + if (WARN_ON(skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)) { + kfree_skb(nskb); return -EMSGSIZE; + }
- frag = skb_shinfo(skb)->frags + - skb_shinfo(skb)->nr_frags; + frag = skb_shinfo(nskb)->frags + + skb_shinfo(nskb)->nr_frags; pages = alloc_pages(__GFP_NOWARN | GFP_KERNEL | __GFP_COMP, TCP_COMP_ALLOC_ORDER);
- if (!pages) + if (!pages) { + kfree_skb(nskb); return -ENOMEM; + }
__skb_frag_set_page(frag, pages); len = PAGE_SIZE << TCP_COMP_ALLOC_ORDER; @@ -645,11 +672,10 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) skb_frag_size_set(frag, len); memcpy(skb_frag_address(frag), to, len);
- skb->truesize += len; - skb->data_len += len; - skb->len += len; - rxm->full_len += len; - skb_shinfo(skb)->nr_frags++; + nskb->truesize += len; + nskb->data_len += len; + nskb->len += len; + skb_shinfo(nskb)->nr_frags++; }
if (ret == 0) @@ -665,6 +691,13 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb) break; } } + + ctx->rx.dpkt = nskb; + rxm = strp_msg(nskb); + rxm->full_len = nskb->len; + rxm->offset = 0; + comp_advance_skb(sk, skb, plen - rxm->offset); + return 0; }
@@ -691,21 +724,19 @@ static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, do { int chunk = 0;
- skb = comp_wait_data(sk, flags, timeo, &err); - if (!skb) - goto recv_end; + if (!ctx->rx.dpkt) { + skb = comp_wait_data(sk, flags, timeo, &err); + if (!skb) + goto recv_end;
- if (!ctx->rx.decompressed) { - err = tcp_comp_decompress(sk, skb); + err = tcp_comp_decompress(sk, skb, flags); if (err < 0) { goto recv_end; } - ctx->rx.decompressed = true; } + skb = ctx->rx.dpkt; rxm = strp_msg(skb); - chunk = min_t(unsigned int, rxm->full_len, len); - err = skb_copy_datagram_msg(skb, rxm->offset, msg, chunk); if (err < 0) @@ -714,11 +745,11 @@ static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, copied += chunk; len -= chunk; if (likely(!(flags & MSG_PEEK))) - comp_advance_skb(sk, skb, chunk); + comp_advance_dskb(sk, skb, chunk); else break;
- if (copied >= target && !ctx->rx.pkt) + if (copied >= target && !ctx->rx.dpkt) break; } while (len > 0);
@@ -734,7 +765,7 @@ bool comp_stream_read(const struct sock *sk) if (!ctx) return false;
- if (ctx->rx.pkt) + if (ctx->rx.pkt || ctx->rx.dpkt) return true;
return false; @@ -751,7 +782,6 @@ static void comp_queue(struct strparser *strp, struct sk_buff *skb) { struct tcp_comp_context *ctx = comp_get_ctx(strp->sk);
- ctx->rx.decompressed = false; ctx->rx.pkt = skb; strp_pause(strp); ctx->rx.saved_data_ready(strp->sk); @@ -887,6 +917,10 @@ void tcp_cleanup_compression(struct sock *sk) kfree_skb(ctx->rx.pkt); ctx->rx.pkt = NULL; } + if (ctx->rx.dpkt) { + kfree_skb(ctx->rx.dpkt); + ctx->rx.dpkt = NULL; + } strp_stop(&ctx->rx.strp);
rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
From: Wang Yufen wangyufen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I48H9Z?from=project-issue
CVE: NA
-------------------------------------------------
The compressed data and decompressed data are now separated. There is no need to save leftover data in the remaining_data buffer; data can be read directly from the decompressed skb.
Signed-off-by: Wang Yufen wangyufen@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Lu Wei luwei32@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- net/ipv4/tcp_comp.c | 98 ++++++++++++++------------------------------- 1 file changed, 30 insertions(+), 68 deletions(-)
diff --git a/net/ipv4/tcp_comp.c b/net/ipv4/tcp_comp.c index 67b09c1b4669..ffddbd6d3a6b 100644 --- a/net/ipv4/tcp_comp.c +++ b/net/ipv4/tcp_comp.c @@ -38,10 +38,7 @@ struct tcp_comp_context_rx { ZSTD_DStream *dstream; void *dworkspace; void *plaintext_data; - void *compressed_data; - void *remaining_data;
- size_t data_offset; struct strparser strp; void (*saved_data_ready)(struct sock *sk); struct sk_buff *pkt; @@ -549,24 +546,8 @@ static int tcp_comp_rx_context_init(struct tcp_comp_context *ctx) if (!ctx->rx.plaintext_data) goto err_dstream;
- ctx->rx.compressed_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL); - if (!ctx->rx.compressed_data) - goto err_compressed; - - ctx->rx.remaining_data = kvmalloc(TCP_COMP_MAX_CSIZE, GFP_KERNEL); - if (!ctx->rx.remaining_data) - goto err_remaining; - - ctx->rx.data_offset = 0; - return 0;
-err_remaining: - kvfree(ctx->rx.compressed_data); - ctx->rx.compressed_data = NULL; -err_compressed: - kvfree(ctx->rx.plaintext_data); - ctx->rx.plaintext_data = NULL; err_dstream: kfree(ctx->rx.dworkspace); ctx->rx.dworkspace = NULL; @@ -588,11 +569,12 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb, int flags) { struct tcp_comp_context *ctx = comp_get_ctx(sk); struct strp_msg *rxm = strp_msg(skb); - const int plen = skb->len; + size_t ret, compressed_len = 0; + int nr_frags_over = 0; ZSTD_outBuffer outbuf; ZSTD_inBuffer inbuf; struct sk_buff *nskb; - int len; + int len, plen; void *to;
to = tcp_comp_get_rx_stream(sk); @@ -602,62 +584,54 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb, int flags) if (skb_linearize_cow(skb)) return -ENOMEM;
- if (plen + ctx->rx.data_offset > TCP_COMP_MAX_CSIZE) - return -ENOMEM; - nskb = skb_copy(skb, GFP_KERNEL); if (!nskb) return -ENOMEM;
- if (ctx->rx.data_offset) - memcpy(ctx->rx.compressed_data, ctx->rx.remaining_data, - ctx->rx.data_offset); - - memcpy((char *)ctx->rx.compressed_data + ctx->rx.data_offset, - (char *)skb->data + rxm->offset, plen - rxm->offset); - - inbuf.src = ctx->rx.compressed_data; - inbuf.pos = 0; - inbuf.size = plen - rxm->offset + ctx->rx.data_offset; - ctx->rx.data_offset = 0; + while (compressed_len < (skb->len - rxm->offset)) { + len = 0; + plen = skb->len - rxm->offset - compressed_len; + if (plen > TCP_COMP_MAX_CSIZE) + plen = TCP_COMP_MAX_CSIZE;
- outbuf.dst = ctx->rx.plaintext_data; - outbuf.pos = 0; - outbuf.size = TCP_COMP_MAX_CSIZE * 32; + inbuf.src = (char *)skb->data + rxm->offset + compressed_len; + inbuf.pos = 0; + inbuf.size = plen;
- while (1) { - size_t ret; + outbuf.dst = ctx->rx.plaintext_data; + outbuf.pos = 0; + outbuf.size = TCP_COMP_MAX_CSIZE * 32;
- to = outbuf.dst; ret = ZSTD_decompressStream(ctx->rx.dstream, &outbuf, &inbuf); if (ZSTD_isError(ret)) { kfree_skb(nskb); return -EIO; }
- len = outbuf.pos - plen; - if (len > skb_tailroom(nskb)) - len = skb_tailroom(nskb); + if (!compressed_len) { + len = outbuf.pos - skb->len; + if (len > skb_tailroom(nskb)) + len = skb_tailroom(nskb);
- __skb_put(nskb, len); + __skb_put(nskb, len);
- len += plen; - skb_copy_to_linear_data(nskb, to, len); + len += skb->len; + skb_copy_to_linear_data(nskb, to, len); + }
while ((to += len, outbuf.pos -= len) > 0) { struct page *pages; skb_frag_t *frag;
- if (WARN_ON(skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS)) { - kfree_skb(nskb); - return -EMSGSIZE; + if (skb_shinfo(nskb)->nr_frags >= MAX_SKB_FRAGS) { + nr_frags_over = 1; + break; }
frag = skb_shinfo(nskb)->frags + skb_shinfo(nskb)->nr_frags; pages = alloc_pages(__GFP_NOWARN | GFP_KERNEL | __GFP_COMP, TCP_COMP_ALLOC_ORDER); - if (!pages) { kfree_skb(nskb); return -ENOMEM; @@ -678,25 +652,17 @@ static int tcp_comp_decompress(struct sock *sk, struct sk_buff *skb, int flags) skb_shinfo(nskb)->nr_frags++; }
- if (ret == 0) + if (nr_frags_over) break;
- if (inbuf.pos >= plen || !inbuf.pos) { - if (inbuf.pos < inbuf.size) { - memcpy((char *)ctx->rx.remaining_data, - (char *)inbuf.src + inbuf.pos, - inbuf.size - inbuf.pos); - ctx->rx.data_offset = inbuf.size - inbuf.pos; - } - break; - } + compressed_len += inbuf.pos; }
ctx->rx.dpkt = nskb; rxm = strp_msg(nskb); rxm->full_len = nskb->len; rxm->offset = 0; - comp_advance_skb(sk, skb, plen - rxm->offset); + comp_advance_skb(sk, skb, compressed_len);
return 0; } @@ -734,6 +700,7 @@ static int tcp_comp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto recv_end; } } + skb = ctx->rx.dpkt; rxm = strp_msg(skb); chunk = min_t(unsigned int, rxm->full_len, len); @@ -885,12 +852,6 @@ static void tcp_comp_context_rx_free(struct tcp_comp_context *ctx)
kvfree(ctx->rx.plaintext_data); ctx->rx.plaintext_data = NULL; - - kvfree(ctx->rx.compressed_data); - ctx->rx.compressed_data = NULL; - - kvfree(ctx->rx.remaining_data); - ctx->rx.remaining_data = NULL; }
static void tcp_comp_context_free(struct rcu_head *head) @@ -917,6 +878,7 @@ void tcp_cleanup_compression(struct sock *sk) kfree_skb(ctx->rx.pkt); ctx->rx.pkt = NULL; } + if (ctx->rx.dpkt) { kfree_skb(ctx->rx.dpkt); ctx->rx.dpkt = NULL;
From: Wei Li liwei391@huawei.com
hulk inclusion
category: bugfix
bugzilla: 173968, https://gitee.com/openeuler/kernel/issues/I3J87Y
CVE: NA
-------------------------------------------------
In commit 6ab918569ad4 ("watchdog: Fix check_preemption_disabled() error"), we tried to fix the check_preemption_disabled() error by disabling preemption in hardlockup_detector_perf_init(), but missed that perf_event_create_kernel_counter() may sleep.
To fix the issue fully, reimplement hardlockup_detector_perf_init() through smp_call_on_cpu() instead of disabling preemption.
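The property being relied on is that smp_call_on_cpu() executes the callback from a kworker pinned to the target CPU, i.e. in sleepable process context, which a preempt-disabled section is not. A schematic of the pattern (function names here are illustrative):

#include <linux/smp.h>

/*
 * Runs on the requested CPU via a pinned kworker, so it is ordinary
 * process context and may sleep (allocate memory, create perf events, ...).
 */
static int probe_on_cpu(void *arg)
{
	/* work that must execute on a specific CPU and may sleep */
	return 0;
}

static int probe_cpu(unsigned int cpu)
{
	/* last argument: false means 'cpu' is a logical CPU id */
	return smp_call_on_cpu(cpu, probe_on_cpu, NULL, false);
}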
Fixes: 6ab918569ad4 ("watchdog: Fix check_preemption_disabled() error") Signed-off-by: Wei Li liwei391@huawei.com Reviewed-by: Cheng Jian cj.chengjian@huawei.com Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/watchdog_hld.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c index f535ddd76315..b8a2d3b2cd9c 100644 --- a/kernel/watchdog_hld.c +++ b/kernel/watchdog_hld.c @@ -499,22 +499,25 @@ void __init hardlockup_detector_perf_restart(void) } }
-/** - * hardlockup_detector_perf_init - Probe whether NMI event is available at all - */ -int __init hardlockup_detector_perf_init(void) +int __init __hardlockup_detector_perf_init(void *not_used) { - int ret; + int ret = hardlockup_detector_event_create();
- preempt_disable(); - ret = hardlockup_detector_event_create(); if (ret) { pr_info("Perf NMI watchdog permanently disabled\n"); } else { perf_event_release_kernel(this_cpu_read(watchdog_ev)); this_cpu_write(watchdog_ev, NULL); } - preempt_enable(); return ret; } + +/** + * hardlockup_detector_perf_init - Probe whether NMI event is available at all + */ +int __init hardlockup_detector_perf_init(void) +{ + return smp_call_on_cpu(get_boot_cpu_id(), + __hardlockup_detector_perf_init, NULL, false); +} #endif /* CONFIG_HARDLOCKUP_DETECTOR_PERF */
From: Miaohe Lin linmiaohe@huawei.com
euler inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I4IHL1
CVE: NA
-------------------------------------------------
Ipvlan l2e mode caches the skbuff for local xmit in ipvlan_xmit_mode_l2e(). But when TSO/GSO is disabled, this results in a performance loss.

So stop caching the skbuff when TSO/GSO is disabled.
Signed-off-by: Miaohe Lin linmiaohe@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Keefe Liu liuqifa@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: Yuehaibing yuehaibing@huawei.com Reviewed-by: Wenan Mao maowenan@huawei.com Signed-off-by: Chen Zhou chenzhou10@huawei.com Signed-off-by: Wang Hai wanghai38@huawei.com Reviewed-by: Wei Yongjun weiyongjun1@huawei.com Reviewed-by: Yue Haibing yuehaibing@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/net/ipvlan/ipvlan_core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 8de1f58d2fab..56ae0814982e 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -793,8 +793,9 @@ static int ipvlan_xmit_mode_l2e(struct sk_buff *skb, struct net_device *dev) return NET_XMIT_DROP; }
- if (unlikely(ipvlan_l2e_local_xmit_event(ipvlan, - &skb))) + if (unlikely((dev->features & + (NETIF_F_GSO | NETIF_F_TSO)) && + ipvlan_l2e_local_xmit_event(ipvlan, &skb))) return NET_XMIT_DROP; return ipvlan_rcv_frame(addr, &skb, true); }
From: Zhen Lei thunder.leizhen@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://e.gitee.com/open_euler/issues/list?issue=I4QSVV
CVE: NA
-------------------------------------------------------------------------
Commit 587e6c10a7ce ("iommu/arm-smmu-v3: Add and use static helper function arm_smmu_cmdq_issue_cmd_with_sync()") was backported from the RFC version. Some minor changes were made to the patch when it was merged into mainline; see the following link:
Link: https://www.spinics.net/lists/arm-kernel/msg914548.html

In addition, two patches based on an earlier mainline still incorrectly use arm_smmu_cmdq_issue_sync(); fix them as well.
Fixes: 3e63033675c9 ("iommu/arm-smmu-v3: Seize private ASID") Fixes: 04039cc97a88 ("iommu/smmuv3: Implement cache_invalidate") Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 ++++++++------------- 1 file changed, 10 insertions(+), 18 deletions(-)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0702408997c9..f6868511ad01 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1030,8 +1030,9 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, return ret; }
-static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_ent *ent) +static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent, + bool sync) { u64 cmd[CMDQ_ENT_DWORDS];
@@ -1041,26 +1042,19 @@ static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; }
- return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, false); + return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync); }
-static int __maybe_unused arm_smmu_cmdq_issue_sync(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { - return arm_smmu_cmdq_issue_cmdlist(smmu, NULL, 0, true); + return __arm_smmu_cmdq_issue_cmd(smmu, ent, false); }
static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent) { - u64 cmd[CMDQ_ENT_DWORDS]; - - if (arm_smmu_cmdq_build_cmd(cmd, ent)) { - dev_warn(smmu->dev, "ignoring unknown CMDQ opcode 0x%x\n", - ent->opcode); - return -EINVAL; - } - - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, true); + return __arm_smmu_cmdq_issue_cmd(smmu, ent, true); }
static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, @@ -2255,8 +2249,7 @@ static void __arm_smmu_tlb_inv_context(struct arm_smmu_domain *smmu_domain, cmd.opcode = CMDQ_OP_TLBI_NH_ASID; cmd.tlbi.asid = ext_asid; cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } else if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { arm_smmu_tlb_inv_asid(smmu, smmu_domain->s1_cfg.cd.asid); } else { @@ -3853,8 +3846,7 @@ arm_smmu_cache_invalidate(struct iommu_domain *domain, struct device *dev,
/* Global S1 invalidation */ cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; - arm_smmu_cmdq_issue_cmd(smmu, &cmd); - arm_smmu_cmdq_issue_sync(smmu); + arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); return 0; }
From: Cui GaoSheng cuigaosheng1@huawei.com
hulk inclusion
category: bugfix
bugzilla: 186133 https://gitee.com/openeuler/kernel/issues/I4RGWS?from=project-issue
CVE: NA
-----------------------------------------------------------------
When "audit=1" is added to the cmdline and the audit_hold_queue is kept non-empty, flushing the hold queue falls into an infinite loop. Fix it by stopping the flush of the hold queue when the netlink connection is abnormal.
Fixes: bd8698d87053 ("audit: improve robustness of the audit queue handling") Signed-off-by: Cui GaoSheng cuigaosheng1@huawei.com Reviewed-by: weiyang wang wangweiyang2@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/audit.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/kernel/audit.c b/kernel/audit.c index 2a38cbaf3ddb..21be62bc8205 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -732,6 +732,8 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, if (!sk) { if (err_hook) (*err_hook)(skb); + if (queue == &audit_hold_queue) + goto out; continue; }
@@ -748,6 +750,8 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, (*err_hook)(skb); if (rc == -EAGAIN) rc = 0; + if (queue == &audit_hold_queue) + goto out; /* continue to drain the queue */ continue; } else @@ -759,6 +763,7 @@ static int kauditd_send_queue(struct sock *sk, u32 portid, } }
+out: return (rc >= 0 ? 0 : rc); }
From: Jingxian He hejingxian@huawei.com
euleros inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I4REE5
CVE: NA
------------

1. Refactor pin memory reserve: move the memory reserve functions to pin_mem.c, and avoid using unnecessary macros.
2. Refactor pid reserve code: in order to avoid unnecessary compile-time macros in pid.c, add stub functions for free_reserved_pid() and reserve_pids().
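The stub-function approach keeps callers free of #ifdef blocks: when the feature is disabled, the header supplies empty static inline functions that the compiler optimizes away. A generic sketch of the pattern (CONFIG_EXAMPLE_FEATURE and the function names are placeholders):

#include <linux/idr.h>

/* In the feature's header: real declarations when the feature is built in,
 * empty inline stubs otherwise. */
#ifdef CONFIG_EXAMPLE_FEATURE
extern void example_reserve_pids(struct idr *idr, int pid_max);
#else
static inline void example_reserve_pids(struct idr *idr, int pid_max) {}
#endif

/* Callers need no conditional compilation; when the feature is disabled
 * the call compiles away entirely. */
static void caller_init(struct idr *idr, int pid_max)
{
	example_reserve_pids(idr, pid_max);
}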
Signed-off-by: Jingxian He hejingxian@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/kernel/setup.c | 8 +-- arch/arm64/mm/init.c | 53 +----------------- include/linux/pin_mem.h | 34 +++++++++--- kernel/pid.c | 12 ++--- mm/pin_mem.c | 110 ++++++++++++++++++++++++++++++-------- 5 files changed, 119 insertions(+), 98 deletions(-)
diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 92d75e381bb1..58d69e2e7538 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -30,9 +30,7 @@ #include <linux/psci.h> #include <linux/sched/task.h> #include <linux/mm.h> -#ifdef CONFIG_PIN_MEMORY #include <linux/pin_mem.h> -#endif
#include <asm/acpi.h> #include <asm/fixmap.h> @@ -297,11 +295,7 @@ static void __init request_standard_resources(void) request_resource(res, &quick_kexec_res); #endif
-#ifdef CONFIG_PIN_MEMORY - if (pin_memory_resource.end && pin_memory_resource.start >= res->start && - pin_memory_resource.end <= res->end) - request_resource(res, &pin_memory_resource); -#endif + request_pin_mem_res(res); } }
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 6ebfabde16f3..1364d52cbaa8 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -30,9 +30,7 @@ #include <linux/crash_dump.h> #include <linux/hugetlb.h> #include <linux/acpi_iort.h> -#ifdef CONFIG_PIN_MEMORY #include <linux/pin_mem.h> -#endif
#include <asm/boot.h> #include <asm/fixmap.h> @@ -56,52 +54,6 @@ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr);
-#ifdef CONFIG_PIN_MEMORY -struct resource pin_memory_resource = { - .name = "Pin memory", - .start = 0, - .end = 0, - .flags = IORESOURCE_MEM, - .desc = IORES_DESC_RESERVED -}; - -static void __init reserve_pin_memory_res(void) -{ - unsigned long long mem_start, mem_len; - int ret; - - ret = parse_pin_memory(boot_command_line, memblock_phys_mem_size(), - &mem_len, &mem_start); - if (ret || !mem_len) - return; - - mem_len = PAGE_ALIGN(mem_len); - - if (!memblock_is_region_memory(mem_start, mem_len)) { - pr_warn("cannot reserve for pin memory: region is not memory!\n"); - return; - } - - if (memblock_is_region_reserved(mem_start, mem_len)) { - pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n"); - return; - } - - if (!IS_ALIGNED(mem_start, SZ_2M)) { - pr_warn("cannot reserve for pin memory: base address is not 2MB aligned\n"); - return; - } - - memblock_reserve(mem_start, mem_len); - pin_memory_resource.start = mem_start; - pin_memory_resource.end = mem_start + mem_len - 1; -} -#else -static void __init reserve_pin_memory_res(void) -{ -} -#endif /* CONFIG_PIN_MEMORY */ - /* * If the corresponding config options are enabled, we create both ZONE_DMA * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory @@ -729,11 +681,8 @@ void __init mem_init(void) /* this will put all unused low memory onto the freelists */ memblock_free_all();
-#ifdef CONFIG_PIN_MEMORY /* pre alloc the pages for pin memory */ - init_reserve_page_map((unsigned long)pin_memory_resource.start, - (unsigned long)(pin_memory_resource.end - pin_memory_resource.start)); -#endif + init_reserve_page_map();
mem_init_print_info(NULL);
diff --git a/include/linux/pin_mem.h b/include/linux/pin_mem.h index 6c54482a42a1..24e64efe4e51 100644 --- a/include/linux/pin_mem.h +++ b/include/linux/pin_mem.h @@ -83,17 +83,35 @@ extern int pagemap_get(struct mm_struct *mm, void *mem_walk, unsigned long *pte_entry, unsigned int *count);
extern int init_pagemap_read(void); -/* reserve space for pin memory*/ -#ifdef CONFIG_ARM64 -extern struct resource pin_memory_resource; -#endif -extern void init_reserve_page_map(unsigned long map_addr, unsigned long map_size); + +extern void __init reserve_pin_memory_res(void); + +extern void request_pin_mem_res(struct resource *res); + +extern void init_reserve_page_map(void); + +#else + +static inline void __init reserve_pin_memory_res(void) {} + +static inline void request_pin_mem_res(struct resource *res) {} + +static inline void init_reserve_page_map(void) {} + +#endif /* CONFIG_PIN_MEMORY */
#ifdef CONFIG_PID_RESERVE -extern bool is_need_reserve_pids(void); + extern void free_reserved_pid(struct idr *idr, int pid); + extern void reserve_pids(struct idr *idr, int pid_max); -#endif
-#endif /* CONFIG_PIN_MEMORY */ +#else + +static inline void free_reserved_pid(struct idr *idr, int pid) {} + +static inline void reserve_pids(struct idr *idr, int pid_max) {} + +#endif /* CONFIG_PID_RESERVE */ + #endif /* _LINUX_PIN_MEMORY_H */ diff --git a/kernel/pid.c b/kernel/pid.c index 28fdf3dc1005..3f9490082180 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -45,9 +45,7 @@ #include <net/sock.h> #include <linux/kmemleak.h> #include <uapi/linux/pidfd.h> -#ifdef CONFIG_PID_RESERVE #include <linux/pin_mem.h> -#endif
struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), @@ -212,9 +210,8 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, spin_lock_irq(&pidmap_lock);
if (tid) { -#ifdef CONFIG_PID_RESERVE free_reserved_pid(&tmp->idr, tid); -#endif + nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* @@ -659,10 +656,9 @@ void __init pid_idr_init(void)
init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); -#ifdef CONFIG_PID_RESERVE - if (is_need_reserve_pids()) - reserve_pids(&init_pid_ns.idr, pid_max); -#endif + + reserve_pids(&init_pid_ns.idr, pid_max); + hdr = register_sysctl_paths(pid_kern_path, pid_ctl_table); kmemleak_not_leak(hdr); } diff --git a/mm/pin_mem.c b/mm/pin_mem.c index ff6ddd3a47f9..c158b7768d67 100644 --- a/mm/pin_mem.c +++ b/mm/pin_mem.c @@ -17,6 +17,7 @@ #include <linux/ctype.h> #include <linux/highmem.h> #include <crypto/sha2.h> +#include <linux/memblock.h>
#define MAX_PIN_PID_NUM 128 #define DEFAULT_REDIRECT_SPACE_SIZE 0x100000 @@ -1023,22 +1024,6 @@ vm_fault_t do_mem_remap(int pid, struct mm_struct *mm) } EXPORT_SYMBOL_GPL(do_mem_remap);
-#if defined(CONFIG_ARM64) -void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) -{ - void *addr; - - if (!map_addr || !map_size) - return; - addr = phys_to_virt(map_addr); - init_page_map_info((struct pin_mem_dump_info *)addr, map_size); -} -#else -void init_reserve_page_map(unsigned long map_addr, unsigned long map_size) -{ -} -#endif - static void free_all_reserved_pages(void) { unsigned int i, j, index, order; @@ -1088,14 +1073,92 @@ void clear_pin_memory_record(void) } EXPORT_SYMBOL_GPL(clear_pin_memory_record);
-#ifdef CONFIG_PID_RESERVE -struct idr *reserve_idr; +static struct resource pin_memory_resource = { + .name = "Pin memory", + .start = 0, + .end = 0, + .flags = IORESOURCE_MEM, + .desc = IORES_DESC_RESERVED +}; + +static unsigned long long pin_mem_start; +static unsigned long long pin_mem_len;
-/* test if there exist pin memory tasks */ -bool is_need_reserve_pids(void) +static int __init parse_pin_memory(char *cmdline) { - return (pin_pid_num > 0); + char *cur = cmdline; + + pin_mem_len = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + pin_mem_start = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("pinmem: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; } +early_param("pinmemory", parse_pin_memory); + +void __init reserve_pin_memory_res(void) +{ + unsigned long long mem_start = pin_mem_start; + unsigned long long mem_len = pin_mem_len; + + if (!pin_mem_len) + return; + + mem_len = PAGE_ALIGN(mem_len); + + if (!memblock_is_region_memory(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region is not memory!\n"); + return; + } + + if (memblock_is_region_reserved(mem_start, mem_len)) { + pr_warn("cannot reserve for pin memory: region overlaps reserved memory!\n"); + return; + } + + memblock_reserve(mem_start, mem_len); + pr_debug("pin memory resource reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + mem_start, mem_start + mem_len, mem_len >> 20); + + pin_memory_resource.start = mem_start; + pin_memory_resource.end = mem_start + mem_len - 1; +} + +void request_pin_mem_res(struct resource *res) +{ + if (pin_memory_resource.end && + pin_memory_resource.start >= res->start && + pin_memory_resource.end <= res->end) + request_resource(res, &pin_memory_resource); +} + +void init_reserve_page_map(void) +{ + void *addr; + unsigned long map_addr, map_size; + + map_addr = (unsigned long)pin_memory_resource.start; + map_size = (unsigned long)(pin_memory_resource.end - pin_memory_resource.start + 1); + if (!map_addr || !map_size) + return; + + addr = phys_to_virt(map_addr); + init_page_map_info((struct pin_mem_dump_info *)addr, map_size); +} + +#endif /* CONFIG_PIN_MEMORY */ + +#ifdef CONFIG_PID_RESERVE +struct idr *reserve_idr;
void free_reserved_pid(struct idr *idr, int pid) { @@ -1121,8 +1184,9 @@ void reserve_pids(struct idr *idr, int pid_max) unsigned int index; struct page_map_info *pmi;
- if (!max_pin_pid_num) + if (!pin_pid_num || !max_pin_pid_num) return; + reserve_idr = idr; for (index = 0; index < pin_pid_num; index++) { pmi = &(user_space_reserve_start[index]); @@ -1137,6 +1201,6 @@ void reserve_pids(struct idr *idr, int pid_max) } } } + #endif /* CONFIG_PID_RESERVE */
-#endif /* CONFIG_PIN_MEMORY */
From: Jingxian He hejingxian@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4QPBH CVE: NA
------------
Use the PG_hotreplace flag to mark pinned pages that need to be hot-replaced into the recovery process.
Signed-off-by: Jingxian He hejingxian@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/page-flags.h | 9 +++++++++ include/trace/events/mmflags.h | 9 ++++++++- 2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b47a5514ebc8..eb2fe22bc0e9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -140,6 +140,9 @@ enum pageflags { PG_arch_2, #endif PG_pool, /* Used to track page allocated from dynamic hugetlb pool */ +#ifdef CONFIG_PIN_MEMORY + PG_hotreplace, +#endif
/* Add reserved page flags for internal extension. For the new page * flags which backported from kernel upstream, please place them @@ -431,6 +434,12 @@ PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) #endif
+#ifdef CONFIG_PIN_MEMORY +PAGEFLAG(Hotreplace, hotreplace, PF_ANY) +#else +PAGEFLAG_FALSE(Hotreplace) +#endif + #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 1a2896fc039e..fba14499b87e 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -87,6 +87,12 @@ #define IF_HAVE_PG_ARCH_2(flag,string) #endif
+#ifdef CONFIG_PIN_MEMORY +#define IF_HAVE_PG_HOTREPLACE(flag, string) ,{1UL << flag, string} +#else +#define IF_HAVE_PG_HOTREPLACE(flag, string) +#endif + #define __def_pageflag_names \ {1UL << PG_locked, "locked" }, \ {1UL << PG_waiters, "waiters" }, \ @@ -115,7 +121,8 @@ IF_HAVE_PG_UNCACHED(PG_uncached, "uncached" ) \ IF_HAVE_PG_HWPOISON(PG_hwpoison, "hwpoison" ) \ IF_HAVE_PG_IDLE(PG_young, "young" ) \ IF_HAVE_PG_IDLE(PG_idle, "idle" ) \ -IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ), \ +IF_HAVE_PG_ARCH_2(PG_arch_2, "arch_2" ) \ +IF_HAVE_PG_HOTREPLACE(PG_hotreplace, "hotreplace" ), \ {1UL << PG_reserve_pgflag_0, "reserve_pgflag_0"}, \ {1UL << PG_reserve_pgflag_1, "reserve_pgflag_1"}
From: Jingxian He hejingxian@huawei.com
euleros inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4QPBH CVE: NA
------------
1. Improve rmap handling of pinned memory pages: add the Hotreplace flag for pinned pages to avoid splitting. When the Hotreplace flag is set, the page is not added to the deferred_split page list during rmap. If pinned pages were added to the deferred_split list, the deferred_split_scan ops would split pages that have already been pinned, and once a pinned page is split we can no longer remap it to the recovery process using the recorded pin-memory mapping rule. Moreover, the deferred_split list node can be corrupted if deferred_split_scan and the pin-page remapping run at the same time.
2. Improve the free path for pinned memory pages: use put_page() instead of calling __free_pages() directly, as sketched below.
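A minimal sketch of the release side this implies (release_pinned_page() is an assumed helper for illustration, not a hunk from this patch; the flag, the PAGE_FLAGS_CHECK_RESERVED test and the refcount behaviour come from the patches above):

#include <linux/mm.h>
#include <linux/page-flags.h>

/* Drop one recorded pin-memory page: the refcount was raised when the
 * page was collected, so releasing it means dropping that reference,
 * not freeing a fixed order; put_page() also handles compound pages. */
static void release_pinned_page(struct page *page)
{
	/* allow normal splitting/freeing again */
	ClearPageHotreplace(page);
	if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED))
		put_page(page);
}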
Signed-off-by: Jingxian He hejingxian@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/pin_mem.c | 20 ++++++++++++-------- mm/rmap.c | 12 ++++++++---- 2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/mm/pin_mem.c b/mm/pin_mem.c index c158b7768d67..34fe373c5fcc 100644 --- a/mm/pin_mem.c +++ b/mm/pin_mem.c @@ -531,6 +531,7 @@ int collect_pmd_huge_pages(struct task_struct *task, if (IS_PTE_PRESENT(pte_entry[0])) { temp_page = pfn_to_page(pte_entry[0] & PM_PFRAME_MASK); if (PageHead(temp_page)) { + SetPageHotreplace(temp_page); atomic_inc(&((temp_page)->_refcount)); start += HPAGE_PMD_SIZE; pme->phy_addr_array[index] = page_to_phys(temp_page); @@ -611,6 +612,7 @@ int collect_normal_pages(struct task_struct *task, continue; } tmp_page = pfn_to_page(pte_entry[i] & PM_PFRAME_MASK); + SetPageHotreplace(tmp_page); atomic_inc(&(tmp_page->_refcount)); phy_addr_array[i] = ((pte_entry[i] & PM_PFRAME_MASK) << PAGE_SHIFT); } @@ -839,14 +841,16 @@ vm_fault_t remap_normal_pages(struct mm_struct *mm, struct vm_area_struct *vma, ret = do_anon_page_remap(vma, address, pmd, page); if (ret) goto free; + ClearPageHotreplace(page); } return 0;
free: + ClearPageHotreplace(page); for (i = j; i < pme->nr_pages; i++) { phy_addr = pme->phy_addr_array[i]; if (phy_addr) { - __free_page(phys_to_page(phy_addr)); + put_page(phys_to_page(phy_addr)); pme->phy_addr_array[i] = 0; } } @@ -927,16 +931,18 @@ vm_fault_t remap_huge_pmd_pages(struct mm_struct *mm, struct vm_area_struct *vma ret = do_anon_huge_page_remap(vma, address, pmd, page); if (ret) goto free; + ClearPageHotreplace(page); } return 0;
free: + ClearPageHotreplace(page); for (i = j; i < pme->nr_pages; i++) { phy_addr = pme->phy_addr_array[i]; if (phy_addr) { page = phys_to_page(phy_addr); if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { - __free_pages(page, HPAGE_PMD_ORDER); + put_page(page); pme->phy_addr_array[i] = 0; } } @@ -950,7 +956,6 @@ static void free_unmap_pages(struct page_map_info *pmi, { unsigned int i, j; unsigned long phy_addr; - unsigned int order; struct page *page;
pme = (struct page_map_entry *)(next_pme(pme)); @@ -959,9 +964,8 @@ static void free_unmap_pages(struct page_map_info *pmi, phy_addr = pme->phy_addr_array[i]; if (phy_addr) { page = phys_to_page(phy_addr); - order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { - __free_pages(page, order); + put_page(page); pme->phy_addr_array[i] = 0; } } @@ -1026,7 +1030,7 @@ EXPORT_SYMBOL_GPL(do_mem_remap);
static void free_all_reserved_pages(void) { - unsigned int i, j, index, order; + unsigned int i, j, index; struct page_map_info *pmi; struct page_map_entry *pme; struct page *page; @@ -1042,12 +1046,12 @@ static void free_all_reserved_pages(void) pme = pmi->pme; for (i = 0; i < pmi->entry_num; i++) { for (j = 0; j < pme->nr_pages; j++) { - order = pme->is_huge_page ? HPAGE_PMD_ORDER : 0; phy_addr = pme->phy_addr_array[j]; if (phy_addr) { page = phys_to_page(phy_addr); + ClearPageHotreplace(page); if (!(page->flags & PAGE_FLAGS_CHECK_RESERVED)) { - __free_pages(page, order); + put_page(page); pme->phy_addr_array[j] = 0; } } diff --git a/mm/rmap.c b/mm/rmap.c index e894efb94650..a780862cd226 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1313,8 +1313,10 @@ static void page_remove_anon_compound_rmap(struct page *page) * page of the compound page is unmapped, but at least one * small page is still mapped. */ - if (nr && nr < thp_nr_pages(page)) - deferred_split_huge_page(page); + if (nr && nr < thp_nr_pages(page)) { + if (!PageHotreplace(page)) + deferred_split_huge_page(page); + } } else { nr = thp_nr_pages(page); } @@ -1361,8 +1363,10 @@ void page_remove_rmap(struct page *page, bool compound) if (unlikely(PageMlocked(page))) clear_page_mlock(page);
- if (PageTransCompound(page)) - deferred_split_huge_page(compound_head(page)); + if (PageTransCompound(page)) { + if (!PageHotreplace(compound_head(page))) + deferred_split_huge_page(compound_head(page)); + }
/* * It would be tidy to reset the PageAnon mapping here,
From: Zhang Yi yi.zhang@huawei.com
mainline inclusion from mainline-5.17-rc1 commit 5c48a7df91499e371ef725895b2e2d21a126e227 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RN96 CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
---------------------------
Our syzkaller reported a use-after-free issue: a freed buffer_head is accessed on the writeback page in __ext4_journalled_writepage(). The problem is that if a truncate races with the data=journalled writeback procedure, the writeback length can become zero, so bget_one() does not take the buffer_head's refcount; the truncate procedure then releases the buffers once we drop the page lock, and finally the last ext4_walk_page_buffers() triggers the use-after-free.
           sync                                truncate
ext4_sync_file()
 file_write_and_wait_range()
                                       ext4_setattr(0)
                                         inode->i_size = 0
  ext4_writepage()
   len = 0
   __ext4_journalled_writepage()
    page_bufs = page_buffers(page)
    ext4_walk_page_buffers(bget_one) <- does not get refcount
                                       do_invalidatepage()
                                        free_buffer_head()
    ext4_walk_page_buffers(page_bufs) <- trigger use-after-free
After commit bdf96838aea6 ("ext4: fix race between truncate and __ext4_journalled_writepage()"), the racing case is already handled, so bget_one() and bput_one() are no longer needed. This patch simply removes those hunks and rechecks i_size after re-taking the page lock to make it safe.
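A condensed sketch of the check performed after re-taking the page lock (journalled_len_after_relock() is a hypothetical helper used only for illustration; the logic mirrors the hunk below):

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>

static int journalled_len_after_relock(struct page *page,
				       struct address_space *mapping,
				       struct inode *inode, unsigned int *len)
{
	loff_t size = i_size_read(inode);

	/* the page may have been truncated while the lock was dropped */
	if (page->mapping != mapping || page_offset(page) > size)
		return -EAGAIN;

	/* recompute how many bytes still need to be journalled */
	if (page->index == size >> PAGE_SHIFT)
		*len = size & ~PAGE_MASK;
	else
		*len = PAGE_SIZE;
	return 0;
}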
Fixes: bdf96838aea6 ("ext4: fix race between truncate and __ext4_journalled_writepage()") Signed-off-by: Zhang Yi yi.zhang@huawei.com Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20211225090937.712867-1-yi.zhang@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu
Conflict: fs/ext4/inode.c
Signed-off-by: Zhang Yi yi.zhang@huawei.com Reviewed-by: Ye bin yebin10@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/inode.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 164161e4c144..74f2504b939b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1844,28 +1844,16 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return 0; }
-static int bget_one(handle_t *handle, struct buffer_head *bh) -{ - get_bh(bh); - return 0; -} - -static int bput_one(handle_t *handle, struct buffer_head *bh) -{ - put_bh(bh); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, unsigned int len) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; - struct buffer_head *page_bufs = NULL; handle_t *handle = NULL; int ret = 0, err = 0; int inline_data = ext4_has_inline_data(inode); struct buffer_head *inode_bh = NULL; + loff_t size;
ClearPageChecked(page);
@@ -1875,14 +1863,6 @@ static int __ext4_journalled_writepage(struct page *page, inode_bh = ext4_journalled_write_inline_data(inode, len, page); if (inode_bh == NULL) goto out; - } else { - page_bufs = page_buffers(page); - if (!page_bufs) { - BUG(); - goto out; - } - ext4_walk_page_buffers(handle, page_bufs, 0, len, - NULL, bget_one); } /* * We need to release the page lock before we start the @@ -1903,7 +1883,8 @@ static int __ext4_journalled_writepage(struct page *page,
lock_page(page); put_page(page); - if (page->mapping != mapping) { + size = i_size_read(inode); + if (page->mapping != mapping || page_offset(page) > size) { /* The page got truncated from under us */ ext4_journal_stop(handle); ret = 0; @@ -1913,6 +1894,13 @@ static int __ext4_journalled_writepage(struct page *page, if (inline_data) { ret = ext4_mark_inode_dirty(handle, inode); } else { + struct buffer_head *page_bufs = page_buffers(page); + + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; + ret = ext4_walk_page_buffers(handle, page_bufs, 0, len, NULL, do_journal_get_write_access);
@@ -1933,9 +1921,6 @@ static int __ext4_journalled_writepage(struct page *page, out: unlock_page(page); out_no_pagelock: - if (!inline_data && page_bufs) - ext4_walk_page_buffers(NULL, page_bufs, 0, len, - NULL, bput_one); brelse(inode_bh); return ret; }
From: Axel Rasmussen axelrasmussen@google.com
mainline inclusion from mainline-5.11-rc1 commit 2b5067a8143e34aa3fa57a20fb8a3c40d905f942 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
The goal of these tracepoints is to be able to debug lock contention issues. This lock is acquired on most (all?) mmap / munmap / page fault operations, so a multi-threaded process which does a lot of these can experience significant contention.
We trace just before we start acquisition, when the acquisition returns (whether it succeeded or not), and when the lock is released (or downgraded). The events are broken out by lock type (read / write).
The events are also broken out by memcg path. For container-based workloads, users often think of several processes in a memcg as a single logical "task", so collecting statistics at this level is useful.
The end goal is to get latency information. This isn't directly included in the trace events. Instead, users are expected to compute the time between "start locking" and "acquire returned", using e.g. synthetic events or BPF. The benefit we get from this is simpler code.
Because we use tracepoint_enabled() to decide whether or not to trace, this patch has effectively no overhead unless tracepoints are enabled at runtime. If tracepoints are enabled, there is a performance impact, but how much depends on exactly what e.g. the BPF program does.
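A minimal sketch of the tracepoint_enabled() pattern the patch relies on (the names mirror the patch, but this block is illustrative rather than a hunk): the header only carries the lightweight declaration, and the static-key test keeps the disabled case down to a patched branch.

#include <linux/tracepoint-defs.h>
#include <linux/types.h>

struct mm_struct;

DECLARE_TRACEPOINT(mmap_lock_start_locking);

/* out-of-line helper, defined where the full tracepoint is created */
void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write);

static inline void mmap_lock_trace_start(struct mm_struct *mm, bool write)
{
	/* static-key branch: effectively free while tracing is disabled */
	if (tracepoint_enabled(mmap_lock_start_locking))
		__mmap_lock_do_trace_start_locking(mm, write);
}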
[axelrasmussen@google.com: fix use-after-free race and css ref leak in tracepoints] Link: https://lkml.kernel.org/r/20201130233504.3725241-1-axelrasmussen@google.com [axelrasmussen@google.com: v3] Link: https://lkml.kernel.org/r/20201207213358.573750-1-axelrasmussen@google.com [rostedt@goodmis.org: in-depth examples of tracepoint_enabled() usage, and per-cpu-per-context buffer design]
Link: https://lkml.kernel.org/r/20201105211739.568279-2-axelrasmussen@google.com Signed-off-by: Axel Rasmussen axelrasmussen@google.com Acked-by: Vlastimil Babka vbabka@suse.cz Cc: Steven Rostedt rostedt@goodmis.org Cc: Ingo Molnar mingo@redhat.com Cc: Michel Lespinasse walken@google.com Cc: Daniel Jordan daniel.m.jordan@oracle.com Cc: Jann Horn jannh@google.com Cc: Chinwen Chang chinwen.chang@mediatek.com Cc: Davidlohr Bueso dbueso@suse.de Cc: David Rientjes rientjes@google.com Cc: Laurent Dufour ldufour@linux.ibm.com Cc: Yafang Shao laoar.shao@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 2b5067a8143e34aa3fa57a20fb8a3c40d905f942) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mmap_lock.h | 94 ++++++++++++- include/trace/events/mmap_lock.h | 107 ++++++++++++++ mm/Makefile | 2 +- mm/mmap_lock.c | 230 +++++++++++++++++++++++++++++++ 4 files changed, 427 insertions(+), 6 deletions(-) create mode 100644 include/trace/events/mmap_lock.h create mode 100644 mm/mmap_lock.c
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 18e7eae9b5ba..0540f0156f58 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -1,11 +1,65 @@ #ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H
+#include <linux/lockdep.h> +#include <linux/mm_types.h> #include <linux/mmdebug.h> +#include <linux/rwsem.h> +#include <linux/tracepoint-defs.h> +#include <linux/types.h>
#define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock),
+DECLARE_TRACEPOINT(mmap_lock_start_locking); +DECLARE_TRACEPOINT(mmap_lock_acquire_returned); +DECLARE_TRACEPOINT(mmap_lock_released); + +#ifdef CONFIG_TRACING + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write); +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success); +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write); + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ + if (tracepoint_enabled(mmap_lock_start_locking)) + __mmap_lock_do_trace_start_locking(mm, write); +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ + if (tracepoint_enabled(mmap_lock_acquire_returned)) + __mmap_lock_do_trace_acquire_returned(mm, write, success); +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ + if (tracepoint_enabled(mmap_lock_released)) + __mmap_lock_do_trace_released(mm, write); +} + +#else /* !CONFIG_TRACING */ + +static inline void __mmap_lock_trace_start_locking(struct mm_struct *mm, + bool write) +{ +} + +static inline void __mmap_lock_trace_acquire_returned(struct mm_struct *mm, + bool write, bool success) +{ +} + +static inline void __mmap_lock_trace_released(struct mm_struct *mm, bool write) +{ +} + +#endif /* CONFIG_TRACING */ + static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); @@ -13,57 +67,86 @@ static inline void mmap_init_lock(struct mm_struct *mm)
static inline void mmap_write_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, true); down_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, true); }
static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { + __mmap_lock_trace_start_locking(mm, true); down_write_nested(&mm->mmap_lock, subclass); + __mmap_lock_trace_acquire_returned(mm, true, true); }
static inline int mmap_write_lock_killable(struct mm_struct *mm) { - return down_write_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, true, ret == 0); + return ret; }
static inline bool mmap_write_trylock(struct mm_struct *mm) { - return down_write_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, true); + ret = down_write_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, true, ret); + return ret; }
static inline void mmap_write_unlock(struct mm_struct *mm) { up_write(&mm->mmap_lock); + __mmap_lock_trace_released(mm, true); }
static inline void mmap_write_downgrade(struct mm_struct *mm) { downgrade_write(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); }
static inline void mmap_read_lock(struct mm_struct *mm) { + __mmap_lock_trace_start_locking(mm, false); down_read(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, true); }
static inline int mmap_read_lock_killable(struct mm_struct *mm) { - return down_read_killable(&mm->mmap_lock); + int ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_killable(&mm->mmap_lock); + __mmap_lock_trace_acquire_returned(mm, false, ret == 0); + return ret; }
static inline bool mmap_read_trylock(struct mm_struct *mm) { - return down_read_trylock(&mm->mmap_lock) != 0; + bool ret; + + __mmap_lock_trace_start_locking(mm, false); + ret = down_read_trylock(&mm->mmap_lock) != 0; + __mmap_lock_trace_acquire_returned(mm, false, ret); + return ret; }
static inline void mmap_read_unlock(struct mm_struct *mm) { up_read(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); }
static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) { - if (down_read_trylock(&mm->mmap_lock)) { + if (mmap_read_trylock(mm)) { rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); return true; } @@ -73,6 +156,7 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); + __mmap_lock_trace_released(mm, false); }
static inline void mmap_assert_locked(struct mm_struct *mm) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h new file mode 100644 index 000000000000..0abff67b96f0 --- /dev/null +++ b/include/trace/events/mmap_lock.h @@ -0,0 +1,107 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM mmap_lock + +#if !defined(_TRACE_MMAP_LOCK_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MMAP_LOCK_H + +#include <linux/tracepoint.h> +#include <linux/types.h> + +struct mm_struct; + +extern int trace_mmap_lock_reg(void); +extern void trace_mmap_lock_unreg(void); + +TRACE_EVENT_FN(mmap_lock_start_locking, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_acquire_returned, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, + bool success), + + TP_ARGS(mm, memcg_path, write, success), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + __field(bool, success) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + __entry->success = success; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s success=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false", + __entry->success ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +TRACE_EVENT_FN(mmap_lock_released, + + TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), + + TP_ARGS(mm, memcg_path, write), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __string(memcg_path, memcg_path) + __field(bool, write) + ), + + TP_fast_assign( + __entry->mm = mm; + __assign_str(memcg_path, memcg_path); + __entry->write = write; + ), + + TP_printk( + "mm=%p memcg_path=%s write=%s\n", + __entry->mm, + __get_str(memcg_path), + __entry->write ? "true" : "false" + ), + + trace_mmap_lock_reg, trace_mmap_lock_unreg +); + +#endif /* _TRACE_MMAP_LOCK_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/mm/Makefile b/mm/Makefile index f3dce99ee62f..4b0b5e7af40f 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o vmacache.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o $(mmu-y) + debug.o gup.o mmap_lock.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c new file mode 100644 index 000000000000..dcdde4f722a4 --- /dev/null +++ b/mm/mmap_lock.c @@ -0,0 +1,230 @@ +// SPDX-License-Identifier: GPL-2.0 +#define CREATE_TRACE_POINTS +#include <trace/events/mmap_lock.h> + +#include <linux/mm.h> +#include <linux/cgroup.h> +#include <linux/memcontrol.h> +#include <linux/mmap_lock.h> +#include <linux/mutex.h> +#include <linux/percpu.h> +#include <linux/rcupdate.h> +#include <linux/smp.h> +#include <linux/trace_events.h> + +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); +EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); + +#ifdef CONFIG_MEMCG + +/* + * Our various events all share the same buffer (because we don't want or need + * to allocate a set of buffers *per event type*), so we need to protect against + * concurrent _reg() and _unreg() calls, and count how many _reg() calls have + * been made. + */ +static DEFINE_MUTEX(reg_lock); +static int reg_refcount; /* Protected by reg_lock. */ + +/* + * Size of the buffer for memcg path names. Ignoring stack trace support, + * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. + */ +#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL + +/* + * How many contexts our trace events might be called in: normal, softirq, irq, + * and NMI. + */ +#define CONTEXT_COUNT 4 + +static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +static char **tmp_bufs; +static DEFINE_PER_CPU(int, memcg_path_buf_idx); + +/* Called with reg_lock held. */ +static void free_memcg_path_bufs(void) +{ + int cpu; + char **old = tmp_bufs; + + for_each_possible_cpu(cpu) { + *(old++) = rcu_dereference_protected( + per_cpu(memcg_path_buf, cpu), + lockdep_is_held(®_lock)); + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + } + + /* Wait for inflight memcg_path_buf users to finish. */ + synchronize_rcu(); + + old = tmp_bufs; + for_each_possible_cpu(cpu) { + kfree(*(old++)); + } + + kfree(tmp_bufs); + tmp_bufs = NULL; +} + +int trace_mmap_lock_reg(void) +{ + int cpu; + char *new; + + mutex_lock(®_lock); + + /* If the refcount is going 0->1, proceed with allocating buffers. */ + if (reg_refcount++) + goto out; + + tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs), + GFP_KERNEL); + if (tmp_bufs == NULL) + goto out_fail; + + for_each_possible_cpu(cpu) { + new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); + if (new == NULL) + goto out_fail_free; + rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + /* Don't need to wait for inflights, they'd have gotten NULL. */ + } + +out: + mutex_unlock(®_lock); + return 0; + +out_fail_free: + free_memcg_path_bufs(); +out_fail: + /* Since we failed, undo the earlier ref increment. */ + --reg_refcount; + + mutex_unlock(®_lock); + return -ENOMEM; +} + +void trace_mmap_lock_unreg(void) +{ + mutex_lock(®_lock); + + /* If the refcount is going 1->0, proceed with freeing buffers. 
*/ + if (--reg_refcount) + goto out; + + free_memcg_path_bufs(); + +out: + mutex_unlock(®_lock); +} + +static inline char *get_memcg_path_buf(void) +{ + char *buf; + int idx; + + rcu_read_lock(); + buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + if (buf == NULL) { + rcu_read_unlock(); + return NULL; + } + idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + MEMCG_PATH_BUF_SIZE; + return &buf[idx]; +} + +static inline void put_memcg_path_buf(void) +{ + this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + rcu_read_unlock(); +} + +/* + * Write the given mm_struct's memcg path to a percpu buffer, and return a + * pointer to it. If the path cannot be determined, or no buffer was available + * (because the trace event is being unregistered), NULL is returned. + * + * Note: buffers are allocated per-cpu to avoid locking, so preemption must be + * disabled by the caller before calling us, and re-enabled only after the + * caller is done with the pointer. + * + * The caller must call put_memcg_path_buf() once the buffer is no longer + * needed. This must be done while preemption is still disabled. + */ +static const char *get_mm_memcg_path(struct mm_struct *mm) +{ + char *buf = NULL; + struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm); + + if (memcg == NULL) + goto out; + if (unlikely(memcg->css.cgroup == NULL)) + goto out_put; + + buf = get_memcg_path_buf(); + if (buf == NULL) + goto out_put; + + cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE); + +out_put: + css_put(&memcg->css); +out: + return buf; +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +/* + * Trace calls must be in a separate file, as otherwise there's a circular + * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h. + */ + +void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(start_locking, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking); + +void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write, + bool success) +{ + TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned); + +void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) +{ + TRACE_MMAP_LOCK_EVENT(released, mm, write); +} +EXPORT_SYMBOL(__mmap_lock_do_trace_released);
From: Nicolas Saenz Julienne nsaenzju@redhat.com
mainline inclusion from mainline-5.14-rc1 commit 832b50725373e8c46781b7d4db104ec9cf564a6b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
mmap_lock will explicitly disable/enable preemption upon manipulating its local CPU variables. This is to be expected, but in this case, it doesn't play well with PREEMPT_RT. The preemption disabled code section also takes a spin-lock. Spin-locks in RT systems will try to schedule, which is exactly what we're trying to avoid.
To mitigate this, convert the explicit preemption handling to local_locks, which are RT-aware and disable migration instead of preemption when PREEMPT_RT=y.
The faulty call trace looks like the following:

    __mmap_lock_do_trace_*()
      preempt_disable()
      get_mm_memcg_path()
        cgroup_path()
          kernfs_path_from_node()
            spin_lock_irqsave() /* Scheduling while atomic! */
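A minimal sketch of the local_lock pattern the conversion uses (scratch_bufs is a hypothetical per-CPU buffer with the same shape as the patch's memcg_paths): on !PREEMPT_RT local_lock() still just disables preemption, while on PREEMPT_RT it becomes a per-CPU spinlock that only disables migration, so sleeping locks taken underneath remain legal.

#include <linux/local_lock.h>
#include <linux/percpu.h>
#include <linux/string.h>

struct scratch {
	local_lock_t lock;
	char buf[64];
};

static DEFINE_PER_CPU(struct scratch, scratch_bufs) = {
	.lock = INIT_LOCAL_LOCK(lock),
};

static void fill_scratch(const char *msg)
{
	struct scratch *s;

	local_lock(&scratch_bufs.lock);		/* replaces preempt_disable() */
	s = this_cpu_ptr(&scratch_bufs);
	strscpy(s->buf, msg, sizeof(s->buf));
	local_unlock(&scratch_bufs.lock);	/* replaces preempt_enable() */
}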
Link: https://lkml.kernel.org/r/20210604163506.2103900-1-nsaenzju@redhat.com Fixes: 2b5067a8143e3 ("mm: mmap_lock: add tracepoints around lock acquisition ") Signed-off-by: Nicolas Saenz Julienne nsaenzju@redhat.com Tested-by: Axel Rasmussen axelrasmussen@google.com Reviewed-by: Axel Rasmussen axelrasmussen@google.com Cc: Vlastimil Babka vbabka@suse.cz Cc: Steven Rostedt rostedt@goodmis.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 832b50725373e8c46781b7d4db104ec9cf564a6b) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/mmap_lock.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index dcdde4f722a4..2ae3f33b85b1 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -11,6 +11,7 @@ #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/trace_events.h> +#include <linux/local_lock.h>
EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); @@ -39,21 +40,30 @@ static int reg_refcount; /* Protected by reg_lock. */ */ #define CONTEXT_COUNT 4
-static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +struct memcg_path { + local_lock_t lock; + char __rcu *buf; + local_t buf_idx; +}; +static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { + .lock = INIT_LOCAL_LOCK(lock), + .buf_idx = LOCAL_INIT(0), +}; + static char **tmp_bufs; -static DEFINE_PER_CPU(int, memcg_path_buf_idx);
/* Called with reg_lock held. */ static void free_memcg_path_bufs(void) { + struct memcg_path *memcg_path; int cpu; char **old = tmp_bufs;
for_each_possible_cpu(cpu) { - *(old++) = rcu_dereference_protected( - per_cpu(memcg_path_buf, cpu), + memcg_path = per_cpu_ptr(&memcg_paths, cpu); + *(old++) = rcu_dereference_protected(memcg_path->buf, lockdep_is_held(®_lock)); - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + rcu_assign_pointer(memcg_path->buf, NULL); }
/* Wait for inflight memcg_path_buf users to finish. */ @@ -88,7 +98,7 @@ int trace_mmap_lock_reg(void) new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); if (new == NULL) goto out_fail_free; - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); /* Don't need to wait for inflights, they'd have gotten NULL. */ }
@@ -122,23 +132,24 @@ void trace_mmap_lock_unreg(void)
static inline char *get_memcg_path_buf(void) { + struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); char *buf; int idx;
rcu_read_lock(); - buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + buf = rcu_dereference(memcg_path->buf); if (buf == NULL) { rcu_read_unlock(); return NULL; } - idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - MEMCG_PATH_BUF_SIZE; return &buf[idx]; }
static inline void put_memcg_path_buf(void) { - this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); rcu_read_unlock(); }
@@ -179,14 +190,14 @@ static const char *get_mm_memcg_path(struct mm_struct *mm) #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0)
#else /* !CONFIG_MEMCG */
From: Mel Gorman mgorman@techsingularity.net
mainline inclusion from mainline-5.14-rc1 commit d01079f3d0c0a9e306ffbdb2694c5281bd9e065e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
make W=1 generates the following warning in mmap_lock.c for allnoconfig
mm/mmap_lock.c:213:6: warning: no previous prototype for `__mmap_lock_do_trace_start_locking' [-Wmissing-prototypes]
 void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/mmap_lock.c:219:6: warning: no previous prototype for `__mmap_lock_do_trace_acquire_returned' [-Wmissing-prototypes]
 void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
mm/mmap_lock.c:226:6: warning: no previous prototype for `__mmap_lock_do_trace_released' [-Wmissing-prototypes]
 void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
On !CONFIG_TRACING configurations, the code is dead so put it behind an #ifdef.
[cuibixuan@huawei.com: fix warning when CONFIG_TRACING is not defined] Link: https://lkml.kernel.org/r/20210531033426.74031-1-cuibixuan@huawei.com
Link: https://lkml.kernel.org/r/20210520084809.8576-13-mgorman@techsingularity.net Signed-off-by: Mel Gorman mgorman@techsingularity.net Signed-off-by: Bixuan Cui cuibixuan@huawei.com Reviewed-by: Yang Shi shy828301@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Cc: Dan Streetman ddstreet@ieee.org Cc: David Hildenbrand david@redhat.com Cc: Michal Hocko mhocko@kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit d01079f3d0c0a9e306ffbdb2694c5281bd9e065e) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/mmap_lock.c | 59 +++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 27 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 2ae3f33b85b1..f5852a058ce0 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -153,6 +153,37 @@ static inline void put_memcg_path_buf(void) rcu_read_unlock(); }
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + const char *memcg_path; \ + preempt_disable(); \ + memcg_path = get_mm_memcg_path(mm); \ + trace_mmap_lock_##type(mm, \ + memcg_path != NULL ? memcg_path : "", \ + ##__VA_ARGS__); \ + if (likely(memcg_path != NULL)) \ + put_memcg_path_buf(); \ + preempt_enable(); \ + } while (0) + +#else /* !CONFIG_MEMCG */ + +int trace_mmap_lock_reg(void) +{ + return 0; +} + +void trace_mmap_lock_unreg(void) +{ +} + +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) + +#endif /* CONFIG_MEMCG */ + +#ifdef CONFIG_TRACING +#ifdef CONFIG_MEMCG /* * Write the given mm_struct's memcg path to a percpu buffer, and return a * pointer to it. If the path cannot be determined, or no buffer was available @@ -187,33 +218,6 @@ static const char *get_mm_memcg_path(struct mm_struct *mm) return buf; }
-#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - const char *memcg_path; \ - local_lock(&memcg_paths.lock); \ - memcg_path = get_mm_memcg_path(mm); \ - trace_mmap_lock_##type(mm, \ - memcg_path != NULL ? memcg_path : "", \ - ##__VA_ARGS__); \ - if (likely(memcg_path != NULL)) \ - put_memcg_path_buf(); \ - local_unlock(&memcg_paths.lock); \ - } while (0) - -#else /* !CONFIG_MEMCG */ - -int trace_mmap_lock_reg(void) -{ - return 0; -} - -void trace_mmap_lock_unreg(void) -{ -} - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) - #endif /* CONFIG_MEMCG */
/* @@ -239,3 +243,4 @@ void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write) TRACE_MMAP_LOCK_EVENT(released, mm, write); } EXPORT_SYMBOL(__mmap_lock_do_trace_released); +#endif /* CONFIG_TRACING */
From: Muchun Song songmuchun@bytedance.com
mainline inclusion from mainline-5.14-rc3 commit e904c2ccf9b5cb356eec754ffea05c08984f6535 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Commit 832b50725373 ("mm: mmap_lock: use local locks instead of disabling preemption") fixed a bug by using local locks.
But commit d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING configurations") changed those lines back to the original version.
I guess it was introduced by fixing conflicts.
Link: https://lkml.kernel.org/r/20210720074228.76342-1-songmuchun@bytedance.com Fixes: d01079f3d0c0 ("mm/mmap_lock: remove dead code for !CONFIG_TRACING configurations") Signed-off-by: Muchun Song songmuchun@bytedance.com Acked-by: Mel Gorman mgorman@techsingularity.net Reviewed-by: Yang Shi shy828301@gmail.com Reviewed-by: Pankaj Gupta pankaj.gupta@ionos.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit e904c2ccf9b5cb356eec754ffea05c08984f6535) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- mm/mmap_lock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index f5852a058ce0..1854850b4b89 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -156,14 +156,14 @@ static inline void put_memcg_path_buf(void) #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0)
#else /* !CONFIG_MEMCG */
From: Liam Howlett liam.howlett@oracle.com
mainline inclusion from mainline-5.15-rc1 commit 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Print to the trace log before releasing the lock to avoid racing with other trace log printers of the same lock type.
Link: https://lkml.kernel.org/r/20210903022041.1843024-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Liam.Howlett@oracle.com Suggested-by: Steven Rostedt (VMware) rostedt@goodmis.org Reviewed-by: Matthew Wilcox (Oracle) willy@infradead.org Cc: Michel Lespinasse walken.cr@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org (cherry picked from commit 10994316089c9682f2fbe0be0b1e82bcaf5f4e8c) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mmap_lock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index 0540f0156f58..b179f1e3541a 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -101,14 +101,14 @@ static inline bool mmap_write_trylock(struct mm_struct *mm)
static inline void mmap_write_unlock(struct mm_struct *mm) { - up_write(&mm->mmap_lock); __mmap_lock_trace_released(mm, true); + up_write(&mm->mmap_lock); }
static inline void mmap_write_downgrade(struct mm_struct *mm) { - downgrade_write(&mm->mmap_lock); __mmap_lock_trace_acquire_returned(mm, false, true); + downgrade_write(&mm->mmap_lock); }
static inline void mmap_read_lock(struct mm_struct *mm) @@ -140,8 +140,8 @@ static inline bool mmap_read_trylock(struct mm_struct *mm)
static inline void mmap_read_unlock(struct mm_struct *mm) { - up_read(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read(&mm->mmap_lock); }
static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) @@ -155,8 +155,8 @@ static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm)
static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { - up_read_non_owner(&mm->mmap_lock); __mmap_lock_trace_released(mm, false); + up_read_non_owner(&mm->mmap_lock); }
static inline void mmap_assert_locked(struct mm_struct *mm)
From: Yonghong Song yhs@fb.com
mainline inclusion from mainline-5.15-rc2 commit 2f1aaf3ea666b737ad717b3d88667225aca23149 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RL0T CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Currently the bpf selftest "get_stack_raw_tp" triggered the warning:
[ 1411.304463] WARNING: CPU: 3 PID: 140 at include/linux/mmap_lock.h:164 find_vma+0x47/0xa0 [ 1411.304469] Modules linked in: bpf_testmod(O) [last unloaded: bpf_testmod] [ 1411.304476] CPU: 3 PID: 140 Comm: systemd-journal Tainted: G W O 5.14.0+ #53 [ 1411.304479] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 [ 1411.304481] RIP: 0010:find_vma+0x47/0xa0 [ 1411.304484] Code: de 48 89 ef e8 ba f5 fe ff 48 85 c0 74 2e 48 83 c4 08 5b 5d c3 48 8d bf 28 01 00 00 be ff ff ff ff e8 2d 9f d8 00 85 c0 75 d4 <0f> 0b 48 89 de 48 8 [ 1411.304487] RSP: 0018:ffffabd440403db8 EFLAGS: 00010246 [ 1411.304490] RAX: 0000000000000000 RBX: 00007f00ad80a0e0 RCX: 0000000000000000 [ 1411.304492] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.304494] RBP: ffff9cf5c2f50000 R08: ffff9cf5c3eb25d8 R09: 00000000fffffffe [ 1411.304496] R10: 0000000000000001 R11: 00000000ef974e19 R12: ffff9cf5c39ae0e0 [ 1411.304498] R13: 0000000000000000 R14: 0000000000000000 R15: ffff9cf5c39ae0e0 [ 1411.304501] FS: 00007f00ae754780(0000) GS:ffff9cf5fba00000(0000) knlGS:0000000000000000 [ 1411.304504] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.304506] CR2: 000000003e34343c CR3: 0000000103a98005 CR4: 0000000000370ee0 [ 1411.304508] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.304510] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.304512] Call Trace: [ 1411.304517] stack_map_get_build_id_offset+0x17c/0x260 [ 1411.304528] __bpf_get_stack+0x18f/0x230 [ 1411.304541] bpf_get_stack_raw_tp+0x5a/0x70 [ 1411.305752] RAX: 0000000000000000 RBX: 5541f689495641d7 RCX: 0000000000000000 [ 1411.305756] RDX: 0000000000000001 RSI: ffffffff9776b144 RDI: ffffffff977e1b0e [ 1411.305758] RBP: ffff9cf5c02b2f40 R08: ffff9cf5ca7606c0 R09: ffffcbd43ee02c04 [ 1411.306978] bpf_prog_32007c34f7726d29_bpf_prog1+0xaf/0xd9c [ 1411.307861] R10: 0000000000000001 R11: 0000000000000044 R12: ffff9cf5c2ef60e0 [ 1411.307865] R13: 0000000000000005 R14: 0000000000000000 R15: ffff9cf5c2ef6108 [ 1411.309074] bpf_trace_run2+0x8f/0x1a0 [ 1411.309891] FS: 00007ff485141700(0000) GS:ffff9cf5fae00000(0000) knlGS:0000000000000000 [ 1411.309896] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1411.311221] syscall_trace_enter.isra.20+0x161/0x1f0 [ 1411.311600] CR2: 00007ff48514d90e CR3: 0000000107114001 CR4: 0000000000370ef0 [ 1411.312291] do_syscall_64+0x15/0x80 [ 1411.312941] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1411.313803] entry_SYSCALL_64_after_hwframe+0x44/0xae [ 1411.314223] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1411.315082] RIP: 0033:0x7f00ad80a0e0 [ 1411.315626] Call Trace: [ 1411.315632] stack_map_get_build_id_offset+0x17c/0x260
To reproduce, first build `test_progs` binary:
make -C tools/testing/selftests/bpf -j60
and then run the binary at tools/testing/selftests/bpf directory:
./test_progs -t get_stack_raw_tp
The warning is due to commit 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") which added mmap_assert_locked() in find_vma() function. The mmap_assert_locked() function asserts that mm->mmap_lock needs to be held. But this is not the case for bpf_get_stack() or bpf_get_stackid() helper (kernel/bpf/stackmap.c), which uses mmap_read_trylock_non_owner() instead. Since mm->mmap_lock is not held in bpf_get_stack[id]() use case, the above warning is emitted during test run.
This patch fixes the issue by (1) using mmap_read_trylock() instead of mmap_read_trylock_non_owner() to satisfy the lockdep checking in find_vma(), and (2) dropping the lockdep tracking for mmap_lock right before the irq_work_queue(). The function mmap_read_trylock_non_owner() is also removed since nobody calls it any more after this patch.
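A condensed sketch of the resulting pattern (walk_vmas_or_defer() and its deferred argument are hypothetical and for illustration only; the lock, lockdep and irq_work calls themselves are the ones used by the patch):

#include <linux/irq_work.h>
#include <linux/lockdep.h>
#include <linux/mmap_lock.h>

static bool walk_vmas_or_defer(struct mm_struct *mm, struct irq_work *deferred)
{
	/* owner-tracked lock keeps mmap_assert_locked() in find_vma() happy */
	if (!mmap_read_trylock(mm))
		return false;	/* caller falls back to raw instruction pointers */

	/* ... resolve VMAs / build IDs under the read lock ... */

	if (deferred) {
		/* the real up_read() happens later, in task context, so tell
		 * lockdep we are done with the lock here */
		rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_);
		irq_work_queue(deferred);
	} else {
		mmap_read_unlock(mm);
	}
	return true;
}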
Fixes: 5b78ed24e8ec ("mm/pagemap: add mmap_assert_locked() annotations to find_vma*()") Suggested-by: Jason Gunthorpe jgg@ziepe.ca Signed-off-by: Yonghong Song yhs@fb.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Reviewed-by: Liam R. Howlett Liam.Howlett@oracle.com Cc: Luigi Rizzo lrizzo@google.com Cc: Jason Gunthorpe jgg@ziepe.ca Cc: linux-mm@kvack.org Link: https://lore.kernel.org/bpf/20210909155000.1610299-1-yhs@fb.com (cherry picked from commit 2f1aaf3ea666b737ad717b3d88667225aca23149) Signed-off-by: Yongqiang Liu liuyongqiang13@huawei.com Reviewed-by: Kefeng Wangwangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/mmap_lock.h | 9 --------- kernel/bpf/stackmap.c | 10 ++++++++-- 2 files changed, 8 insertions(+), 11 deletions(-)
diff --git a/include/linux/mmap_lock.h b/include/linux/mmap_lock.h index b179f1e3541a..96e113e23d04 100644 --- a/include/linux/mmap_lock.h +++ b/include/linux/mmap_lock.h @@ -144,15 +144,6 @@ static inline void mmap_read_unlock(struct mm_struct *mm) up_read(&mm->mmap_lock); }
-static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) -{ - if (mmap_read_trylock(mm)) { - rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); - return true; - } - return false; -} - static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { __mmap_lock_trace_released(mm, false); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 4477873ac3a0..4b5b390e22ea 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -325,7 +325,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, * with build_id. */ if (!user || !current || !current->mm || irq_work_busy || - !mmap_read_trylock_non_owner(current->mm)) { + !mmap_read_trylock(current->mm)) { /* cannot access current->mm, fall back to ips */ for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; @@ -350,9 +350,15 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, }
if (!work) { - mmap_read_unlock_non_owner(current->mm); + mmap_read_unlock(current->mm); } else { work->mm = current->mm; + + /* The lock will be released once we're out of interrupt + * context. Tell lockdep that we've released it now so + * it doesn't complain that we forgot to release it. + */ + rwsem_release(¤t->mm->mmap_lock.dep_map, _RET_IP_); irq_work_queue(&work->irq_work); } }
From: Ye Bin yebin10@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit 298b5c521746d69c07beb2757292fb5ccc1b0f85 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RPAD?from=project-issue CVE: NA
--------------------------------
We got issue as follows when run syzkaller test: [ 1901.130043] EXT4-fs error (device vda): ext4_remount:5624: comm syz-executor.5: Abort forced by user [ 1901.130901] Aborting journal on device vda-8. [ 1901.131437] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.16: Detected aborted journal [ 1901.131566] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.11: Detected aborted journal [ 1901.132586] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.18: Detected aborted journal [ 1901.132751] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.9: Detected aborted journal [ 1901.136149] EXT4-fs error (device vda) in ext4_reserve_inode_write:6035: Journal has aborted [ 1901.136837] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-fuzzer: Detected aborted journal [ 1901.136915] ================================================================== [ 1901.138175] BUG: KASAN: null-ptr-deref in __ext4_journal_ensure_credits+0x74/0x140 [ext4] [ 1901.138343] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.13: Detected aborted journal [ 1901.138398] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.1: Detected aborted journal [ 1901.138808] Read of size 8 at addr 0000000000000000 by task syz-executor.17/968 [ 1901.138817] [ 1901.138852] EXT4-fs error (device vda): ext4_journal_check_start:61: comm syz-executor.30: Detected aborted journal [ 1901.144779] CPU: 1 PID: 968 Comm: syz-executor.17 Not tainted 4.19.90-vhulk2111.1.0.h893.eulerosv2r10.aarch64+ #1 [ 1901.146479] Hardware name: linux,dummy-virt (DT) [ 1901.147317] Call trace: [ 1901.147552] dump_backtrace+0x0/0x2d8 [ 1901.147898] show_stack+0x28/0x38 [ 1901.148215] dump_stack+0xec/0x15c [ 1901.148746] kasan_report+0x108/0x338 [ 1901.149207] __asan_load8+0x58/0xb0 [ 1901.149753] __ext4_journal_ensure_credits+0x74/0x140 [ext4] [ 1901.150579] ext4_xattr_delete_inode+0xe4/0x700 [ext4] [ 1901.151316] ext4_evict_inode+0x524/0xba8 [ext4] [ 1901.151985] evict+0x1a4/0x378 [ 1901.152353] iput+0x310/0x428 [ 1901.152733] do_unlinkat+0x260/0x428 [ 1901.153056] __arm64_sys_unlinkat+0x6c/0xc0 [ 1901.153455] el0_svc_common+0xc8/0x320 [ 1901.153799] el0_svc_handler+0xf8/0x160 [ 1901.154265] el0_svc+0x10/0x218 [ 1901.154682] ==================================================================
This issue may happen like this:

  Process1                                Process2
ext4_evict_inode
 ext4_journal_start
  ext4_truncate
   ext4_ind_truncate
    ext4_free_branches
     ext4_ind_truncate_ensure_credits
      ext4_journal_ensure_credits_fn
       ext4_journal_restart
        handle->h_transaction = NULL;
                                          mount -o remount,abort /mnt
                                          -> trigger JBD abort
        start_this_handle -> will return failed
 ext4_xattr_delete_inode
  ext4_journal_ensure_credits
   ext4_journal_ensure_credits_fn
    __ext4_journal_ensure_credits
     jbd2_handle_buffer_credits
      journal = handle->h_transaction->t_journal; -> null-ptr-deref
Currently the indirect truncate path does not handle this error. To solve the issue, simply checking whether the handle has been aborted in '__ext4_journal_ensure_credits' is enough, and it is also necessary.
Cc: stable@kernel.org Signed-off-by: Ye Bin yebin10@huawei.com Link: https://lore.kernel.org/r/20211224100341.3299128-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/ext4_jbd2.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b96ecba91899..b53e1d0b13fc 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -162,6 +162,8 @@ int __ext4_journal_ensure_credits(handle_t *handle, int check_cred, { if (!ext4_handle_valid(handle)) return 0; + if (is_handle_aborted(handle)) + return -EROFS; if (jbd2_handle_buffer_credits(handle) >= check_cred && handle->h_revoke_credits >= revoke_cred) return 0;
From: Ye Bin yebin10@huawei.com
mainline inclusion from mainline-v5.17-rc1 commit 380a0091cab482489e9b19e07f2a166ad2b76d5c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RP94?from=project-issue CVE: NA
--------------------------------
We got issue as follows when run syzkaller: [ 167.936972] EXT4-fs error (device loop0): __ext4_remount:6314: comm rep: Abort forced by user [ 167.938306] EXT4-fs (loop0): Remounting filesystem read-only [ 167.981637] Assertion failure in ext4_getblk() at fs/ext4/inode.c:847: '(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0' [ 167.983601] ------------[ cut here ]------------ [ 167.984245] kernel BUG at fs/ext4/inode.c:847! [ 167.984882] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI [ 167.985624] CPU: 7 PID: 2290 Comm: rep Tainted: G B 5.16.0-rc5-next-20211217+ #123 [ 167.986823] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.fedoraproject.org-3.fc31 04/01/2014 [ 167.988590] RIP: 0010:ext4_getblk+0x17e/0x504 [ 167.989189] Code: c6 01 74 28 49 c7 c0 a0 a3 5c 9b b9 4f 03 00 00 48 c7 c2 80 9c 5c 9b 48 c7 c6 40 b6 5c 9b 48 c7 c7 20 a4 5c 9b e8 77 e3 fd ff <0f> 0b 8b 04 244 [ 167.991679] RSP: 0018:ffff8881736f7398 EFLAGS: 00010282 [ 167.992385] RAX: 0000000000000094 RBX: 1ffff1102e6dee75 RCX: 0000000000000000 [ 167.993337] RDX: 0000000000000001 RSI: ffffffff9b6e29e0 RDI: ffffed102e6dee66 [ 167.994292] RBP: ffff88816a076210 R08: 0000000000000094 R09: ffffed107363fa09 [ 167.995252] R10: ffff88839b1fd047 R11: ffffed107363fa08 R12: ffff88816a0761e8 [ 167.996205] R13: 0000000000000000 R14: 0000000000000021 R15: 0000000000000001 [ 167.997158] FS: 00007f6a1428c740(0000) GS:ffff88839b000000(0000) knlGS:0000000000000000 [ 167.998238] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 167.999025] CR2: 00007f6a140716c8 CR3: 0000000133216000 CR4: 00000000000006e0 [ 167.999987] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 168.000944] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 168.001899] Call Trace: [ 168.002235] <TASK> [ 168.007167] ext4_bread+0xd/0x53 [ 168.007612] ext4_quota_write+0x20c/0x5c0 [ 168.010457] write_blk+0x100/0x220 [ 168.010944] remove_free_dqentry+0x1c6/0x440 [ 168.011525] free_dqentry.isra.0+0x565/0x830 [ 168.012133] remove_tree+0x318/0x6d0 [ 168.014744] remove_tree+0x1eb/0x6d0 [ 168.017346] remove_tree+0x1eb/0x6d0 [ 168.019969] remove_tree+0x1eb/0x6d0 [ 168.022128] qtree_release_dquot+0x291/0x340 [ 168.023297] v2_release_dquot+0xce/0x120 [ 168.023847] dquot_release+0x197/0x3e0 [ 168.024358] ext4_release_dquot+0x22a/0x2d0 [ 168.024932] dqput.part.0+0x1c9/0x900 [ 168.025430] __dquot_drop+0x120/0x190 [ 168.025942] ext4_clear_inode+0x86/0x220 [ 168.026472] ext4_evict_inode+0x9e8/0xa22 [ 168.028200] evict+0x29e/0x4f0 [ 168.028625] dispose_list+0x102/0x1f0 [ 168.029148] evict_inodes+0x2c1/0x3e0 [ 168.030188] generic_shutdown_super+0xa4/0x3b0 [ 168.030817] kill_block_super+0x95/0xd0 [ 168.031360] deactivate_locked_super+0x85/0xd0 [ 168.031977] cleanup_mnt+0x2bc/0x480 [ 168.033062] task_work_run+0xd1/0x170 [ 168.033565] do_exit+0xa4f/0x2b50 [ 168.037155] do_group_exit+0xef/0x2d0 [ 168.037666] __x64_sys_exit_group+0x3a/0x50 [ 168.038237] do_syscall_64+0x3b/0x90 [ 168.038751] entry_SYSCALL_64_after_hwframe+0x44/0xae
In order to reproduce this problem, the following conditions need to be met:
1. an ext4 filesystem with no journal;
2. a filesystem image with incorrect quota data;
3. the filesystem forcibly aborted by the user;
4. the filesystem unmounted.
As in ext4_quota_write: ... if (EXT4_SB(sb)->s_journal && !handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len); return -EIO; } ... handle is only checked for NULL when the filesystem has a journal; it also needs to be checked for NULL when the filesystem has no journal.
Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Jan Kara jack@suse.cz Link: https://lore.kernel.org/r/20211223015506.297766-1-yebin10@huawei.com Signed-off-by: Theodore Ts'o tytso@mit.edu Cc: stable@kernel.org Signed-off-by: Ye Bin yebin10@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 926c0d7d9a3d..ee717b73c2b1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -6565,7 +6565,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle();
- if (EXT4_SB(sb)->s_journal && !handle) { + if (!handle) { ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started", (unsigned long long)off, (unsigned long long)len);
From: Liu Shixin liushixin2@huawei.com
hulk inclusion category: feature bugzilla: 46904, https://gitee.com/openeuler/kernel/issues/I4RO84 CVE: NA
--------------------------------
Export PG_pool page flag in /proc/kpageflags.
27. KPF_POOL page is allocated from hpool.
By using the page-types tool, we can easily count the number of pages allocated from the dynamic hugetlb pool by a process or a file.
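A minimal user-space sketch of such counting over the whole system, assuming a kernel with this patch applied (KPF_POOL is bit 27 as defined above); attributing pages to a specific process or file additionally requires walking /proc/pid/pagemap, which is what page-types does:

/* count_hpool.c: count pages flagged KPF_POOL in /proc/kpageflags */
#include <stdio.h>
#include <stdint.h>

#define KPF_POOL 27

int main(void)
{
	FILE *f = fopen("/proc/kpageflags", "rb");
	uint64_t flags, nr_pool = 0;

	if (!f) {
		perror("/proc/kpageflags");
		return 1;
	}

	/* one 8-byte flags word per page frame */
	while (fread(&flags, sizeof(flags), 1, f) == 1) {
		if (flags & (1ULL << KPF_POOL))
			nr_pool++;
	}

	printf("pages allocated from hpool: %llu\n",
	       (unsigned long long)nr_pool);
	fclose(f);
	return 0;
}

(Reading /proc/kpageflags requires root.)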
Signed-off-by: Liu Shixin liushixin2@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/proc/page.c | 1 + include/uapi/linux/kernel-page-flags.h | 1 + tools/vm/page-types.c | 1 + 3 files changed, 3 insertions(+)
diff --git a/fs/proc/page.c b/fs/proc/page.c index 9f1077d94cde..d00c23d543fe 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -220,6 +220,7 @@ u64 stable_page_flags(struct page *page) #ifdef CONFIG_64BIT u |= kpf_copy_bit(k, KPF_ARCH_2, PG_arch_2); #endif + u |= kpf_copy_bit(k, KPF_POOL, PG_pool);
return u; }; diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 6f2f2720f3ac..f8297cb68bdd 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -36,5 +36,6 @@ #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 +#define KPF_POOL 27
#endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index f62f10c988db..6034477926d6 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -126,6 +126,7 @@ static const char * const page_flag_names[] = { [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", + [KPF_POOL] = "h:hpool",
[KPF_RESERVED] = "r:reserved", [KPF_MLOCKED] = "m:mlocked",
From: Todd Kjos tkjos@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/7f62740112ef
---------------------------
Add support for vendor hooks. Add the include/trace/hooks directory for trace definition headers where hooks can be defined, and vendor_hooks.c for instantiating and exporting them for vendor modules.
There are two variants of vendor hooks, both based on tracepoints:
Normal: this uses the DECLARE_HOOK macro to create a tracepoint function with the name trace_<name> where <name> is the unique identifier for the trace.
Restricted: restricted hooks are needed for cases like scheduler hooks where the attached function must be called even if the cpu is offline or requires a non-atomic context. Restricted vendor hooks cannot be detached, so modules that attach to a restricted hook can never unload. Also, only 1 attachment is allowed (any other attempts to attach will fail with -EBUSY).
For either case, modules attach to the hook by using register_trace_<name>(func_ptr, NULL).
New hooks should be defined in headers in the include/trace/hooks/ directory using the DECLARE_HOOK() or DECLARE_RESTRICTED_HOOK() macros.
New files added to include/trace/hooks should be #include'd from drivers/android/vendor_hooks.c. The EXPORT_TRACEPOINT_SYMBOL_GPL() should be also added to drivers/android/vendor_hooks.c.
For example, if a new hook, 'android_vh_foo(int *retp)', is added in do_exit() in exit.c, these changes are needed:
1. create a new header file include/trace/hooks/foo.h which contains: #include <trace/hooks/vendor_hooks.h> ... DECLARE_HOOK(android_vh_foo, TP_PROTO(int *retp), TP_ARGS(retp));
2. in exit.c, add #include <trace/hooks/foo.h> ... int ret = 0; ... trace_android_vh_foo(&ret); if (ret) return ret; ...
3. in drivers/android/vendor_hooks.c, add #include <trace/hooks/foo.h> ... EXPORT_TRACEPOINT_SYMBOL_GPL(android_vh_foo);
The hook can then be attached by adding the registration code to the module:
#include <trace/hooks/foo.h> ... static void my_foo(void *data, int *retp) { *retp = 0; } ... rc = register_trace_android_vh_foo(my_foo, NULL);
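For the restricted variant the flow is analogous; below is a minimal sketch under the macros added by this patch, with a hypothetical hook name (note that the probe takes the registration data pointer as its first argument, and that a restricted hook can be attached only once and never detached):

/* include/trace/hooks/foo.h (hypothetical restricted hook) */
#include <trace/hooks/vendor_hooks.h>
...
DECLARE_RESTRICTED_HOOK(android_rvh_foo,
	TP_PROTO(int *retp),
	TP_ARGS(retp),
	1);	/* cond: no extra firing condition */

/* in the vendor module, which therefore must never unload */
static void my_rvh_foo(void *data, int *retp)
{
	*retp = 0;
}
...
rc = register_trace_android_rvh_foo(my_rvh_foo, NULL);
/* a second register attempt would fail with -EBUSY */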
Bug: 156285741 Signed-off-by: Todd Kjos tkjos@google.com Change-Id: I6a7d1c8919dae91c965e2a0450df50eac2d282db Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/android/Kconfig | 9 ++++ drivers/android/Makefile | 1 + drivers/android/vendor_hooks.c | 16 +++++++ include/trace/hooks/vendor_hooks.h | 72 ++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+) create mode 100644 drivers/android/vendor_hooks.c create mode 100644 include/trace/hooks/vendor_hooks.h
diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 53b22e26266c..32fb9e5b6195 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -54,6 +54,15 @@ config ANDROID_BINDER_IPC_SELFTEST exhaustively with combinations of various buffer sizes and alignments.
+config ANDROID_VENDOR_HOOKS + bool "Android Vendor Hooks" + depends on TRACEPOINTS + help + Enable vendor hooks implemented as tracepoints + + Allow vendor modules to attach to tracepoint "hooks" defined via + DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. + endif # if ANDROID
endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index c9d3d0c99c25..d488047415a0 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,3 +4,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o +obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/android/vendor_hooks.c b/drivers/android/vendor_hooks.c new file mode 100644 index 000000000000..4a403a81eed3 --- /dev/null +++ b/drivers/android/vendor_hooks.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* vendor_hook.c + * + * Android Vendor Hook Support + * + * Copyright (C) 2020 Google, Inc. + */ + +#define CREATE_TRACE_POINTS +#include <trace/hooks/vendor_hooks.h> + +/* + * Export tracepoints that act as a bare tracehook (ie: have no trace event + * associated with them) to allow external modules to probe them. + */ + diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h new file mode 100644 index 000000000000..9d9ae21895dd --- /dev/null +++ b/include/trace/hooks/vendor_hooks.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#if !defined(_TRACE_VENDOR_HOOKS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VENDOR_HOOKS_H + +#include <linux/tracepoint.h> + +#define DECLARE_HOOK DECLARE_TRACE + +#ifdef TRACE_HEADER_MULTI_READ + +#undef DECLARE_RESTRICTED_HOOK +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + DEFINE_TRACE(name) + +/* prevent additional recursion */ +#undef TRACE_HEADER_MULTI_READ +#else /* TRACE_HEADER_MULTI_READ */ + +#define DO_HOOK(tp, proto, args, cond) \ + do { \ + struct tracepoint_func *it_func_ptr; \ + void *it_func; \ + void *__data; \ + \ + if (!(cond)) \ + return; \ + \ + it_func_ptr = (tp)->funcs; \ + if (it_func_ptr) { \ + it_func = (it_func_ptr)->func; \ + __data = (it_func_ptr)->data; \ + ((void(*)(proto))(it_func))(args); \ + WARN_ON(((++it_func_ptr)->func)); \ + } \ + } while (0) + +#define __DECLARE_HOOK(name, proto, args, cond, data_proto, data_args) \ + extern struct tracepoint __tracepoint_##name; \ + static inline void trace_##name(proto) \ + { \ + if (static_key_false(&__tracepoint_##name.key)) \ + DO_HOOK(&__tracepoint_##name, \ + TP_PROTO(data_proto), \ + TP_ARGS(data_args), \ + TP_CONDITION(cond)); \ + } \ + static inline bool \ + trace_##name##_enabled(void) \ + { \ + return static_key_false(&__tracepoint_##name.key); \ + } \ + static inline int \ + register_trace_##name(void (*probe)(data_proto), void *data) \ + { \ + /* only allow a single attachment */ \ + if (trace_##name##_enabled()) \ + return -EBUSY; \ + return tracepoint_probe_register(&__tracepoint_##name, \ + (void *)probe, data); \ + } \ + /* vendor hooks cannot be unregistered */ \ + +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + __DECLARE_HOOK(name, PARAMS(proto), PARAMS(args), \ + cond, \ + PARAMS(void *__data, proto), \ + PARAMS(__data, args)) + +#endif /* TRACE_HEADER_MULTI_READ */ + +#endif /* _TRACE_VENDOR_HOOKS_H */
From: Todd Kjos tkjos@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/e706f27c765b
---------------------------
Because of the multi-inclusion oddities of tracepoints, the multi-inclusion protection in vendor_hooks.h caused issues when more than one vendor hook header file with restricted vendor hooks defined was included with "CREATE_TRACE_POINTS" defined (redefinition-of-symbol errors).
The problem is fixed by removing the multiple-inclusion protection as is done for regular tracepoints.
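A minimal sketch of the pattern that used to break, assuming two hypothetical hook headers, foo.h and bar.h, each defining restricted hooks and both pulled in from vendor_hooks.c under a single CREATE_TRACE_POINTS:

/* drivers/android/vendor_hooks.c */
#define CREATE_TRACE_POINTS
#include <trace/hooks/vendor_hooks.h>
#include <trace/hooks/foo.h>	/* hypothetical */
#include <trace/hooks/bar.h>	/* hypothetical */

EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_foo);
EXPORT_TRACEPOINT_SYMBOL_GPL(android_rvh_bar);

With the old include guard, the first include overrode DECLARE_RESTRICTED_HOOK and broke the second include, producing redefinition-of-symbol errors; removing the guard, as regular trace event headers do, avoids this.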
Fixes: 7f62740112ef ("ANDROID: add support for vendor hooks") Bug: 163076069 Signed-off-by: Todd Kjos tkjos@google.com Change-Id: Ic177db1693a6a2db58f08917e9115c7e6c2971b6 Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/hooks/vendor_hooks.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index 9d9ae21895dd..8a3fdb9222e5 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -1,7 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */
-#if !defined(_TRACE_VENDOR_HOOKS_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_VENDOR_HOOKS_H +/* + * Note: we intentionally omit include file ifdef protection + * This is due to the way trace events work. If a file includes two + * trace event headers under one "CREATE_TRACE_POINTS" the first include + * will override the DECLARE_RESTRICTED_HOOK and break the second include. + */
#include <linux/tracepoint.h>
@@ -13,6 +17,7 @@ #define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ DEFINE_TRACE(name)
+ /* prevent additional recursion */ #undef TRACE_HEADER_MULTI_READ #else /* TRACE_HEADER_MULTI_READ */ @@ -61,6 +66,7 @@ } \ /* vendor hooks cannot be unregistered */ \
+#undef DECLARE_RESTRICTED_HOOK #define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ __DECLARE_HOOK(name, PARAMS(proto), PARAMS(args), \ cond, \ @@ -68,5 +74,3 @@ PARAMS(__data, args))
#endif /* TRACE_HEADER_MULTI_READ */ - -#endif /* _TRACE_VENDOR_HOOKS_H */
From: Todd Kjos tkjos@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/5e767aa07eea
---------------------------
commit d25e37d89dd2 ("tracepoint: Optimize using static_call()") refactored tracepoints to use static_call(). Add the same optimization for restricted vendor hooks.
Fixes: d25e37d89dd2 ("tracepoint: Optimize using static_call()") Signed-off-by: Todd Kjos tkjos@google.com Change-Id: I336db7e90b733ac4098ce342001cc31fd215d137 Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/hooks/vendor_hooks.h | 46 ++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 9 deletions(-)
diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index 8a3fdb9222e5..8a7acb983c67 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -13,39 +13,67 @@
#ifdef TRACE_HEADER_MULTI_READ
+#define DEFINE_HOOK_FN(_name, _reg, _unreg, proto, args) \ + static const char __tpstrtab_##_name[] \ + __section(__tracepoints_strings) = #_name; \ + extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ + int __traceiter_##_name(void *__data, proto); \ + struct tracepoint __tracepoint_##_name __used \ + __section(__tracepoints) = { \ + .name = __tpstrtab_##_name, \ + .key = STATIC_KEY_INIT_FALSE, \ + .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \ + .static_call_tramp = STATIC_CALL_TRAMP_ADDR(tp_func_##_name), \ + .iterator = &__traceiter_##_name, \ + .regfunc = _reg, \ + .unregfunc = _unreg, \ + .funcs = NULL }; \ + __TRACEPOINT_ENTRY(_name); \ + int __traceiter_##_name(void *__data, proto) \ + { \ + struct tracepoint_func *it_func_ptr; \ + void *it_func; \ + \ + it_func_ptr = (&__tracepoint_##_name)->funcs; \ + it_func = (it_func_ptr)->func; \ + __data = (it_func_ptr)->data; \ + ((void(*)(void *, proto))(it_func))(__data, args); \ + WARN_ON(((++it_func_ptr)->func)); \ + return 0; \ + } \ + DEFINE_STATIC_CALL(tp_func_##_name, __traceiter_##_name); + #undef DECLARE_RESTRICTED_HOOK #define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ - DEFINE_TRACE(name) - + DEFINE_HOOK_FN(name, NULL, NULL, PARAMS(proto), PARAMS(args))
/* prevent additional recursion */ #undef TRACE_HEADER_MULTI_READ #else /* TRACE_HEADER_MULTI_READ */
-#define DO_HOOK(tp, proto, args, cond) \ +#define DO_HOOK(name, proto, args, cond) \ do { \ struct tracepoint_func *it_func_ptr; \ - void *it_func; \ void *__data; \ \ if (!(cond)) \ return; \ \ - it_func_ptr = (tp)->funcs; \ + it_func_ptr = (&__tracepoint_##name)->funcs; \ if (it_func_ptr) { \ - it_func = (it_func_ptr)->func; \ __data = (it_func_ptr)->data; \ - ((void(*)(proto))(it_func))(args); \ - WARN_ON(((++it_func_ptr)->func)); \ + __DO_TRACE_CALL(name)(args); \ } \ } while (0)
#define __DECLARE_HOOK(name, proto, args, cond, data_proto, data_args) \ + extern int __traceiter_##name(data_proto); \ + DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ - DO_HOOK(&__tracepoint_##name, \ + DO_HOOK(name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ TP_CONDITION(cond)); \
From: Nick Desaulniers ndesaulniers@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/4cc2f83c77aa
---------------------------
After upstream 33def8498fdd ("treewide: Convert macro and uses of __section(foo) to __section("foo")"), the preprocessor macro __section now requires the section name to be double quoted.
This patch resolves breakage that results from merging down from mainline in this out of tree header.
Fixes: 33def8498fdd ("treewide: Convert macro and uses of __section(foo) to __section("foo")") Signed-off-by: Nick Desaulniers ndesaulniers@google.com Signed-off-by: Greg Kroah-Hartman gregkh@google.com Change-Id: Ie6a701251e6420e63187a466b43ec2c834e0ec2e Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/hooks/vendor_hooks.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index 8a7acb983c67..e6cabd366eeb 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -15,11 +15,11 @@
#define DEFINE_HOOK_FN(_name, _reg, _unreg, proto, args) \ static const char __tpstrtab_##_name[] \ - __section(__tracepoints_strings) = #_name; \ + __section("__tracepoints_strings") = #_name; \ extern struct static_call_key STATIC_CALL_KEY(tp_func_##_name); \ int __traceiter_##_name(void *__data, proto); \ struct tracepoint __tracepoint_##_name __used \ - __section(__tracepoints) = { \ + __section("__tracepoints") = { \ .name = __tpstrtab_##_name, \ .key = STATIC_KEY_INIT_FALSE, \ .static_call_key = &STATIC_CALL_KEY(tp_func_##_name), \
From: Todd Kjos tkjos@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/943c3b3124d2
---------------------------
Vendor hooks required explicitly defining macros or inline functions to handle the non-GKI build case (!CONFIG_ANDROID_VENDOR_HOOKS). Added support for generating them automatically so the macros are no longer required.
Both models are now supported so we can transition.
Bug: 177416721 Signed-off-by: Todd Kjos tkjos@google.com Change-Id: I01acc389d315a5d509b0c48116854342a42e1058 Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/hooks/vendor_hooks.h | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index e6cabd366eeb..ef8c95aaebf0 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -9,6 +9,8 @@
#include <linux/tracepoint.h>
+#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_ANDROID_VENDOR_HOOKS) + #define DECLARE_HOOK DECLARE_TRACE
#ifdef TRACE_HEADER_MULTI_READ @@ -102,3 +104,10 @@ PARAMS(__data, args))
#endif /* TRACE_HEADER_MULTI_READ */ + +#else /* !CONFIG_TRACEPOINTS || !CONFIG_ANDROID_VENDOR_HOOKS */ +/* suppress trace hooks */ +#define DECLARE_HOOK DECLARE_EVENT_NOP +#define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ + DECLARE_EVENT_NOP(name, PARAMS(proto), PARAMS(args)) +#endif
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
mainline inclusion from mainline-v5.12-rc1 commit 1746fd4416ed5510fe9fdd6a93e49a436187b680 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
---------------------------
While working on a clean up that would restructure the difference between architectures that have static calls vs those that do not, I was stumbling over the "data_args" parameter that includes "__data" in the arguments. The issue was that one version didn't even need it, while the other one did. Instead of injecting a "__data = NULL;" into the macro for the unneeded version, just remove it completely.
The original idea behind data_args is that there may be a case of a tracepoint with no arguments. But this is considered bad practice, and all tracepoints should pass something to that location (that's what tracepoints were created for).
Link: https://lkml.kernel.org/r/20210208201050.768074128@goodmis.org
Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/tracepoint.h | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index e4c5df71f0e7..b4b2cc56be8d 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -170,13 +170,8 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) /* * it_func[0] is never NULL because there is at least one element in the array * when the array itself is non NULL. - * - * Note, the proto and args passed in includes "__data" as the first parameter. - * The reason for this is to handle the "void" prototype. If a tracepoint - * has a "void" prototype, then it is invalid to declare a function - * as "(void *, void)". */ -#define __DO_TRACE(name, proto, args, cond, rcuidle) \ +#define __DO_TRACE(name, args, cond, rcuidle) \ do { \ struct tracepoint_func *it_func_ptr; \ int __maybe_unused __idx = 0; \ @@ -204,7 +199,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) rcu_dereference_raw((&__tracepoint_##name)->funcs); \ if (it_func_ptr) { \ __data = (it_func_ptr)->data; \ - __DO_TRACE_CALL(name)(args); \ + __DO_TRACE_CALL(name)(__data, args); \ } \ \ if (rcuidle) { \ @@ -216,17 +211,16 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } while (0)
#ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE_RCU(name, proto, args, cond) \ static inline void trace_##name##_rcuidle(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ - TP_PROTO(data_proto), \ - TP_ARGS(data_args), \ + TP_ARGS(args), \ TP_CONDITION(cond), 1); \ } #else -#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) +#define __DECLARE_TRACE_RCU(name, proto, args, cond) #endif
/* @@ -241,7 +235,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) * even when this tracepoint is off. This code has no purpose other than * poking RCU a bit. */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ @@ -249,8 +243,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(name, \ - TP_PROTO(data_proto), \ - TP_ARGS(data_args), \ + TP_ARGS(args), \ TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ @@ -259,7 +252,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) } \ } \ __DECLARE_TRACE_RCU(name, PARAMS(proto), PARAMS(args), \ - PARAMS(cond), PARAMS(data_proto), PARAMS(data_args)) \ + PARAMS(cond)) \ static inline int \ register_trace_##name(void (*probe)(data_proto), void *data) \ { \ @@ -342,7 +335,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
#else /* !TRACEPOINTS_ENABLED */ -#define __DECLARE_TRACE(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE(name, proto, args, cond, data_proto) \ static inline void trace_##name(proto) \ { } \ static inline void trace_##name##_rcuidle(proto) \ @@ -422,14 +415,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #define DECLARE_TRACE(name, proto, args) \ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()), \ - PARAMS(void *__data, proto), \ - PARAMS(__data, args)) + PARAMS(void *__data, proto))
#define DECLARE_TRACE_CONDITION(name, proto, args, cond) \ __DECLARE_TRACE(name, PARAMS(proto), PARAMS(args), \ cpu_online(raw_smp_processor_id()) && (PARAMS(cond)), \ - PARAMS(void *__data, proto), \ - PARAMS(__data, args)) + PARAMS(void *__data, proto))
#define TRACE_EVENT_FLAGS(event, flag)
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
mainline inclusion from mainline-v5.12-rc1 commit d9a1be1be331fc857d3fe29f86c3a305950b35a9 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
---------------------------
With static calls, a tracepoint can call the callback directly if there is only one callback registered to that tracepoint. When there is more than one, the static call will call the tracepoint's "iterator" function, which needs to reload the tracepoint's "funcs" array again, as it could have changed since the first time it was loaded.
But an arch without static calls is punished by having to load the tracepoint's "funcs" array twice. Once in the DO_TRACE macro, and once again in the iterator macro.
For archs without static calls, there's no reason to load the array macro in the first place, since the iterator function will do it anyway.
Change the __DO_TRACE_CALL() macro to do the load and call of the tracepoints funcs array only for architectures with static calls, and just call the iterator function directly for architectures without static calls.
Link: https://lkml.kernel.org/r/20210208201050.909329787@goodmis.org
Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/tracepoint.h | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index b4b2cc56be8d..45b24a3965e9 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -162,9 +162,19 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) #ifdef TRACEPOINTS_ENABLED
#ifdef CONFIG_HAVE_STATIC_CALL -#define __DO_TRACE_CALL(name) static_call(tp_func_##name) +#define __DO_TRACE_CALL(name, args) \ + do { \ + struct tracepoint_func *it_func_ptr; \ + void *__data; \ + it_func_ptr = \ + rcu_dereference_raw((&__tracepoint_##name)->funcs); \ + if (it_func_ptr) { \ + __data = (it_func_ptr)->data; \ + static_call(tp_func_##name)(__data, args); \ + } \ + } while (0) #else -#define __DO_TRACE_CALL(name) __traceiter_##name +#define __DO_TRACE_CALL(name, args) __traceiter_##name(NULL, args) #endif /* CONFIG_HAVE_STATIC_CALL */
/* @@ -173,9 +183,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) */ #define __DO_TRACE(name, args, cond, rcuidle) \ do { \ - struct tracepoint_func *it_func_ptr; \ int __maybe_unused __idx = 0; \ - void *__data; \ \ if (!(cond)) \ return; \ @@ -195,12 +203,7 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p) rcu_irq_enter_irqson(); \ } \ \ - it_func_ptr = \ - rcu_dereference_raw((&__tracepoint_##name)->funcs); \ - if (it_func_ptr) { \ - __data = (it_func_ptr)->data; \ - __DO_TRACE_CALL(name)(__data, args); \ - } \ + __DO_TRACE_CALL(name, TP_ARGS(args)); \ \ if (rcuidle) { \ rcu_irq_exit_irqson(); \
From: Todd Kjos tkjos@google.com
aosp inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/+/51681321c0ef
---------------------------
In upstream commit d9a1be1be331 ("tracepoints: Do not punish non static call users"), tracepoint macros were refactored to optimize for static_call() cases. Since the Android-specific restricted vendor hook mechanism leverages tracehooks, this required equivalent refactoring in include/trace/hooks/vendor_hooks.h
Fixes: d9a1be1be331 ("tracepoints: Do not punish non static call users") Signed-off-by: Todd Kjos tkjos@google.com Change-Id: I2e01b34606e6ff0e577b76b57c47f601c32f626b Signed-off-by: Greg Kroah-Hartman gregkh@google.com Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/trace/hooks/vendor_hooks.h | 34 ++++++++++++++++++------------ 1 file changed, 20 insertions(+), 14 deletions(-)
diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index ef8c95aaebf0..3661e5056519 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -53,32 +53,39 @@ #undef TRACE_HEADER_MULTI_READ #else /* TRACE_HEADER_MULTI_READ */
-#define DO_HOOK(name, proto, args, cond) \ +#ifdef CONFIG_HAVE_STATIC_CALL +#define __DO_RESTRICTED_HOOK_CALL(name, args) \ do { \ struct tracepoint_func *it_func_ptr; \ void *__data; \ - \ - if (!(cond)) \ - return; \ - \ it_func_ptr = (&__tracepoint_##name)->funcs; \ if (it_func_ptr) { \ __data = (it_func_ptr)->data; \ - __DO_TRACE_CALL(name)(args); \ + static_call(tp_func_##name)(__data, args); \ } \ } while (0) +#else +#define __DO_RESTRICTED_HOOK_CALL(name, args) __traceiter_##name(NULL, args) +#endif + +#define DO_RESTRICTED_HOOK(name, args, cond) \ + do { \ + if (!(cond)) \ + return; \ + \ + __DO_RESTRICTED_HOOK_CALL(name, TP_ARGS(args)); \ + } while (0)
-#define __DECLARE_HOOK(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_RESTRICTED_HOOK(name, proto, args, cond, data_proto) \ extern int __traceiter_##name(data_proto); \ DECLARE_STATIC_CALL(tp_func_##name, __traceiter_##name); \ extern struct tracepoint __tracepoint_##name; \ static inline void trace_##name(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ - DO_HOOK(name, \ - TP_PROTO(data_proto), \ - TP_ARGS(data_args), \ - TP_CONDITION(cond)); \ + DO_RESTRICTED_HOOK(name, \ + TP_ARGS(args), \ + TP_CONDITION(cond)); \ } \ static inline bool \ trace_##name##_enabled(void) \ @@ -98,10 +105,9 @@
#undef DECLARE_RESTRICTED_HOOK #define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \ - __DECLARE_HOOK(name, PARAMS(proto), PARAMS(args), \ + __DECLARE_RESTRICTED_HOOK(name, PARAMS(proto), PARAMS(args), \ cond, \ - PARAMS(void *__data, proto), \ - PARAMS(__data, args)) + PARAMS(void *__data, proto))
#endif /* TRACE_HEADER_MULTI_READ */
From: Jialin Zhang zhangjialin11@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
Reference: https://android.googlesource.com/kernel/common/
---------------------------
Make the Android vendor hooks feature generic.
Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- drivers/Kconfig | 2 ++ drivers/Makefile | 1 + drivers/android/Kconfig | 9 --------- drivers/android/Makefile | 1 - drivers/hooks/Kconfig | 13 +++++++++++++ drivers/hooks/Makefile | 4 ++++ drivers/{android => hooks}/vendor_hooks.c | 2 +- include/trace/hooks/vendor_hooks.h | 4 ++-- 8 files changed, 23 insertions(+), 13 deletions(-) create mode 100644 drivers/hooks/Kconfig create mode 100644 drivers/hooks/Makefile rename drivers/{android => hooks}/vendor_hooks.c (91%)
diff --git a/drivers/Kconfig b/drivers/Kconfig index dcecc9f6e33f..9310808ee385 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -204,6 +204,8 @@ source "drivers/thunderbolt/Kconfig"
source "drivers/android/Kconfig"
+source "drivers/hooks/Kconfig" + source "drivers/gpu/trace/Kconfig"
source "drivers/nvdimm/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index 576228037718..9d67932a5037 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -178,6 +178,7 @@ obj-$(CONFIG_CORESIGHT) += hwtracing/coresight/ obj-y += hwtracing/intel_th/ obj-$(CONFIG_STM) += hwtracing/stm/ obj-$(CONFIG_ANDROID) += android/ +obj-$(CONFIG_VENDOR_HOOKS) += hooks/ obj-$(CONFIG_NVMEM) += nvmem/ obj-$(CONFIG_FPGA) += fpga/ obj-$(CONFIG_FSI) += fsi/ diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 32fb9e5b6195..53b22e26266c 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -54,15 +54,6 @@ config ANDROID_BINDER_IPC_SELFTEST exhaustively with combinations of various buffer sizes and alignments.
-config ANDROID_VENDOR_HOOKS - bool "Android Vendor Hooks" - depends on TRACEPOINTS - help - Enable vendor hooks implemented as tracepoints - - Allow vendor modules to attach to tracepoint "hooks" defined via - DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. - endif # if ANDROID
endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index d488047415a0..c9d3d0c99c25 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -4,4 +4,3 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDERFS) += binderfs.o obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o -obj-$(CONFIG_ANDROID_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/hooks/Kconfig b/drivers/hooks/Kconfig new file mode 100644 index 000000000000..1c0e33ef9a56 --- /dev/null +++ b/drivers/hooks/Kconfig @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-2.0 +menu "Vendor Hooks" + +config VENDOR_HOOKS + bool "Vendor Hooks" + depends on TRACEPOINTS + help + Enable vendor hooks implemented as tracepoints + + Allow vendor modules to attach to tracepoint "hooks" defined via + DECLARE_HOOK or DECLARE_RESTRICTED_HOOK. + +endmenu diff --git a/drivers/hooks/Makefile b/drivers/hooks/Makefile new file mode 100644 index 000000000000..159230826966 --- /dev/null +++ b/drivers/hooks/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +ccflags-y += -I$(src) # needed for trace events + +obj-$(CONFIG_VENDOR_HOOKS) += vendor_hooks.o diff --git a/drivers/android/vendor_hooks.c b/drivers/hooks/vendor_hooks.c similarity index 91% rename from drivers/android/vendor_hooks.c rename to drivers/hooks/vendor_hooks.c index 4a403a81eed3..359989d1bb32 100644 --- a/drivers/android/vendor_hooks.c +++ b/drivers/hooks/vendor_hooks.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only /* vendor_hook.c * - * Android Vendor Hook Support + * Vendor Hook Support * * Copyright (C) 2020 Google, Inc. */ diff --git a/include/trace/hooks/vendor_hooks.h b/include/trace/hooks/vendor_hooks.h index 3661e5056519..ab8864da66d8 100644 --- a/include/trace/hooks/vendor_hooks.h +++ b/include/trace/hooks/vendor_hooks.h @@ -9,7 +9,7 @@
#include <linux/tracepoint.h>
-#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_ANDROID_VENDOR_HOOKS) +#if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_VENDOR_HOOKS)
#define DECLARE_HOOK DECLARE_TRACE
@@ -111,7 +111,7 @@
#endif /* TRACE_HEADER_MULTI_READ */
-#else /* !CONFIG_TRACEPOINTS || !CONFIG_ANDROID_VENDOR_HOOKS */ +#else /* !CONFIG_TRACEPOINTS || !CONFIG_VENDOR_HOOKS */ /* suppress trace hooks */ #define DECLARE_HOOK DECLARE_EVENT_NOP #define DECLARE_RESTRICTED_HOOK(name, proto, args, cond) \
From: Jialin Zhang zhangjialin11@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4RCS8 CVE: NA
---------------------------
Enable CONFIG_VENDOR_HOOKS for x86 and arm64 by default.
Signed-off-by: Jialin Zhang zhangjialin11@huawei.com Reviewed-by: Wei Li liwei391@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 6 ++++++ arch/x86/configs/openeuler_defconfig | 6 ++++++ 2 files changed, 12 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index d8eda75eb784..7ae6b2c053be 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -6027,6 +6027,12 @@ CONFIG_USB4=m # CONFIG_ANDROID is not set # end of Android
+# +# Vendor Hooks +# +CONFIG_VENDOR_HOOKS=y +# end of Vendor Hooks + CONFIG_LIBNVDIMM=m CONFIG_BLK_DEV_PMEM=m CONFIG_ND_BLK=m diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index 443299650f6a..dd41eb690608 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -7373,6 +7373,12 @@ CONFIG_RAS=y # CONFIG_ANDROID is not set # end of Android
+# +# Vendor Hooks +# +CONFIG_VENDOR_HOOKS=y +# end of Vendor Hooks + CONFIG_LIBNVDIMM=m CONFIG_BLK_DEV_PMEM=m CONFIG_ND_BLK=m
From: Zhuling zhuling8@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4O31I
-------------------------
Move x86's pmem.c into drivers/nvdimm, rename X86_PMEM_LEGACY_DEVICE to PMEM_LEGACY_DEVICE, and add PMEM_LEGACY to control the build of nd_e820.o, so that the code can be reused by other architectures.
Note: this patch fixes the nd_e820.c build issue introduced by commit 2499317e408e ("arm64: Revert feature: Add memmap parameter and register pmem").
Signed-off-by: Zhuling zhuling8@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/Kconfig | 6 ++---- arch/x86/kernel/Makefile | 1 - drivers/nvdimm/Kconfig | 6 ++++++ drivers/nvdimm/Makefile | 2 ++ .../kernel/pmem.c => drivers/nvdimm/pmem_legacy_device.c | 0 tools/testing/nvdimm/Kbuild | 2 +- 6 files changed, 11 insertions(+), 6 deletions(-) rename arch/x86/kernel/pmem.c => drivers/nvdimm/pmem_legacy_device.c (100%)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c77ef59d7bf2..1d3176a41a29 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1667,14 +1667,12 @@ config ILLEGAL_POINTER_VALUE default 0 if X86_32 default 0xdead000000000000 if X86_64
-config X86_PMEM_LEGACY_DEVICE - bool - config X86_PMEM_LEGACY tristate "Support non-standard NVDIMMs and ADR protected memory" depends on PHYS_ADDR_T_64BIT depends on BLK_DEV - select X86_PMEM_LEGACY_DEVICE + select PMEM_LEGACY + select PMEM_LEGACY_DEVICE select NUMA_KEEP_MEMINFO if NUMA select LIBNVDIMM help diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f0606f816aa8..1e127514c824 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -130,7 +130,6 @@ obj-$(CONFIG_KVM_GUEST) += kvm.o kvmclock.o obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch.o obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o -obj-$(CONFIG_X86_PMEM_LEGACY_DEVICE) += pmem.o
obj-$(CONFIG_JAILHOUSE_GUEST) += jailhouse.o
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig index b7d1eb38b27d..632b6eda5252 100644 --- a/drivers/nvdimm/Kconfig +++ b/drivers/nvdimm/Kconfig @@ -19,6 +19,12 @@ menuconfig LIBNVDIMM
if LIBNVDIMM
+config PMEM_LEGACY + tristate + +config PMEM_LEGACY_DEVICE + bool + config BLK_DEV_PMEM tristate "PMEM: Persistent memory block device support" default LIBNVDIMM diff --git a/drivers/nvdimm/Makefile b/drivers/nvdimm/Makefile index 04077532f7ed..2098221f1c90 100644 --- a/drivers/nvdimm/Makefile +++ b/drivers/nvdimm/Makefile @@ -3,6 +3,8 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o +obj-$(CONFIG_PMEM_LEGACY_DEVICE) += pmem_legacy_device.o +obj-$(CONFIG_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_OF_PMEM) += of_pmem.o obj-$(CONFIG_VIRTIO_PMEM) += virtio_pmem.o nd_virtio.o
diff --git a/arch/x86/kernel/pmem.c b/drivers/nvdimm/pmem_legacy_device.c similarity index 100% rename from arch/x86/kernel/pmem.c rename to drivers/nvdimm/pmem_legacy_device.c diff --git a/tools/testing/nvdimm/Kbuild b/tools/testing/nvdimm/Kbuild index 47f9cc9dcd94..77aa117fbd11 100644 --- a/tools/testing/nvdimm/Kbuild +++ b/tools/testing/nvdimm/Kbuild @@ -28,7 +28,7 @@ obj-$(CONFIG_LIBNVDIMM) += libnvdimm.o obj-$(CONFIG_BLK_DEV_PMEM) += nd_pmem.o obj-$(CONFIG_ND_BTT) += nd_btt.o obj-$(CONFIG_ND_BLK) += nd_blk.o -obj-$(CONFIG_X86_PMEM_LEGACY) += nd_e820.o +obj-$(CONFIG_PMEM_LEGACY) += nd_e820.o obj-$(CONFIG_ACPI_NFIT) += nfit.o ifeq ($(CONFIG_DAX),m) obj-$(CONFIG_DAX) += dax.o
From: Zhuling zhuling8@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4O31I
----------------------------
This patch adds support for registering persistent memory (legacy) on arm64.
Firstly, support memory regions marked as protected memory, which are removed from memblock. The ranges for persistent memory are described by the 'memmap=nn[KMG]!ss[KMG]' kernel parameter; they will then be passed to the 'pmem' driver so they can be used for persistent storage. For now, the maximum number of memory regions supported is 8.
Secondly, add the ARM64_PMEM_LEGACY Kconfig option, which selects PMEM_LEGACY and PMEM_LEGACY_DEVICE, to reuse the nvdimm resource discovery and pmem device registration mechanism provided by pmem_legacy_device.c and nd_e820.c.
Note: the functions in those files are no longer used only by x86, but the file and function naming is still x86-specific; this will be updated after the feature is upstreamed.
Here are the steps to use this feature on arm64:
1. set up the memmap kernel parameter: memmap=nn[KMG]!ss[KMG], e.g. memmap=100K!0x1a0000000;
2. load the nd_e820.ko driver: modprobe nd_e820;
3. check for the pmem device in /dev, e.g. /dev/pmem0.
Signed-off-by: Zhuling zhuling8@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- .../admin-guide/kernel-parameters.txt | 7 +- arch/arm64/Kconfig | 18 +++++ arch/arm64/kernel/setup.c | 4 ++ arch/arm64/mm/Makefile | 2 + arch/arm64/mm/init.c | 7 ++ arch/arm64/mm/pmem_reserve.c | 66 +++++++++++++++++++ arch/arm64/mm/pmem_reserve.h | 13 ++++ 7 files changed, 115 insertions(+), 2 deletions(-) create mode 100644 arch/arm64/mm/pmem_reserve.c create mode 100644 arch/arm64/mm/pmem_reserve.h
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5a0a68b35bb1..4b38e33741e4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2831,10 +2831,13 @@ will be eaten.
memmap=nn[KMG]!ss[KMG] - [KNL,X86] Mark specific memory as protected. + [KNL,X86,ARM64] Mark specific memory as protected. Region of memory to be used, from ss to ss+nn. - The memory region may be marked as e820 type 12 (0xc) + [X86] The memory region may be marked as e820 type 12 (0xc) and is NVDIMM or ADR memory. + [ARM64] The maximum memory regions supported is 8. + Example: + memmap=100K!0x1a0000000
memmap=<size>%<offset>-<oldtype>+<newtype> [KNL,ACPI] Convert memory within the specified region diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 2cab963563d9..e0c2f9a1a256 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1325,6 +1325,24 @@ config RODATA_FULL_DEFAULT_ENABLED This requires the linear region to be mapped down to pages, which may adversely affect performance in some cases.
+config ARM64_PMEM_RESERVE + bool + +config ARM64_PMEM_LEGACY + tristate "Support Persistent Memory (legacy) register via protected memory" + depends on BLK_DEV + select ARM64_PMEM_RESERVE + select PMEM_LEGACY + select PMEM_LEGACY_DEVICE + select LIBNVDIMM + help + Protected memory ranges for persistent memory are described by the + 'memmap=nn[KMG]!ss[KMG]' kernel parameter". + The kernel will offer those memory regions to the 'pmem' driver so + they can be used for persistent storage. + + Say Y if unsure. + config ARM64_SW_TTBR0_PAN bool "Emulate Privileged Access Never using TTBR0_EL1 switching" help diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 58d69e2e7538..5e0713f5120e 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -52,6 +52,8 @@ #include <asm/xen/hypervisor.h> #include <asm/mmu_context.h>
+#include "../mm/pmem_reserve.h" + static int num_standard_resources; static struct resource *standard_resources;
@@ -297,6 +299,8 @@ static void __init request_standard_resources(void)
request_pin_mem_res(res); } + + request_pmem_res_resource(); }
static int __init reserve_memblock_reserved_regions(void) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 5ead3c3de3b6..42e107d6da4f 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -13,3 +13,5 @@ KASAN_SANITIZE_physaddr.o += n
obj-$(CONFIG_KASAN) += kasan_init.o KASAN_SANITIZE_kasan_init.o := n + +obj-$(CONFIG_ARM64_PMEM_RESERVE) += pmem_reserve.o diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 1364d52cbaa8..5ab9dd7d55d9 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -45,6 +45,8 @@ #include <asm/tlb.h> #include <asm/alternative.h>
+#include "pmem_reserve.h" + /* * We need to be able to catch inadvertent references to memstart_addr * that occur (potentially in generic code) before arm64_memblock_init() @@ -394,6 +396,9 @@ static int __init parse_memmap_one(char *p) start_at = memparse(p + 1, &p); memblock_reserve(start_at, mem_size); memblock_mark_memmap(start_at, mem_size); + } else if (*p == '!') { + start_at = memparse(p + 1, &p); + setup_reserve_pmem(start_at, mem_size); } else pr_info("Unrecognized memmap option, please check the parameter.\n");
@@ -590,6 +595,8 @@ void __init bootmem_init(void) reserve_quick_kexec(); #endif
+ reserve_pmem(); + reserve_pin_memory_res();
memblock_dump_all(); diff --git a/arch/arm64/mm/pmem_reserve.c b/arch/arm64/mm/pmem_reserve.c new file mode 100644 index 000000000000..70fec28409ad --- /dev/null +++ b/arch/arm64/mm/pmem_reserve.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) "pmem_reserve: " fmt + +#include <linux/memblock.h> +#include <linux/ioport.h> +#include <linux/types.h> + +#define MAX_REGIONS 8 +static int pmem_res_cnt; +struct resource pmem_res[MAX_REGIONS]; + +void __init setup_reserve_pmem(u64 start, u64 size) +{ + if (pmem_res_cnt >= MAX_REGIONS) { + pr_err("protected memory regions above upper limit %d\n", MAX_REGIONS); + return; + } + + pmem_res[pmem_res_cnt].start = start; + pmem_res[pmem_res_cnt].end = start + size - 1; + pmem_res_cnt++; +} + +void __init request_pmem_res_resource(void) +{ + struct resource *res; + int i; + + for (i = 0; i < pmem_res_cnt; i++) { + res = &pmem_res[i]; + res->name = "Persistent Memory (legacy)"; + res->flags = IORESOURCE_MEM; + res->desc = IORES_DESC_PERSISTENT_MEMORY_LEGACY; + if (res->start && res->end) + request_resource(&iomem_resource, res); + } +} + +void __init reserve_pmem(void) +{ + struct resource *res; + phys_addr_t size; + int i; + + for (i = 0; i < pmem_res_cnt; i++) { + res = &pmem_res[i]; + size = res->end - res->start; + if (!memblock_is_region_memory(res->start, size)) { + pr_warn("region[%pa-%pa] is not in memory\n", + &res->start, &res->end); + res->start = res->end = 0; + continue; + } + + if (memblock_is_region_reserved(res->start, size)) { + pr_warn("region[%pa-%pa] overlaps reserved memory\n", + &res->start, &res->end); + res->start = res->end = 0; + continue; + } + + memblock_remove(res->start, size); + pr_info("region %d: [%pa-%pa] (%lluMB)\n", i, &res->start, &res->end, size >> 20); + } +} diff --git a/arch/arm64/mm/pmem_reserve.h b/arch/arm64/mm/pmem_reserve.h new file mode 100644 index 000000000000..d143198c9696 --- /dev/null +++ b/arch/arm64/mm/pmem_reserve.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include <linux/types.h> + +#ifdef CONFIG_ARM64_PMEM_RESERVE +void __init setup_reserve_pmem(u64 start, u64 size); +void __init reserve_pmem(void); +void __init request_pmem_res_resource(void); +#else +static inline void __init setup_reserve_pmem(u64 start, u64 size) {} +static inline void __init reserve_pmem(void) {} +static inline void __init request_pmem_res_resource(void) {} +#endif
From: Zhuling zhuling8@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4O31I
--------------------------
Enable the legacy pmem register feature for arm64.
Signed-off-by: Zhuling zhuling8@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/arm64/configs/openeuler_defconfig | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 7ae6b2c053be..f4b62b7b5766 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -416,6 +416,8 @@ CONFIG_ARM64_CPU_PARK=y CONFIG_FORCE_MAX_ZONEORDER=11 CONFIG_UNMAP_KERNEL_AT_EL0=y CONFIG_RODATA_FULL_DEFAULT_ENABLED=y +CONFIG_ARM64_PMEM_RESERVE=y +CONFIG_ARM64_PMEM_LEGACY=m # CONFIG_ARM64_SW_TTBR0_PAN is not set CONFIG_ARM64_TAGGED_ADDR_ABI=y CONFIG_ARM64_ILP32=y @@ -6034,6 +6036,8 @@ CONFIG_VENDOR_HOOKS=y # end of Vendor Hooks
CONFIG_LIBNVDIMM=m +CONFIG_PMEM_LEGACY=m +CONFIG_PMEM_LEGACY_DEVICE=y CONFIG_BLK_DEV_PMEM=m CONFIG_ND_BLK=m CONFIG_ND_CLAIM=y