From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.200 commit 1dabafa9f61118b1377fde424d9a94bf8dbf2813
--------------------------------
commit cbcf01128d0a92e131bd09f1688fe032480b65ca upstream.
unix_gc() assumes that candidate sockets can never gain an external reference (i.e. be installed into an fd) while the unix_gc_lock is held. Except for MSG_PEEK this is guaranteed by modifying inflight count under the unix_gc_lock.
MSG_PEEK does not touch any variable protected by unix_gc_lock (file count is not), yet it needs to be serialized with garbage collection. Do this by locking/unlocking unix_gc_lock:
1) increment file count
2) lock/unlock barrier to make sure incremented file count is visible to garbage collection
3) install file into fd
This is a lock barrier (unlike smp_mb()) that ensures that garbage collection is run completely before or completely after the barrier.
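Expressed as a sketch (illustrative code, not the literal patch; the actual barrier is added in unix_peek_fds() in the diff below), the required ordering is:

	get_file(file);			/* 1) file count now elevated */

	spin_lock(&unix_gc_lock);	/* 2) lock/unlock pair: a gc that starts
					 *    after this point sees the elevated
					 *    count; a gc already in progress
					 *    finishes before we continue */
	spin_unlock(&unix_gc_lock);

	fd_install(fd, file);		/* 3) only now expose the fd */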
Cc: stable@vger.kernel.org Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/unix/af_unix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 337c4797ab167..98c253afa0db2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1517,6 +1517,53 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer) return err; }
+static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* + * Garbage collection of unix sockets starts by selecting a set of + * candidate sockets which have reference only from being in flight + * (total_refs == inflight_refs). This condition is checked once during + * the candidate collection phase, and candidates are marked as such, so + * that non-candidates can later be ignored. While inflight_refs is + * protected by unix_gc_lock, total_refs (file count) is not, hence this + * is an instantaneous decision. + * + * Once a candidate, however, the socket must not be reinstalled into a + * file descriptor while the garbage collection is in progress. + * + * If the above conditions are met, then the directed graph of + * candidates (*) does not change while unix_gc_lock is held. + * + * Any operations that changes the file count through file descriptors + * (dup, close, sendmsg) does not change the graph since candidates are + * not installed in fds. + * + * Dequeing a candidate via recvmsg would install it into an fd, but + * that takes unix_gc_lock to decrement the inflight count, so it's + * serialized with garbage collection. + * + * MSG_PEEK is special in that it does not change the inflight count, + * yet does install the socket into an fd. The following lock/unlock + * pair is to ensure serialization with garbage collection. It must be + * done between incrementing the file count and installing the file into + * an fd. + * + * If garbage collection starts after the barrier provided by the + * lock/unlock, then it will see the elevated refcount and not mark this + * as a candidate. If a garbage collection is already in progress + * before the file count was incremented, then the lock/unlock pair will + * ensure that garbage collection is finished before progressing to + * installing the fd. + * + * (*) A -> B where B is on the queue of A or B is on the queue of C + * which is on the queue of listening socket A. + */ + spin_lock(&unix_gc_lock); + spin_unlock(&unix_gc_lock); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; @@ -2142,7 +2189,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, sk_peek_offset_fwd(sk, size);
if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb); } err = (flags & MSG_TRUNC) ? skb->len - skip : size;
@@ -2383,7 +2430,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, /* It is questionable, see note in unix_dgram_recvmsg. */ if (UNIXCB(skb).fp) - scm.fp = scm_fp_dup(UNIXCB(skb).fp); + unix_peek_fds(&scm, skb);
sk_peek_offset_fwd(sk, chunk);
stable inclusion from linux-4.19.200 commit f9dd1e4e9d39e799fbe2be9ac7e6b43a9567ff8c
--------------------------------
[ Upstream commit 996af62167d0e0ec69b938a3561e96f84ffff1aa ]
I got kmemleak report when doing fuzz test:
BUG: memory leak
unreferenced object 0xffff88810c239500 (size 64):
  comm "syz-executor940", pid 882, jiffies 4294712870 (age 14.631s)
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 01 00 00 00 01 02 00 04  ................
  backtrace:
    [<00000000a323afa4>] slab_alloc_node mm/slub.c:2972 [inline]
    [<00000000a323afa4>] slab_alloc mm/slub.c:2980 [inline]
    [<00000000a323afa4>] __kmalloc+0x167/0x340 mm/slub.c:4130
    [<000000005034ca11>] kmalloc include/linux/slab.h:595 [inline]
    [<000000005034ca11>] mrp_attr_create net/802/mrp.c:276 [inline]
    [<000000005034ca11>] mrp_request_join+0x265/0x550 net/802/mrp.c:530
    [<00000000fcfd81f3>] vlan_mvrp_request_join+0x145/0x170 net/8021q/vlan_mvrp.c:40
    [<000000009258546e>] vlan_dev_open+0x477/0x890 net/8021q/vlan_dev.c:292
    [<0000000059acd82b>] __dev_open+0x281/0x410 net/core/dev.c:1609
    [<000000004e6dc695>] __dev_change_flags+0x424/0x560 net/core/dev.c:8767
    [<00000000471a09af>] rtnl_configure_link+0xd9/0x210 net/core/rtnetlink.c:3122
    [<0000000037a4672b>] __rtnl_newlink+0xe08/0x13e0 net/core/rtnetlink.c:3448
    [<000000008d5d0fda>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488
    [<000000004882fe39>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5552
    [<00000000907e6c54>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504
    [<00000000e7d7a8c4>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline]
    [<00000000e7d7a8c4>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340
    [<00000000e0645d50>] netlink_sendmsg+0x78e/0xc90 net/netlink/af_netlink.c:1929
    [<00000000c24559b7>] sock_sendmsg_nosec net/socket.c:654 [inline]
    [<00000000c24559b7>] sock_sendmsg+0x139/0x170 net/socket.c:674
    [<00000000fc210bc2>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350
    [<00000000be4577b5>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404
When mrp_request_leave() is called after mrp_request_join(), attr->state is set to MRP_APPLICANT_VO, so mrp_attr_destroy() won't be called for the last TX event in mrp_uninit_applicant(), and the applicant's attr will be leaked. To fix this leak, iterate over and free each attr of the applicant before returning from mrp_uninit_applicant().
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org --- net/802/mrp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/net/802/mrp.c b/net/802/mrp.c index a808dd5bbb27a..32f87d458f054 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -295,6 +295,19 @@ static void mrp_attr_destroy(struct mrp_applicant *app, struct mrp_attr *attr) kfree(attr); }
+static void mrp_attr_destroy_all(struct mrp_applicant *app) +{ + struct rb_node *node, *next; + struct mrp_attr *attr; + + for (node = rb_first(&app->mad); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct mrp_attr, node); + mrp_attr_destroy(app, attr); + } +} + static int mrp_pdu_init(struct mrp_applicant *app) { struct sk_buff *skb; @@ -898,6 +911,7 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl)
spin_lock_bh(&app->lock); mrp_mad_event(app, MRP_EVENT_TX); + mrp_attr_destroy_all(app); mrp_pdu_queue(app); spin_unlock_bh(&app->lock);
stable inclusion from linux-4.19.200 commit e954107513e5e984821591b9b0ee4b002fcb63c6
--------------------------------
[ Upstream commit 42ca63f980842918560b25f0244307fd83b4777c ]
I got kmemleak report when doing fuzz test:
BUG: memory leak
unreferenced object 0xffff88810c909b80 (size 64):
  comm "syz", pid 957, jiffies 4295220394 (age 399.090s)
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 08 00 00 00 01 02 00 04  ................
  backtrace:
    [<00000000ca1f2e2e>] garp_request_join+0x285/0x3d0
    [<00000000bf153351>] vlan_gvrp_request_join+0x15b/0x190
    [<0000000024005e72>] vlan_dev_open+0x706/0x980
    [<00000000dc20c4d4>] __dev_open+0x2bb/0x460
    [<0000000066573004>] __dev_change_flags+0x501/0x650
    [<0000000035b42f83>] rtnl_configure_link+0xee/0x280
    [<00000000a5e69de0>] __rtnl_newlink+0xed5/0x1550
    [<00000000a5258f4a>] rtnl_newlink+0x66/0x90
    [<00000000506568ee>] rtnetlink_rcv_msg+0x439/0xbd0
    [<00000000b7eaeae1>] netlink_rcv_skb+0x14d/0x420
    [<00000000c373ce66>] netlink_unicast+0x550/0x750
    [<00000000ec74ce74>] netlink_sendmsg+0x88b/0xda0
    [<00000000381ff246>] sock_sendmsg+0xc9/0x120
    [<000000008f6a2db3>] ____sys_sendmsg+0x6e8/0x820
    [<000000008d9c1735>] ___sys_sendmsg+0x145/0x1c0
    [<00000000aa39dd8b>] __sys_sendmsg+0xfe/0x1d0
When garp_request_leave() is called after garp_request_join(), attr->state is set to GARP_APPLICANT_VO, so garp_attr_destroy() won't be called for the last transmit event in garp_uninit_applicant(), and the applicant's attr will be leaked. To fix this leak, iterate over and free each attr of the applicant before returning from garp_uninit_applicant().
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org --- net/802/garp.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/net/802/garp.c b/net/802/garp.c index 7f50d47470bd4..8e19f51833d6f 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -206,6 +206,19 @@ static void garp_attr_destroy(struct garp_applicant *app, struct garp_attr *attr kfree(attr); }
+static void garp_attr_destroy_all(struct garp_applicant *app) +{ + struct rb_node *node, *next; + struct garp_attr *attr; + + for (node = rb_first(&app->gid); + next = node ? rb_next(node) : NULL, node != NULL; + node = next) { + attr = rb_entry(node, struct garp_attr, node); + garp_attr_destroy(app, attr); + } +} + static int garp_pdu_init(struct garp_applicant *app) { struct sk_buff *skb; @@ -612,6 +625,7 @@ void garp_uninit_applicant(struct net_device *dev, struct garp_application *appl
spin_lock_bh(&app->lock); garp_gid_event(app, GARP_EVENT_TRANSMIT_PDU); + garp_attr_destroy_all(app); garp_pdu_queue(app); spin_unlock_bh(&app->lock);
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.200 commit c1a5cd807960d07381364c7b05aa3a43eb6d3a2f
--------------------------------
[ Upstream commit 0dbffbb5335a1e3aa6855e4ee317e25e669dd302 ]
sk_ll_usec is read locklessly from sk_can_busy_loop() while another thread can change its value in sock_setsockopt().
This is correct but needs annotations.
BUG: KCSAN: data-race in __skb_try_recv_datagram / sock_setsockopt
write to 0xffff88814eb5f904 of 4 bytes by task 14011 on cpu 0:
 sock_setsockopt+0x1287/0x2090 net/core/sock.c:1175
 __sys_setsockopt+0x14f/0x200 net/socket.c:2100
 __do_sys_setsockopt net/socket.c:2115 [inline]
 __se_sys_setsockopt net/socket.c:2112 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2112
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff88814eb5f904 of 4 bytes by task 14001 on cpu 1:
 sk_can_busy_loop include/net/busy_poll.h:41 [inline]
 __skb_try_recv_datagram+0x14f/0x320 net/core/datagram.c:273
 unix_dgram_recvmsg+0x14c/0x870 net/unix/af_unix.c:2101
 unix_seqpacket_recvmsg+0x5a/0x70 net/unix/af_unix.c:2067
 ____sys_recvmsg+0x15d/0x310 include/linux/uio.h:244
 ___sys_recvmsg net/socket.c:2598 [inline]
 do_recvmmsg+0x35c/0x9f0 net/socket.c:2692
 __sys_recvmmsg net/socket.c:2771 [inline]
 __do_sys_recvmmsg net/socket.c:2794 [inline]
 __se_sys_recvmmsg net/socket.c:2787 [inline]
 __x64_sys_recvmmsg+0xcf/0x150 net/socket.c:2787
 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47
 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x00000000 -> 0x00000101
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 14001 Comm: syz-executor.3 Not tainted 5.13.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
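Annotated, the idiom applied in the hunks below is:

	/* writer, in sock_setsockopt(), runs under lock_sock() */
	WRITE_ONCE(sk->sk_ll_usec, val);

	/* lockless reader, in sk_can_busy_loop() */
	return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current);

The annotations document the intentional data race and keep the compiler from tearing, fusing, or re-reading the accesses.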
Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/busy_poll.h | 2 +- net/core/sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index cf8f792743ec2..c76a5e9894dac 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -48,7 +48,7 @@ static inline bool net_busy_loop_on(void)
static inline bool sk_can_busy_loop(const struct sock *sk) { - return sk->sk_ll_usec && !signal_pending(current); + return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); }
bool sk_busy_loop_end(void *p, unsigned long start_time); diff --git a/net/core/sock.c b/net/core/sock.c index cb27f9cc7ab99..a0c1d8531e86e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1011,7 +1011,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname, if (val < 0) ret = -EINVAL; else - sk->sk_ll_usec = val; + WRITE_ONCE(sk->sk_ll_usec, val); } break; #endif
From: Xin Long lucien.xin@gmail.com
stable inclusion from linux-4.19.200 commit 53012dd6ca2f3c9420b5cc447279375a90290fb4
--------------------------------
[ Upstream commit 1d11fa231cabeae09a95cb3e4cf1d9dd34e00f08 ]
The doc draft-stewart-tsvwg-sctp-ipv4-00, which restricted the 198.18.0.0/15 block of addresses, was never published. As private addresses, they should be allowed for use in SCTP.
As Michael Tuexen suggested, this patch moves these 198 addresses from the unusable to the private scope.
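For reference, the kernel's ipv4_is_test_198() helper that this patch relocates matches that block with a single mask compare; a standalone userspace sketch (assuming the usual network-byte-order address representation):

#include <stdbool.h>
#include <stdint.h>
#include <arpa/inet.h>

/* Sketch of the ipv4_is_test_198() check: true for 198.18.0.0/15,
 * i.e. 198.18.0.0 through 198.19.255.255 (RFC 2544 benchmarking). */
static bool is_test_198(uint32_t addr_be)
{
	return (addr_be & htonl(0xfffe0000)) == htonl(0xc6120000);
}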
Reported-by: Sérgio surkamp@gmail.com Signed-off-by: Xin Long lucien.xin@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/sctp/constants.h | 4 +--- net/sctp/protocol.c | 3 ++- 2 files changed, 3 insertions(+), 4 deletions(-)
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h index 48d74674d5e95..bc22e44ffcdf7 100644 --- a/include/net/sctp/constants.h +++ b/include/net/sctp/constants.h @@ -348,8 +348,7 @@ enum { #define SCTP_SCOPE_POLICY_MAX SCTP_SCOPE_POLICY_LINK
/* Based on IPv4 scoping <draft-stewart-tsvwg-sctp-ipv4-00.txt>, - * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 198.18.0.0/24, - * 192.88.99.0/24. + * SCTP IPv4 unusable addresses: 0.0.0.0/8, 224.0.0.0/4, 192.88.99.0/24. * Also, RFC 8.4, non-unicast addresses are not considered valid SCTP * addresses. */ @@ -357,7 +356,6 @@ enum { ((htonl(INADDR_BROADCAST) == a) || \ ipv4_is_multicast(a) || \ ipv4_is_zeronet(a) || \ - ipv4_is_test_198(a) || \ ipv4_is_anycast_6to4(a))
/* Flags used for the bind address copy functions. */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index dd51256582556..7207a9769f1a9 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -412,7 +412,8 @@ static enum sctp_scope sctp_v4_scope(union sctp_addr *addr) retval = SCTP_SCOPE_LINK; } else if (ipv4_is_private_10(addr->v4.sin_addr.s_addr) || ipv4_is_private_172(addr->v4.sin_addr.s_addr) || - ipv4_is_private_192(addr->v4.sin_addr.s_addr)) { + ipv4_is_private_192(addr->v4.sin_addr.s_addr) || + ipv4_is_test_198(addr->v4.sin_addr.s_addr)) { retval = SCTP_SCOPE_PRIVATE; } else { retval = SCTP_SCOPE_GLOBAL;
From: Willem de Bruijn willemb@google.com
stable inclusion from linux-4.19.128 commit 8920e8ae16a89bebd4d98ec6c7b306b1e3e06722
--------------------------------
[ Upstream commit 6dd912f82680761d8fb6b1bb274a69d4c7010988 ]
Syzkaller again found a path to a kernel crash through bad gso input: a packet with gso size exceeding len.
These packets are dropped in tcp_gso_segment and udp[46]_ufo_fragment. But they may affect gso size calculations earlier in the path.
Now that we have thlen as of commit 9274124f023b ("net: stricter validation of untrusted gso packets"), check gso_size at entry too.
Fixes: bfd5f4a3d605 ("packet: Add GSO/csum offload support.") Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: Willem de Bruijn willemb@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/virtio_net.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 92570a4c211d1..5131a6975d8d1 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -31,6 +31,7 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, { unsigned int gso_type = 0; unsigned int thlen = 0; + unsigned int p_off = 0; unsigned int ip_proto;
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { @@ -70,7 +71,8 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, if (!skb_partial_csum_set(skb, start, off)) return -EINVAL;
- if (skb_transport_offset(skb) + thlen > skb_headlen(skb)) + p_off = skb_transport_offset(skb) + thlen; + if (p_off > skb_headlen(skb)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. @@ -94,17 +96,25 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, return -EINVAL; }
- if (keys.control.thoff + thlen > skb_headlen(skb) || + p_off = keys.control.thoff + thlen; + if (p_off > skb_headlen(skb) || keys.basic.ip_proto != ip_proto) return -EINVAL;
skb_set_transport_header(skb, keys.control.thoff); + } else if (gso_type) { + p_off = thlen; + if (p_off > skb_headlen(skb)) + return -EINVAL; } }
if (hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) { u16 gso_size = __virtio16_to_cpu(little_endian, hdr->gso_size);
+ if (skb->len - p_off <= gso_size) + return -EINVAL; + skb_shinfo(skb)->gso_size = gso_size; skb_shinfo(skb)->gso_type = gso_type;
From: Yuya Kusakabe yuya.kusakabe@gmail.com
stable inclusion from linux-4.19.187 commit 15f135b4ea6e31215d184ef26d0bbb44e1cbe9f5
--------------------------------
[ Upstream commit 503d539a6e417b018616bf3060e0b5814fafce47 ]
Implement support for transferring XDP metadata into the skb for the virtio_net driver: before calling into the program, xdp.data_meta is set to point at xdp.data, and on program return with a pass verdict, we call into skb_metadata_set().
Tested with the script at https://github.com/higebu/virtio_net-xdp-metadata-test.
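For context, an XDP program produces metadata by growing the area in front of xdp.data with bpf_xdp_adjust_meta(); a minimal, self-contained sketch (the 4-byte value and program name are illustrative, not from the patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_store_meta(struct xdp_md *ctx)
{
	__u32 *meta;
	void *data;

	/* Grow the metadata area by 4 bytes, in front of the packet data. */
	if (bpf_xdp_adjust_meta(ctx, -(int)sizeof(*meta)))
		return XDP_PASS;

	data = (void *)(long)ctx->data;
	meta = (void *)(long)ctx->data_meta;

	/* The verifier requires this bounds check before the store. */
	if ((void *)(meta + 1) > data)
		return XDP_PASS;

	*meta = 0xcafe;		/* e.g. a mark for a consumer of skb metadata */
	return XDP_PASS;	/* pass verdict: the driver calls skb_metadata_set() */
}

char _license[] SEC("license") = "GPL";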
Signed-off-by: Yuya Kusakabe yuya.kusakabe@gmail.com Signed-off-by: Daniel Borkmann daniel@iogearbox.net Acked-by: Jason Wang jasowang@redhat.com Acked-by: Michael S. Tsirkin mst@redhat.com Link: https://lore.kernel.org/bpf/20200225033212.437563-2-yuya.kusakabe@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/virtio_net.c | 52 ++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 20 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 6d0bb3ce37e80..cdf4a5bbcfd03 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -383,7 +383,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, struct receive_queue *rq, struct page *page, unsigned int offset, unsigned int len, unsigned int truesize, - bool hdr_valid) + bool hdr_valid, unsigned int metasize) { struct sk_buff *skb; struct virtio_net_hdr_mrg_rxbuf *hdr; @@ -405,6 +405,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, else hdr_padded_len = sizeof(struct padded_vnet_hdr);
+ /* hdr_valid means no XDP, so we can copy the vnet header */ if (hdr_valid) memcpy(hdr, p, hdr_len);
@@ -417,6 +418,11 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, copy = skb_tailroom(skb); skb_put_data(skb, p, copy);
+ if (metasize) { + __skb_pull(skb, metasize); + skb_metadata_set(skb, metasize); + } + len -= copy; offset += copy;
@@ -462,10 +468,6 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, struct virtio_net_hdr_mrg_rxbuf *hdr; int err;
- /* virtqueue want to use data area in-front of packet */ - if (unlikely(xdpf->metasize > 0)) - return -EOPNOTSUPP; - if (unlikely(xdpf->headroom < vi->hdr_len)) return -EOVERFLOW;
@@ -656,6 +658,7 @@ static struct sk_buff *receive_small(struct net_device *dev, unsigned int delta = 0; struct page *xdp_page; int err; + unsigned int metasize = 0;
len -= vi->hdr_len; stats->bytes += len; @@ -695,8 +698,8 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len; xdp.data = xdp.data_hard_start + xdp_headroom; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + len; + xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq; orig_data = xdp.data; act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -707,6 +710,7 @@ static struct sk_buff *receive_small(struct net_device *dev, /* Recalculate length in case bpf program changed it */ delta = orig_data - xdp.data; len = xdp.data_end - xdp.data; + metasize = xdp.data - xdp.data_meta; break; case XDP_TX: stats->xdp_tx++; @@ -752,6 +756,9 @@ static struct sk_buff *receive_small(struct net_device *dev, memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len); } /* keep zeroed vnet hdr since packet was changed by bpf */
+ if (metasize) + skb_metadata_set(skb, metasize); + err: return skb;
@@ -772,8 +779,8 @@ static struct sk_buff *receive_big(struct net_device *dev, struct virtnet_rq_stats *stats) { struct page *page = buf; - struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, - PAGE_SIZE, true); + struct sk_buff *skb = + page_to_skb(vi, rq, page, 0, len, PAGE_SIZE, true, 0);
stats->bytes += len - vi->hdr_len; if (unlikely(!skb)) @@ -805,6 +812,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, unsigned int truesize; unsigned int headroom = mergeable_ctx_to_headroom(ctx); int err; + unsigned int metasize = 0;
head_skb = NULL; stats->bytes += len - vi->hdr_len; @@ -851,8 +859,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, data = page_address(xdp_page) + offset; xdp.data_hard_start = data - VIRTIO_XDP_HEADROOM + vi->hdr_len; xdp.data = data + vi->hdr_len; - xdp_set_data_meta_invalid(&xdp); xdp.data_end = xdp.data + (len - vi->hdr_len); + xdp.data_meta = xdp.data; xdp.rxq = &rq->xdp_rxq;
act = bpf_prog_run_xdp(xdp_prog, &xdp); @@ -860,24 +868,27 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
switch (act) { case XDP_PASS: + metasize = xdp.data - xdp.data_meta; + /* recalculate offset to account for any header - * adjustments. Note other cases do not build an - * skb and avoid using offset + * adjustments and minus the metasize to copy the + * metadata in page_to_skb(). Note other cases do not + * build an skb and avoid using offset */ - offset = xdp.data - - page_address(xdp_page) - vi->hdr_len; + offset = xdp.data - page_address(xdp_page) - + vi->hdr_len - metasize;
- /* recalculate len if xdp.data or xdp.data_end were - * adjusted + /* recalculate len if xdp.data, xdp.data_end or + * xdp.data_meta were adjusted */ - len = xdp.data_end - xdp.data + vi->hdr_len; + len = xdp.data_end - xdp.data + vi->hdr_len + metasize; /* We can only create skb based on xdp_page. */ if (unlikely(xdp_page != page)) { rcu_read_unlock(); put_page(page); - head_skb = page_to_skb(vi, rq, xdp_page, - offset, len, - PAGE_SIZE, false); + head_skb = page_to_skb(vi, rq, xdp_page, offset, + len, PAGE_SIZE, false, + metasize); return head_skb; } break; @@ -933,7 +944,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev, goto err_skb; }
- head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog); + head_skb = page_to_skb(vi, rq, page, offset, len, truesize, !xdp_prog, + metasize); curr_skb = head_skb;
if (unlikely(!curr_skb))
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.201 commit 16851e34b621bc7e652c508bb28c47948fb86958
--------------------------------
commit 0f6925b3e8da0dbbb52447ca8a8b42b371aac7db upstream.
Xuan Zhuo reported that commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") brought a ~10% performance drop.
The reason for the performance drop was that GRO was forced to chain sk_buffs (using skb_shinfo(skb)->frag_list), which uses more memory and also causes packet consumers to incur a lot of overhead handling all the tiny skbs.
It turns out that virtio_net's page_to_skb() has a wrong strategy: it allocates skbs with GOOD_COPY_LEN (128) bytes in skb->head, then copies 128 bytes from the page, before feeding the packet to the GRO stack.
This was suboptimal before commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") because GRO was using 2 frags per MSS, meaning we were not packing MSS with 100% efficiency.
The fix is to pull only the Ethernet header in page_to_skb().
Then, we change virtio_net_hdr_to_skb() to pull the missing headers, instead of assuming they were already pulled by callers.
This fixes the performance regression, but could also allow virtio_net to accept packets with more than 128 bytes of headers.
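The core of the change, annotated (see the hunks in the diff below): a bare skb_headlen() comparison rejects headers that are not already in the linear area, while pskb_may_pull() first tries to pull them in from the paged data:

	/* before: header bytes must already be linear, or the packet is dropped */
	if (p_off > skb_headlen(skb))
		return -EINVAL;

	/* after: linearize the missing header bytes on demand; fail only if
	 * the packet really is shorter than p_off */
	if (!pskb_may_pull(skb, p_off))
		return -EINVAL;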
Many thanks to Xuan Zhuo for his report, and his tests/help.
Fixes: 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") Reported-by: Xuan Zhuo xuanzhuo@linux.alibaba.com Link: https://www.spinics.net/lists/netdev/msg731397.html Co-Developed-by: Xuan Zhuo xuanzhuo@linux.alibaba.com Signed-off-by: Xuan Zhuo xuanzhuo@linux.alibaba.com Signed-off-by: Eric Dumazet edumazet@google.com Cc: "Michael S. Tsirkin" mst@redhat.com Cc: Jason Wang jasowang@redhat.com Cc: virtualization@lists.linux-foundation.org Acked-by: Jason Wang jasowang@redhat.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Matthieu Baerts matthieu.baerts@tessares.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/virtio_net.c | 10 +++++++--- include/linux/virtio_net.h | 14 +++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index cdf4a5bbcfd03..a8768588e7373 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -413,9 +413,13 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi, offset += hdr_padded_len; p += hdr_padded_len;
- copy = len; - if (copy > skb_tailroom(skb)) - copy = skb_tailroom(skb); + /* Copy all frame if it fits skb->head, otherwise + * we let virtio_net_hdr_to_skb() and GRO pull headers as needed. + */ + if (len <= skb_tailroom(skb)) + copy = len; + else + copy = ETH_HLEN + metasize; skb_put_data(skb, p, copy);
if (metasize) { diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 5131a6975d8d1..3e2ee7739c11a 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -65,14 +65,18 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, skb_reset_mac_header(skb);
if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { - u16 start = __virtio16_to_cpu(little_endian, hdr->csum_start); - u16 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); + u32 start = __virtio16_to_cpu(little_endian, hdr->csum_start); + u32 off = __virtio16_to_cpu(little_endian, hdr->csum_offset); + u32 needed = start + max_t(u32, thlen, off + sizeof(__sum16)); + + if (!pskb_may_pull(skb, needed)) + return -EINVAL;
if (!skb_partial_csum_set(skb, start, off)) return -EINVAL;
p_off = skb_transport_offset(skb) + thlen; - if (p_off > skb_headlen(skb)) + if (!pskb_may_pull(skb, p_off)) return -EINVAL; } else { /* gso packets without NEEDS_CSUM do not set transport_offset. @@ -97,14 +101,14 @@ static inline int virtio_net_hdr_to_skb(struct sk_buff *skb, }
p_off = keys.control.thoff + thlen; - if (p_off > skb_headlen(skb) || + if (!pskb_may_pull(skb, p_off) || keys.basic.ip_proto != ip_proto) return -EINVAL;
skb_set_transport_header(skb, keys.control.thoff); } else if (gso_type) { p_off = thlen; - if (p_off > skb_headlen(skb)) + if (!pskb_may_pull(skb, p_off)) return -EINVAL; } }
From: Florian Westphal fw@strlen.de
stable inclusion from linux-4.19.201 commit 512fd52e2091560de66da26799b3f1ca7ca1d41b
--------------------------------
[ Upstream commit 30a56a2b881821625f79837d4d968c679852444e ]
In case the entry is evicted via garbage collection, there is a delay between the timeout and the eviction event.
This adjusts the stop value based on how much time has passed.
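Annotated, the adjustment in the hunk below reads: ct->timeout holds the absolute expiry in jiffies, so once the entry has expired, ct->timeout - nfct_time_stamp is negative by the amount of the delay:

	s32 timeout = ct->timeout - nfct_time_stamp;	/* negative once expired */

	tstamp->stop = ktime_get_real_ns();
	if (timeout < 0)				/* evicted after expiry: */
		tstamp->stop -= jiffies_to_nsecs(-timeout); /* back-date the stop */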
Fixes: b87a2f9199ea82 ("netfilter: conntrack: add gc worker to remove timed-out entries") Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nf_conntrack_core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4edf6c70a8fd6..34a9d69295d0d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -630,8 +630,13 @@ bool nf_ct_delete(struct nf_conn *ct, u32 portid, int report) return false;
tstamp = nf_conn_tstamp_find(ct); - if (tstamp && tstamp->stop == 0) + if (tstamp) { + s32 timeout = ct->timeout - nfct_time_stamp; + tstamp->stop = ktime_get_real_ns(); + if (timeout < 0) + tstamp->stop -= jiffies_to_nsecs(-timeout); + }
if (nf_conntrack_event_report(IPCT_DESTROY, ct, portid, report) < 0) {
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.201 commit 1cb5995a39eb3dc97a7539d00d2c82be030e0bb8
--------------------------------
[ Upstream commit a33f387ecd5aafae514095c2c4a8c24f7aea7e8b ]
nft_nat reports a bogus EAFNOSUPPORT if no layer 3 information is specified.
Fixes: d07db9884a5f ("netfilter: nf_tables: introduce nft_validate_register_load()") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nft_nat.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 3e82a7d0df2a4..2c3d7ff6f58a7 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -153,7 +153,9 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, alen = FIELD_SIZEOF(struct nf_nat_range, min_addr.ip6); break; default: - return -EAFNOSUPPORT; + if (tb[NFTA_NAT_REG_ADDR_MIN]) + return -EAFNOSUPPORT; + break; } priv->family = family;
From: Pravin B Shelar pshelar@ovn.org
stable inclusion from linux-4.19.202 commit a66fdcda469a0e103fe105dc0c95536fa28dc733
--------------------------------
[ Upstream commit a17ad0961706244dce48ec941f7e476a38c0e727 ]
In some cases the skb head can be locked while the entire header data has been pulled from the skb. When skb_zerocopy() is called in such cases, the following BUG is triggered. This patch fixes it by copying the entire skb in such cases. This could be optimized later if it turns out to be a performance bottleneck.
---8<---
kernel BUG at net/core/skbuff.c:2961!
invalid opcode: 0000 [#1] SMP PTI
CPU: 2 PID: 0 Comm: swapper/2 Tainted: G OE 5.4.0-77-generic #86-Ubuntu
Hardware name: OpenStack Foundation OpenStack Nova, BIOS 1.13.0-1ubuntu1.1 04/01/2014
RIP: 0010:skb_zerocopy+0x37a/0x3a0
RSP: 0018:ffffbcc70013ca38 EFLAGS: 00010246
Call Trace:
 <IRQ>
 queue_userspace_packet+0x2af/0x5e0 [openvswitch]
 ovs_dp_upcall+0x3d/0x60 [openvswitch]
 ovs_dp_process_packet+0x125/0x150 [openvswitch]
 ovs_vport_receive+0x77/0xd0 [openvswitch]
 netdev_port_receive+0x87/0x130 [openvswitch]
 netdev_frame_hook+0x4b/0x60 [openvswitch]
 __netif_receive_skb_core+0x2b4/0xc90
 __netif_receive_skb_one_core+0x3f/0xa0
 __netif_receive_skb+0x18/0x60
 process_backlog+0xa9/0x160
 net_rx_action+0x142/0x390
 __do_softirq+0xe1/0x2d6
 irq_exit+0xae/0xb0
 do_IRQ+0x5a/0xf0
 common_interrupt+0xf/0xf
Code that triggered BUG:

int skb_zerocopy(struct sk_buff *to, struct sk_buff *from, int len, int hlen)
{
	int i, j = 0;
	int plen = 0;			/* length of skb->head fragment */
	int ret;
	struct page *page;
	unsigned int offset;

	BUG_ON(!from->head_frag && !hlen);
Signed-off-by: Pravin B Shelar pshelar@ovn.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/skbuff.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ea9684bcc2e8d..e1daab49b0eb0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2705,8 +2705,11 @@ skb_zerocopy_headlen(const struct sk_buff *from)
if (!from->head_frag || skb_headlen(from) < L1_CACHE_BYTES || - skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) + skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS) { hlen = skb_headlen(from); + if (!hlen) + hlen = from->len; + }
if (skb_has_frag_list(from)) hlen = from->len;
From: Yu Kuai yukuai3@huawei.com
stable inclusion from linux-4.19.203 commit 76ab02d9b861da0785176f0228340f22023902fa
--------------------------------
[ Upstream commit 8d75d0eff6887bcac7225e12b9c75595e523d92d ]
If the queue is dying while iolatency_set_limit() is in progress, blk_get_queue() won't increment the refcount of the queue. However, blk_put_queue() will still decrement the refcount later, which will leave the refcount unbalanced.
Thus, error out in such a case to fix the problem.
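Annotated, the resulting pattern in the hunk below is: take the reference conditionally, and only proceed when it was actually obtained, so the later put always has a matching get:

	if (!blk_get_queue(blkg->q)) {	/* dying queue: no reference was taken */
		ret = -ENODEV;		/* bail out instead of WARNing */
		goto out;
	}
	blkg_get(blkg);			/* both references now balance the later puts */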
Fixes: 8c772a9bfc7c ("blk-iolatency: fix IO hang due to negative inflight counter") Signed-off-by: Yu Kuai yukuai3@huawei.com Acked-by: Tejun Heo tj@kernel.org Link: https://lore.kernel.org/r/20210805124645.543797-1-yukuai3@huawei.com Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- block/blk-iolatency.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index ae6ae7f50c6f7..1baa3c49e2e3b 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -800,7 +800,11 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
enable = iolatency_set_min_lat_nsec(blkg, lat_val); if (enable) { - WARN_ON_ONCE(!blk_get_queue(blkg->q)); + if (!blk_get_queue(blkg->q)) { + ret = -ENODEV; + goto out; + } + blkg_get(blkg); }
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.203 commit 43cba13ff1e793c0e1e1e317c951dea63710290e
--------------------------------
commit 2c05caa7ba8803209769b9e4fe02c38d77ae88d0 upstream.
When working on my user space applications, I found a bug in the synthetic event code where the automated synthetic event field was not matching the event field calculation it was attached to. Looking deeper into it, it was because the calculation hist_field was not given a size.
The synthetic event fields are matched to their hist_fields either by having the field have an identical string type, or if that does not match, then the size and signed values are used to match the fields.
The problem arose when I tried to match a calculation where the fields were "unsigned int". My tool created a synthetic event of type "u32". But it failed to match. The string was:
diff=field1-field2:onmatch(event).trace(synth,$diff)
Adding debugging into the kernel, I found that the size of "diff" was 0. And since it was given "unsigned int" as a type, the histogram fallback code used size and signed. The signed matched, but the size of u32 (4) did not match zero, and the event failed to be created.
This can be worse if the field you want to match is not one of the acceptable fields for a synthetic event. As event fields can have any type supported in Linux, this can cause an issue: for example, if the type is an enum, there's no way to use that field in any calculation.
Have the calculation field simply take on the size of what it is calculating.
Link: https://lkml.kernel.org/r/20210730171951.59c7743f@oasis.local.home
Cc: Tom Zanussi zanussi@kernel.org Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Namhyung Kim namhyung@kernel.org Cc: Ingo Molnar mingo@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: stable@vger.kernel.org Fixes: 100719dcef447 ("tracing: Add simple expression support to hist triggers") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_events_hist.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 5ac561c9da57c..ce10a9cb47e12 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2796,6 +2796,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data,
expr->operands[0] = operand1; expr->operands[1] = operand2; + + /* The operand sizes should be the same, so just pick one */ + expr->size = operand1->size; + expr->operator = field_op; expr->name = expr_str(expr, 0); expr->type = kstrdup(operand1->type, GFP_KERNEL);
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.203 commit c0add455ae992b53b0d52cd4d8682528b4014c42
--------------------------------
commit 1e3bac71c5053c99d438771fc9fa5082ae5d90aa upstream.
Currently the histogram logic allows the user to write "cpu" in as an event field, and it will record the CPU that the event happened on.
The problem with this is that there's a lot of events that have "cpu" as a real field, and using "cpu" as the CPU it ran on, makes it impossible to run histograms on the "cpu" field of events.
For example, if I want to have a histogram on the count of the workqueue_queue_work event on its cpu field, running:
# echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger
Gives a misleading and wrong result.
Change the keyword to "common_cpu", since no event should have "common_*" fields: that prefix is reserved for fields used by all events. It also makes sense here, as common_cpu would be a field used by all events.
Now we can even do:
# echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger
# cat events/workqueue/workqueue_queue_work/hist

# event histogram
#
# trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active]
#

{ common_cpu: 0, cpu: 2 } hitcount: 1
{ common_cpu: 0, cpu: 4 } hitcount: 1
{ common_cpu: 7, cpu: 7 } hitcount: 1
{ common_cpu: 0, cpu: 7 } hitcount: 1
{ common_cpu: 0, cpu: 1 } hitcount: 1
{ common_cpu: 0, cpu: 6 } hitcount: 2
{ common_cpu: 0, cpu: 5 } hitcount: 2
{ common_cpu: 1, cpu: 1 } hitcount: 4
{ common_cpu: 6, cpu: 6 } hitcount: 4
{ common_cpu: 5, cpu: 5 } hitcount: 14
{ common_cpu: 4, cpu: 4 } hitcount: 26
{ common_cpu: 0, cpu: 0 } hitcount: 39
{ common_cpu: 2, cpu: 2 } hitcount: 184
Now for backward compatibility, I added a trick. If "cpu" is used, and the field is not found, it will fall back to "common_cpu" and work as it did before. This way, it will still work for old programs that use "cpu" to get the actual CPU, but if the event has a "cpu" as a field, it will get that event's "cpu" field, which is probably what it wants anyway.
I updated the tracefs/README to include documentation about both the common_timestamp and the common_cpu. This way, if that text is present in the README, then an application can know that common_cpu is supported over just plain "cpu".
Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home
Cc: Namhyung Kim namhyung@kernel.org Cc: Ingo Molnar mingo@kernel.org Cc: Andrew Morton akpm@linux-foundation.org Cc: stable@vger.kernel.org Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers") Reviewed-by: Tom Zanussi zanussi@kernel.org Reviewed-by: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/trace/histogram.rst | 2 +- kernel/trace/trace.c | 4 ++++ kernel/trace/trace_events_hist.c | 21 +++++++++++++++------ 3 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/Documentation/trace/histogram.rst b/Documentation/trace/histogram.rst index 5ac724baea7d9..c14dab13a47e8 100644 --- a/Documentation/trace/histogram.rst +++ b/Documentation/trace/histogram.rst @@ -191,7 +191,7 @@ Documentation written by Tom Zanussi with the event, in nanoseconds. May be modified by .usecs to have timestamps interpreted as microseconds. - cpu int the cpu on which the event occurred. + common_cpu int the cpu on which the event occurred. ====================== ==== =======================================
Extended error information diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 475577163c0e9..8f20a658dabb4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4687,6 +4687,10 @@ static const char readme_msg[] = "\t [:pause][:continue][:clear]\n" "\t [:name=histname1]\n" "\t [if <filter>]\n\n" + "\t Note, special fields can be used as well:\n" + "\t common_timestamp - to record current timestamp\n" + "\t common_cpu - to record the CPU the event happened on\n" + "\n" "\t When a matching event is hit, an entry is added to a hash\n" "\t table using the key(s) and value(s) named, and the value of a\n" "\t sum called 'hitcount' is incremented. Keys and values\n" diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index ce10a9cb47e12..a963287d29b08 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1718,7 +1718,7 @@ static const char *hist_field_name(struct hist_field *field, field->flags & HIST_FIELD_FL_ALIAS) field_name = hist_field_name(field->operands[0], ++level); else if (field->flags & HIST_FIELD_FL_CPU) - field_name = "cpu"; + field_name = "common_cpu"; else if (field->flags & HIST_FIELD_FL_EXPR || field->flags & HIST_FIELD_FL_VAR_REF) { if (field->system) { @@ -2510,14 +2510,23 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file, hist_data->enable_timestamps = true; if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) hist_data->attrs->ts_in_usecs = true; - } else if (strcmp(field_name, "cpu") == 0) + } else if (strcmp(field_name, "common_cpu") == 0) *flags |= HIST_FIELD_FL_CPU; else { field = trace_find_event_field(file->event_call, field_name); if (!field || !field->size) { - hist_err("Couldn't find field: ", field_name); - field = ERR_PTR(-EINVAL); - goto out; + /* + * For backward compatibility, if field_name + * was "cpu", then we treat this the same as + * common_cpu. + */ + if (strcmp(field_name, "cpu") == 0) { + *flags |= HIST_FIELD_FL_CPU; + } else { + hist_err("Couldn't find field: ", field_name); + field = ERR_PTR(-EINVAL); + goto out; + } } } out: @@ -4944,7 +4953,7 @@ static void hist_field_print(struct seq_file *m, struct hist_field *hist_field) seq_printf(m, "%s=", hist_field->var.name);
if (hist_field->flags & HIST_FIELD_FL_CPU) - seq_puts(m, "cpu"); + seq_puts(m, "common_cpu"); else if (field_name) { if (hist_field->flags & HIST_FIELD_FL_VAR_REF || hist_field->flags & HIST_FIELD_FL_ALIAS)
From: "Alex Xu (Hello71)" alex_y_xu@yahoo.ca
stable inclusion from linux-4.19.203 commit 76ccb26c5312760113b2b3ef6de307474e8d4b45
--------------------------------
commit 46c4c9d1beb7f5b4cec4dd90e7728720583ee348 upstream.
This program always prints 4096 and hangs before the patch, and always prints 8192 and exits successfully after:
int main()
{
	int pipefd[2];

	for (int i = 0; i < 1025; i++)
		if (pipe(pipefd) == -1)
			return 1;

	size_t bufsz = fcntl(pipefd[1], F_GETPIPE_SZ);
	printf("%zd\n", bufsz);
	char *buf = calloc(bufsz, 1);
	write(pipefd[1], buf, bufsz);

	read(pipefd[0], buf, bufsz-1);
	write(pipefd[1], buf, 1);
}
Note that you may need to increase your RLIMIT_NOFILE before running the program.
Fixes: 759c01142a ("pipe: limit the per-user amount of pages allocated in pipes") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/ Link: https://lore.kernel.org/lkml/1628127094.lxxn016tj7.none@localhost/ Signed-off-by: Alex Xu (Hello71) alex_y_xu@yahoo.ca Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/pipe.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/fs/pipe.c b/fs/pipe.c index d786511aa23b2..fcc9d07fda59d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -29,6 +29,21 @@
#include "internal.h"
+/* + * New pipe buffers will be restricted to this size while the user is exceeding + * their pipe buffer quota. The general pipe use case needs at least two + * buffers: one for data yet to be read, and one for new data. If this is less + * than two, then a write to a non-empty pipe may block even if the pipe is not + * full. This can occur with GNU make jobserver or similar uses of pipes as + * semaphores: multiple processes may be waiting to write tokens back to the + * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. + * + * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their + * own risk, namely: pipe writes to non-full pipes may block until the pipe is + * emptied. + */ +#define PIPE_MIN_DEF_BUFFERS 2 + /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size @@ -653,8 +668,8 @@ struct pipe_inode_info *alloc_pipe_info(void) user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
if (too_many_pipe_buffers_soft(user_bufs) && is_unprivileged_user()) { - user_bufs = account_pipe_buffers(user, pipe_bufs, 1); - pipe_bufs = 1; + user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS); + pipe_bufs = PIPE_MIN_DEF_BUFFERS; }
if (too_many_pipe_buffers_hard(user_bufs) && is_unprivileged_user())
From: Theodore Ts'o tytso@mit.edu
stable inclusion from linux-4.19.203 commit bc1954aa8a7e195ebd686a77e81c11863ce8edbf
--------------------------------
commit 877ba3f729fd3d8ef0e29bc2a55e57cfa54b2e43 upstream.
Commit b5776e7524af ("ext4: fix potential htree index checksum corruption") removed a required restart when multiple levels of index nodes need to be split. Fix this to avoid directory htree corruptions when using the large_dir feature.
Cc: stable@kernel.org # v5.11 Cc: Благодаренко Артём artem.blagodarenko@gmail.com Fixes: b5776e7524af ("ext4: fix potential htree index checksum corruption") Reported-by: Denis denis@voxelsoft.com Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index cf196515444bd..58dd74b804805 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2290,7 +2290,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, goto journal_error; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (restart || err) goto journal_error; } else { struct dx_root *dxroot;
From: "Maciej W. Rozycki" macro@orcam.me.uk
stable inclusion from linux-4.19.203 commit 2c39c32f92084736bc871c1ef096602eb1cc7b5b
--------------------------------
commit e5227c51090e165db4b48dcaa300605bfced7014 upstream.
Make sure only actual 8 bits of the IIR register are used in determining the port type in `autoconfig'.
The `serial_in' port accessor returns the `unsigned int' type, meaning that with UPIO_AU, UPIO_MEM16, UPIO_MEM32, and UPIO_MEM32BE access types more than 8 bits of data are returned, of which the high order bits will often come from bus lines that are left floating in the data phase. For example with the MIPS Malta board's CBUS UART, where the registers are aligned on 8-byte boundaries and which uses 32-bit accesses, data as follows is returned:
YAMON> dump -32 0xbf000900 0x40
BF000900: 1F000942 1F000942 1F000900 1F000900 ...B...B........
BF000910: 1F000901 1F000901 1F000900 1F000900 ................
BF000920: 1F000900 1F000900 1F000960 1F000960 ...........`...`
BF000930: 1F000900 1F000900 1F0009FF 1F0009FF ................
YAMON>
Evidently the high-order 24 bits return values previously driven in the address phase (the 3 highest-order address bits used with the command above are masked out in the simple virtual address mapping used here and come out as zeros on the external bus), a common scenario with bus lines left floating, due to bus capacitance.
Consequently when the value of IIR, mapped at 0x1f000910, is retrieved in `autoconfig', it comes out at 0x1f0009c1 and when it is right-shifted by 6 and then assigned to 8-bit `scratch' variable, the value calculated is 0x27, not one of 0, 1, 2, 3 expected in port type determination.
Fix the issue then, by assigning the value returned from `serial_in' to `scratch' first, which masks out 24 high-order bits retrieved, and only then right-shift the resulting 8-bit data quantity, producing the value of 3 in this case, as expected. Fix the same issue in `serial_dl_read'.
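Worked through with the values above, as a standalone userspace illustration:

#include <stdio.h>

int main(void)
{
	unsigned int iir = 0x1f0009c1;	/* 32-bit read; high bits are bus noise */
	unsigned char scratch;

	scratch = iir >> 6;	/* buggy order: (0x1f0009c1 >> 6) & 0xff == 0x27 */
	printf("shift first:    %#x\n", scratch);

	scratch = iir;		/* fixed order: truncate to 8 bits first */
	printf("truncate first: %#x\n", scratch >> 6);	/* 0xc1 >> 6 == 0x3 */
	return 0;
}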
The problem first appeared with Linux 2.6.9-rc3 which predates our repo history, but the origin could be identified with the old MIPS/Linux repo also at: git://git.kernel.org/pub/scm/linux/kernel/git/ralf/linux.git as commit e0d2356c0777 ("Merge with Linux 2.6.9-rc3."), where code in `serial_in' was updated with this case:
+	case UPIO_MEM32:
+		return readl(up->port.membase + offset);
+
which made it produce results outside the unsigned 8-bit range for the first time, though obviously it is system dependent what actual values appear in the high order bits retrieved and it may well have been zeros in the relevant positions with the system the change originally was intended for. It is at that point that code in `autoconfig' should have been updated accordingly, but clearly it was overlooked.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org # v2.6.12+ Reviewed-by: Philippe Mathieu-Daudé f4bug@amsat.org Signed-off-by: Maciej W. Rozycki macro@orcam.me.uk Link: https://lore.kernel.org/r/alpine.DEB.2.21.2106260516220.37803@angie.orcam.me... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/8250/8250_port.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 65bc08a97044a..1867c2546cd97 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -313,7 +313,11 @@ static const struct serial8250_config uart_config[] = { /* Uart divisor latch read */ static int default_serial_dl_read(struct uart_8250_port *up) { - return serial_in(up, UART_DLL) | serial_in(up, UART_DLM) << 8; + /* Assign these in pieces to truncate any bits above 7. */ + unsigned char dll = serial_in(up, UART_DLL); + unsigned char dlm = serial_in(up, UART_DLM); + + return dll | dlm << 8; }
/* Uart divisor latch write */ @@ -1301,9 +1305,11 @@ static void autoconfig(struct uart_8250_port *up) serial_out(up, UART_LCR, 0);
serial_out(up, UART_FCR, UART_FCR_ENABLE_FIFO); - scratch = serial_in(up, UART_IIR) >> 6;
- switch (scratch) { + /* Assign this as it is to truncate any bits above 7. */ + scratch = serial_in(up, UART_IIR); + + switch (scratch >> 6) { case 0: autoconfig_8250(up); break;
From: Yu Kuai yukuai3@huawei.com
stable inclusion from linux-4.19.203 commit df2f583b63637f9f882ba604cf23e0336de82220
--------------------------------
[ Upstream commit 2acf15b94d5b8ea8392c4b6753a6ffac3135cd78 ]
Our syzkaller reported a NULL pointer dereference:
BUG: kernel NULL pointer dereference, address: 0000000000000000
PGD 116e95067 P4D 116e95067 PUD 1080b5067 PMD 0
Oops: 0010 [#1] SMP KASAN
CPU: 7 PID: 592 Comm: a.out Not tainted 5.13.0-next-20210629-dirty #67
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-p4
RIP: 0010:0x0
Code: Unable to access opcode bytes at RIP 0xffffffffffffffd6.
RSP: 0018:ffff888114e779b8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: 1ffff110229cef39 RCX: ffffffffaa67e1aa
RDX: 0000000000000000 RSI: ffff88810a58ee00 RDI: ffff8881233180b0
RBP: ffffffffac38e9c0 R08: ffffffffaa67e17e R09: 0000000000000001
R10: ffffffffb91c5557 R11: fffffbfff7238aaa R12: ffff88810a58ee00
R13: ffff888114e77aa0 R14: 0000000000000000 R15: ffff8881233180b0
FS:  00007f946163c480(0000) GS:ffff88839f1c0000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: ffffffffffffffd6 CR3: 00000001099c1000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
 __lookup_slow+0x116/0x2d0
 ? page_put_link+0x120/0x120
 ? __d_lookup+0xfc/0x320
 ? d_lookup+0x49/0x90
 lookup_one_len+0x13c/0x170
 ? __lookup_slow+0x2d0/0x2d0
 ? reiserfs_schedule_old_flush+0x31/0x130
 reiserfs_lookup_privroot+0x64/0x150
 reiserfs_fill_super+0x158c/0x1b90
 ? finish_unfinished+0xb10/0xb10
 ? bprintf+0xe0/0xe0
 ? __mutex_lock_slowpath+0x30/0x30
 ? __kasan_check_write+0x20/0x30
 ? up_write+0x51/0xb0
 ? set_blocksize+0x9f/0x1f0
 mount_bdev+0x27c/0x2d0
 ? finish_unfinished+0xb10/0xb10
 ? reiserfs_kill_sb+0x120/0x120
 get_super_block+0x19/0x30
 legacy_get_tree+0x76/0xf0
 vfs_get_tree+0x49/0x160
 ? capable+0x1d/0x30
 path_mount+0xacc/0x1380
 ? putname+0x97/0xd0
 ? finish_automount+0x450/0x450
 ? kmem_cache_free+0xf8/0x5a0
 ? putname+0x97/0xd0
 do_mount+0xe2/0x110
 ? path_mount+0x1380/0x1380
 ? copy_mount_options+0x69/0x140
 __x64_sys_mount+0xf0/0x190
 do_syscall_64+0x35/0x80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
This is because 'root_inode' is initialized with a wrong mode, and its i_op is set to 'reiserfs_special_inode_operations'. Thus, add a check for 'root_inode' to fix the problem.
Link: https://lore.kernel.org/r/20210702040743.1918552-1-yukuai3@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Signed-off-by: Jan Kara jack@suse.cz Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/reiserfs/super.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index a3507490be6e8..499126c9b0d66 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -2084,6 +2084,14 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) unlock_new_inode(root_inode); }
+ if (!S_ISDIR(root_inode->i_mode) || !inode_get_bytes(root_inode) || + !root_inode->i_size) { + SWARN(silent, s, "", "corrupt root inode, run fsck"); + iput(root_inode); + errval = -EUCLEAN; + goto error; + } + s->s_root = d_make_root(root_inode); if (!s->s_root) goto error;
From: Masami Hiramatsu mhiramat@kernel.org
stable inclusion from linux-4.19.204 commit 7c165d58effc19fdf68196d4ceebf940d5da777d
--------------------------------
commit a9d10ca4986571bffc19778742d508cc8dd13e02 upstream.
Since a string type cannot be the operand of an addition/subtraction operation, it must be rejected. Without this fix, a string operand was silently converted to digits.
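A hypothetical illustration (the event and field names are examples only; prev_comm/next_comm are string fields): defining a hist variable from an arithmetic expression over strings should now fail with EINVAL instead of silently operating on the strings as numbers:

# echo 'hist:keys=next_pid:x=prev_comm-next_comm' >> events/sched/sched_switch/trigger
(now rejected with "Invalid argument")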
Link: https://lkml.kernel.org/r/162742654278.290973.1523000673366456634.stgit@devn...
Cc: stable@vger.kernel.org Fixes: 100719dcef447 ("tracing: Add simple expression support to hist triggers") Signed-off-by: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_events_hist.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index a963287d29b08..76813bc91e1e0 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -2675,6 +2675,12 @@ static struct hist_field *parse_unary(struct hist_trigger_data *hist_data, ret = PTR_ERR(operand1); goto free; } + if (operand1->flags & HIST_FIELD_FL_STRING) { + /* String type can not be the operand of unary operator. */ + destroy_hist_field(operand1, 0); + ret = -EINVAL; + goto free; + }
expr->flags |= operand1->flags & (HIST_FIELD_FL_TIMESTAMP | HIST_FIELD_FL_TIMESTAMP_USECS); @@ -2775,6 +2781,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand1 = NULL; goto free; } + if (operand1->flags & HIST_FIELD_FL_STRING) { + ret = -EINVAL; + goto free; + }
/* rest of string could be another expression e.g. b+c in a+b+c */ operand_flags = 0; @@ -2784,6 +2794,10 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand2 = NULL; goto free; } + if (operand2->flags & HIST_FIELD_FL_STRING) { + ret = -EINVAL; + goto free; + }
ret = check_expr_operands(operand1, operand2); if (ret)
From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.204 commit 963d85d630dabe75a3cfde44a006fec3304d07b8
--------------------------------
commit 427215d85e8d1476da1a86b8d67aceb485eb3631 upstream.
Add the following checks from __do_loopback() to clone_private_mount() as well:
- verify that the mount is in the current namespace
- verify that there are no locked children
Reported-by: Alois Wohlschlager alois1@gmx-topmail.de Fixes: c771d683a62e ("vfs: introduce clone_private_mount()") Cc: stable@vger.kernel.org # v3.18 Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/namespace.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-)
diff --git a/fs/namespace.c b/fs/namespace.c index cd14125207ba0..f47d4850bd93d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1865,6 +1865,20 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); }
+static bool has_locked_children(struct mount *mnt, struct dentry *dentry) +{ + struct mount *child; + + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + if (!is_subdir(child->mnt_mountpoint, dentry)) + continue; + + if (child->mnt.mnt_flags & MNT_LOCKED) + return true; + } + return false; +} + /** * clone_private_mount - create a private clone of a path * @@ -1879,14 +1893,27 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt;
+ down_read(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) - return ERR_PTR(-EINVAL); + goto invalid; + + if (!check_mnt(old_mnt)) + goto invalid; + + if (has_locked_children(old_mnt, path->dentry)) + goto invalid;
new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt);
return &new_mnt->mnt; + +invalid: + up_read(&namespace_sem); + return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount);
@@ -2202,19 +2229,6 @@ static int do_change_type(struct path *path, int ms_flags) return err; }
-static bool has_locked_children(struct mount *mnt, struct dentry *dentry) -{ - struct mount *child; - list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { - if (!is_subdir(child->mnt_mountpoint, dentry)) - continue; - - if (child->mnt.mnt_flags & MNT_LOCKED) - return true; - } - return false; -} - /* * do loopback mount. */
From: Dan Williams dan.j.williams@intel.com
stable inclusion from linux-4.19.205 commit c39e22fd3f7ce3af64140f560ea63b0c986a46db
--------------------------------
commit b93dfa6bda4d4e88e5386490f2b277a26958f9d3 upstream.
Fix the NFIT parsing code to treat a 0 index in a SPA Range Structure as a special case and not match Region Mapping Structures that use 0 to indicate that they are not mapped. Without this fix some platform BIOS descriptions of "virtual disk" ranges do not result in the pmem driver attaching to the range.
Details: In addition to typical persistent memory ranges, the ACPI NFIT may also convey "virtual" ranges. These ranges are indicated by a UUID in the SPA Range Structure of UUID_VOLATILE_VIRTUAL_DISK, UUID_VOLATILE_VIRTUAL_CD, UUID_PERSISTENT_VIRTUAL_DISK, or UUID_PERSISTENT_VIRTUAL_CD. The critical difference between virtual ranges and UUID_PERSISTENT_MEMORY is that virtual ranges do not support associations with Region Mapping Structures. For this reason the "index" value of virtual SPA Range Structures is allowed to be 0. If a platform BIOS decides to represent NVDIMMs with disconnected "Region Mapping Structures" (range-index == 0), the kernel may falsely associate them with standalone ranges where the "SPA Range Structure Index" is also zero. When this happens the driver may falsely require labels where "virtual disks" are expected to be label-less. I.e. "label-less" is where namespace-range == region-range and the pmem driver attaches with no user action to create a namespace.
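To make the false association concrete, here is a hedged user-space model of the matching test (the struct and helper names are invented for illustration; the real check lives in acpi_nfit_register_region() in the diff below):

    #include <stdio.h>

    struct memdev { int range_index; };  /* models the Region Mapping Structure */
    struct spa    { int range_index; };  /* models the SPA Range Structure */

    static int matches_old(const struct memdev *m, const struct spa *s)
    {
            return m->range_index == s->range_index;  /* 0 == 0 falsely matches */
    }

    static int matches_new(const struct memdev *m, const struct spa *s)
    {
            /* range index 0 == unmapped in SPA or invalid-SPA */
            if (m->range_index == 0 || s->range_index == 0)
                    return 0;
            return m->range_index == s->range_index;
    }

    int main(void)
    {
            struct memdev m = { .range_index = 0 };  /* disconnected mapping */
            struct spa    s = { .range_index = 0 };  /* virtual disk range */

            printf("old: %d, new: %d\n", matches_old(&m, &s), matches_new(&m, &s));
            return 0;  /* prints "old: 1, new: 0" */
    }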
Cc: Jacek Zloch jacek.zloch@intel.com Cc: Lukasz Sobieraj lukasz.sobieraj@intel.com Cc: "Lee, Chun-Yi" jlee@suse.com Cc: stable@vger.kernel.org Fixes: c2f32acdf848 ("acpi, nfit: treat virtual ramdisk SPA as pmem region") Reported-by: Krzysztof Rusocki krzysztof.rusocki@intel.com Reported-by: Damian Bassa damian.bassa@intel.com Reviewed-by: Jeff Moyer jmoyer@redhat.com Link: https://lore.kernel.org/r/162870796589.2521182.1240403310175570220.stgit@dwi... Signed-off-by: Dan Williams dan.j.williams@intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/acpi/nfit/core.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index af280ae0cc7c8..8784730911476 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -2828,6 +2828,9 @@ static int acpi_nfit_register_region(struct acpi_nfit_desc *acpi_desc, struct acpi_nfit_memory_map *memdev = nfit_memdev->memdev; struct nd_mapping_desc *mapping;
+ /* range index 0 == unmapped in SPA or invalid-SPA */ + if (memdev->range_index == 0 || spa->range_index == 0) + continue; if (memdev->range_index != spa->range_index) continue; if (count >= ND_MAX_MAPPINGS) {
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.205 commit fb5db3106036f4e21a63c0c6b08db4b4f18f157c
--------------------------------
[ Upstream commit 4a2b285e7e103d4d6c6ed3e5052a0ff74a5d7f15 ]
Fix the data race reported by syzbot [1]. The issue here is that igmp_ifc_timer_expire() can update in_dev->mr_ifc_count while another change has just occurred from another context.
in_dev->mr_ifc_count is only 8 bits wide, so the race had little consequence.
[1] BUG: KCSAN: data-race in igmp_ifc_event / igmp_ifc_timer_expire
write to 0xffff8881051e3062 of 1 bytes by task 12547 on cpu 0:
 igmp_ifc_event+0x1d5/0x290 net/ipv4/igmp.c:821
 igmp_group_added+0x462/0x490 net/ipv4/igmp.c:1356
 ____ip_mc_inc_group+0x3ff/0x500 net/ipv4/igmp.c:1461
 __ip_mc_join_group+0x24d/0x2c0 net/ipv4/igmp.c:2199
 ip_mc_join_group_ssm+0x20/0x30 net/ipv4/igmp.c:2218
 do_ip_setsockopt net/ipv4/ip_sockglue.c:1285 [inline]
 ip_setsockopt+0x1827/0x2a80 net/ipv4/ip_sockglue.c:1423
 tcp_setsockopt+0x8c/0xa0 net/ipv4/tcp.c:3657
 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3362
 __sys_setsockopt+0x18f/0x200 net/socket.c:2159
 __do_sys_setsockopt net/socket.c:2170 [inline]
 __se_sys_setsockopt net/socket.c:2167 [inline]
 __x64_sys_setsockopt+0x62/0x70 net/socket.c:2167
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff8881051e3062 of 1 bytes by interrupt on cpu 1:
 igmp_ifc_timer_expire+0x706/0xa30 net/ipv4/igmp.c:808
 call_timer_fn+0x2e/0x1d0 kernel/time/timer.c:1419
 expire_timers+0x135/0x250 kernel/time/timer.c:1464
 __run_timers+0x358/0x420 kernel/time/timer.c:1732
 run_timer_softirq+0x19/0x30 kernel/time/timer.c:1745
 __do_softirq+0x12c/0x26e kernel/softirq.c:558
 invoke_softirq kernel/softirq.c:432 [inline]
 __irq_exit_rcu+0x9a/0xb0 kernel/softirq.c:636
 sysvec_apic_timer_interrupt+0x69/0x80 arch/x86/kernel/apic/apic.c:1100
 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638
 console_unlock+0x8e8/0xb30 kernel/printk/printk.c:2646
 vprintk_emit+0x125/0x3d0 kernel/printk/printk.c:2174
 vprintk_default+0x22/0x30 kernel/printk/printk.c:2185
 vprintk+0x15a/0x170 kernel/printk/printk_safe.c:392
 printk+0x62/0x87 kernel/printk/printk.c:2216
 selinux_netlink_send+0x399/0x400 security/selinux/hooks.c:6041
 security_netlink_send+0x42/0x90 security/security.c:2070
 netlink_sendmsg+0x59e/0x7c0 net/netlink/af_netlink.c:1919
 sock_sendmsg_nosec net/socket.c:703 [inline]
 sock_sendmsg net/socket.c:723 [inline]
 ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392
 ___sys_sendmsg net/socket.c:2446 [inline]
 __sys_sendmsg+0x1ed/0x270 net/socket.c:2475
 __do_sys_sendmsg net/socket.c:2484 [inline]
 __se_sys_sendmsg net/socket.c:2482 [inline]
 __x64_sys_sendmsg+0x42/0x50 net/socket.c:2482
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x01 -> 0x02
Reported by Kernel Concurrency Sanitizer on:
CPU: 1 PID: 12539 Comm: syz-executor.1 Not tainted 5.14.0-rc4-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
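As a hedged sketch of the pattern the fix uses, the user-space model below shows the lost-update race on a shared 8-bit counter and the cmpxchg-style retry loop (GCC builtins stand in for the kernel's READ_ONCE()/cmpxchg(); the names are illustrative only):

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t ifc_count;  /* models in_dev->mr_ifc_count */

    /* Racy variant: load, test and store are separate operations, so a
     * concurrent igmp_ifc_event() store can be lost (the report above). */
    static void timer_expire_racy(void)
    {
            if (ifc_count)
                    ifc_count--;
    }

    /* Fixed variant: mirrors the restart/cmpxchg loop added to
     * igmp_ifc_timer_expire(); retry if the value changed under us. */
    static void timer_expire_fixed(void)
    {
            uint8_t old = __atomic_load_n(&ifc_count, __ATOMIC_RELAXED);

            while (old) {
                    if (__atomic_compare_exchange_n(&ifc_count, &old, old - 1,
                                                    0, __ATOMIC_RELAXED,
                                                    __ATOMIC_RELAXED))
                            break;
                    /* 'old' was reloaded by the failed compare-exchange */
            }
    }

    int main(void)
    {
            __atomic_store_n(&ifc_count, 2, __ATOMIC_RELAXED); /* igmp_ifc_event() */
            timer_expire_racy();   /* fine single-threaded, unsafe concurrently */
            timer_expire_fixed();
            printf("mr_ifc_count = %u\n", ifc_count);  /* prints 0 */
            return 0;
    }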
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/igmp.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 1a0953b2b1396..542f0c34c316b 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -807,10 +807,17 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); + u8 mr_ifc_count;
igmpv3_send_cr(in_dev); - if (in_dev->mr_ifc_count) { - in_dev->mr_ifc_count--; +restart: + mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); + + if (mr_ifc_count) { + if (cmpxchg(&in_dev->mr_ifc_count, + mr_ifc_count, + mr_ifc_count - 1) != mr_ifc_count) + goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } @@ -822,7 +829,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); igmp_ifc_start_timer(in_dev, 1); }
@@ -961,7 +968,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qri; } /* cancel the interface change timer */ - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ @@ -1739,7 +1746,7 @@ void ip_mc_down(struct in_device *in_dev) igmp_group_dropped(pmc);
#ifdef CONFIG_IP_MULTICAST - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; @@ -1956,7 +1963,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); @@ -2135,7 +2142,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* else no filters; keep old mode for reports */
pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev);
stable inclusion from linux-4.19.205 commit f41237f60cb0202827432706c33faba3adadbfb5
--------------------------------
[ Upstream commit 519133debcc19f5c834e7e28480b60bdc234fe02 ]
I got a memleak report:
BUG: memory leak
unreferenced object 0x607ee521a658 (size 240):
  comm "syz-executor.0", pid 955, jiffies 4294780569 (age 16.449s)
  hex dump (first 32 bytes, cpu 1):
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
    00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<00000000d830ea5a>] br_multicast_add_port+0x1c2/0x300 net/bridge/br_multicast.c:1693
    [<00000000274d9a71>] new_nbp net/bridge/br_if.c:435 [inline]
    [<00000000274d9a71>] br_add_if+0x670/0x1740 net/bridge/br_if.c:611
    [<0000000012ce888e>] do_set_master net/core/rtnetlink.c:2513 [inline]
    [<0000000012ce888e>] do_set_master+0x1aa/0x210 net/core/rtnetlink.c:2487
    [<0000000099d1cafc>] __rtnl_newlink+0x1095/0x13e0 net/core/rtnetlink.c:3457
    [<00000000a01facc0>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488
    [<00000000acc9186c>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5550
    [<00000000d4aabb9c>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504
    [<00000000bc2e12a3>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline]
    [<00000000bc2e12a3>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340
    [<00000000e4dc2d0e>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929
    [<000000000d22c8b3>] sock_sendmsg_nosec net/socket.c:654 [inline]
    [<000000000d22c8b3>] sock_sendmsg+0x139/0x170 net/socket.c:674
    [<00000000e281417a>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350
    [<00000000237aa2ab>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404
    [<000000004f2dc381>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433
    [<0000000005feca6c>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47
    [<000000007304477d>] entry_SYSCALL_64_after_hwframe+0x44/0xae
On the error paths of br_add_if(), the p->mcast_stats allocated in new_nbp() needs to be freed, or it will be leaked.
Fixes: 1080ab95e3c7 ("net: bridge: add support for IGMP/MLD stats and export them via netlink") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Acked-by: Nikolay Aleksandrov nikolay@nvidia.com Link: https://lore.kernel.org/r/20210809132023.978546-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/bridge/br_if.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 5aa508a08a691..b5fb2b682e191 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -604,6 +604,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev,
err = dev_set_allmulti(dev, 1); if (err) { + br_multicast_del_port(p); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } @@ -708,6 +709,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: + br_multicast_del_port(p); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1:
From: Neal Cardwell ncardwell@google.com
stable inclusion from linux-4.19.205 commit 32b6627fec712fb75fbed272517c74814c00ccfc
--------------------------------
[ Upstream commit 6de035fec045f8ae5ee5f3a02373a18b939e91fb ]
Currently, if BBR congestion control is initialized after more than 2B packets have been delivered, then depending on the phase of the tp->delivered counter, the tracking of BBR round trips can get stuck.
The bug arises because if tp->delivered is between 2^31 and 2^32 at the time the BBR congestion control module is initialized, then the initialization of bbr->next_rtt_delivered to 0 will cause the logic to believe that the end of the round trip is still billions of packets in the future. More specifically, the following check will fail repeatedly:
!before(rs->prior_delivered, bbr->next_rtt_delivered)
and thus the connection will take up to 2B packets delivered before that check will pass and the connection will set:
bbr->round_start = 1;
This could cause many mechanisms in BBR to fail to trigger, for example bbr_check_full_bw_reached() would likely never exit STARTUP.
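A hedged, runnable user-space sketch of the arithmetic behind the stuck check (before() is re-implemented here with the same wrap-safe 32-bit comparison the TCP stack uses; the values are illustrative):

    #include <stdint.h>
    #include <stdio.h>

    /* wrap-safe 32-bit "a is before b" comparison, as in the TCP stack */
    static int before(uint32_t a, uint32_t b)
    {
            return (int32_t)(a - b) < 0;
    }

    int main(void)
    {
            uint32_t delivered = 0x80000001u;  /* tp->delivered is past 2^31 */

            /* old init: next_rtt_delivered = 0 appears ~2B packets in the
             * future, so the round-end check keeps failing */
            printf("round ends (init 0):     %d\n", !before(delivered, 0u));

            /* fixed init: next_rtt_delivered = tp->delivered at bbr_init() */
            printf("round ends (init tp->d): %d\n", !before(delivered, delivered));
            return 0;  /* prints 0, then 1 */
    }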
This bug is 5 years old and has not been observed, and as a practical matter this would likely rarely trigger, since it would require transferring at least 2B packets, or likely more than 3 terabytes of data, before switching congestion control algorithms to BBR.
This patch is a stable candidate for kernels as far back as v4.9, when tcp_bbr.c was added.
Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell ncardwell@google.com Reviewed-by: Yuchung Cheng ycheng@google.com Reviewed-by: Kevin Yang yyd@google.com Reviewed-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20210811024056.235161-1-ncardwell@google.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_bbr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index b70c9365e1313..1740de0530726 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -985,7 +985,7 @@ static void bbr_init(struct sock *sk) bbr->prior_cwnd = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = 0; + bbr->next_rtt_delivered = tp->delivered; bbr->prev_ca_state = TCP_CA_Open; bbr->packet_conservation = 0;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit cab824f67d7e8f68288d615929dec02607e473ad
--------------------------------
commit 826da771291fc25a428e871f9e7fb465e390f852 upstream.
X86 IO/APIC and MSI interrupts (when used without interrupts remapping) require that the affinity setup on startup is done before the interrupt is enabled for the first time as the non-remapped operation mode cannot safely migrate enabled interrupts from arbitrary contexts. Provide a new irq chip flag which allows affected hardware to request this.
This has to be opt-in because there have been reports in the past that some interrupt chips cannot handle affinity setting before startup.
Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.779791738@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: include/linux/irq.h [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/irq.h | 2 ++ kernel/irq/chip.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/include/linux/irq.h b/include/linux/irq.h index ca367d98a991e..536f1abc9a8ce 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -554,6 +554,7 @@ struct irq_chip { * IRQCHIP_EOI_THREADED: Chip requires eoi() on unmask in threaded mode * IRQCHIP_SUPPORTS_LEVEL_MSI Chip can provide two doorbells for Level MSIs * IRQCHIP_SUPPORTS_NMI: Chip can deliver NMIs, only for root irqchips + * IRQCHIP_AFFINITY_PRE_STARTUP: Default affinity update before startup */ enum { IRQCHIP_SET_TYPE_MASKED = (1 << 0), @@ -565,6 +566,7 @@ enum { IRQCHIP_EOI_THREADED = (1 << 6), IRQCHIP_SUPPORTS_LEVEL_MSI = (1 << 7), IRQCHIP_SUPPORTS_NMI = (1 << 8), + IRQCHIP_AFFINITY_PRE_STARTUP = (1 << 10), };
#include <linux/irqdesc.h> diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index fc03e7271169d..63a9ad8307e6c 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -265,8 +265,11 @@ int irq_startup(struct irq_desc *desc, bool resend, bool force) } else { switch (__irq_startup_managed(desc, aff, force)) { case IRQ_STARTUP_NORMAL: + if (d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP) + irq_setup_affinity(desc); ret = __irq_startup(desc); - irq_setup_affinity(desc); + if (!(d->chip->flags & IRQCHIP_AFFINITY_PRE_STARTUP)) + irq_setup_affinity(desc); break; case IRQ_STARTUP_MANAGED: irq_do_set_affinity(d, aff, false);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 354b210062b1e50ef284f97590011c2231316eaa
--------------------------------
commit ff363f480e5997051dd1de949121ffda3b753741 upstream.
The X86 MSI mechanism cannot handle interrupt affinity changes safely after startup other than from an interrupt handler, unless interrupt remapping is enabled. The startup sequence in the generic interrupt code violates that assumption.
Mark the irq chips with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time.
While the interrupt remapping MSI chip does not require this, there is no point in treating it differently as this might spare an interrupt to a CPU which is not in the default affinity mask.
For the non-remapping case go to the direct write path when the interrupt is not yet started similar to the not yet activated case.
Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.886722080@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/kernel/apic/msi.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index 1f5df339e48ff..d72a03dba1aec 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -89,11 +89,13 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force) * The quirk bit is not set in this case. * - The new vector is the same as the old vector * - The old vector is MANAGED_IRQ_SHUTDOWN_VECTOR (interrupt starts up) + * - The interrupt is not yet started up * - The new destination CPU is the same as the old destination CPU */ if (!irqd_msi_nomask_quirk(irqd) || cfg->vector == old_cfg.vector || old_cfg.vector == MANAGED_IRQ_SHUTDOWN_VECTOR || + !irqd_is_started(irqd) || cfg->dest_apicid == old_cfg.dest_apicid) { irq_msi_update_msg(irqd, cfg); return ret; @@ -181,7 +183,8 @@ static struct irq_chip pci_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_set_affinity = msi_set_affinity, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, };
int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) @@ -281,7 +284,8 @@ static struct irq_chip pci_msi_ir_controller = { .irq_ack = irq_chip_ack_parent, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_set_vcpu_affinity = irq_chip_set_vcpu_affinity_parent, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, };
static struct msi_domain_info pci_msi_ir_domain_info = { @@ -323,7 +327,8 @@ static struct irq_chip dmar_msi_controller = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = dmar_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, };
static irq_hw_number_t dmar_msi_get_hwirq(struct msi_domain_info *info, @@ -420,7 +425,7 @@ static struct irq_chip hpet_msi_controller __ro_after_init = { .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_compose_msi_msg = irq_msi_compose_msg, .irq_write_msi_msg = hpet_msi_write_msg, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_AFFINITY_PRE_STARTUP, };
static irq_hw_number_t hpet_msi_get_hwirq(struct msi_domain_info *info,
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 697658a61db4f3aa213d76336ccf30e66e6c44ca
--------------------------------
commit 0c0e37dc11671384e53ba6ede53a4d91162a2cc5 upstream.
The IO/APIC cannot handle interrupt affinity changes safely after startup other than from an interrupt handler. The startup sequence in the generic interrupt code violates that assumption.
Mark the irq chip with the new IRQCHIP_AFFINITY_PRE_STARTUP flag so that the default interrupt setting happens before the interrupt is started up for the first time.
Fixes: 18404756765c ("genirq: Expose default irq affinity mask (take 3)") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.832143400@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/kernel/apic/io_apic.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 0784aeb0356bc..a6397da20b289 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1949,7 +1949,8 @@ static struct irq_chip ioapic_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, };
static struct irq_chip ioapic_ir_chip __read_mostly = { @@ -1962,7 +1963,8 @@ static struct irq_chip ioapic_ir_chip __read_mostly = { .irq_set_affinity = ioapic_set_affinity, .irq_retrigger = irq_chip_retrigger_hierarchy, .irq_get_irqchip_state = ioapic_irq_get_chip_state, - .flags = IRQCHIP_SKIP_SET_WAKE, + .flags = IRQCHIP_SKIP_SET_WAKE | + IRQCHIP_AFFINITY_PRE_STARTUP, };
static inline void init_IO_APIC_traps(void)
From: Bixuan Cui cuibixuan@huawei.com
stable inclusion from linux-4.19.205 commit 504a4c1057151a1f1332fb3ce940134db8d6b885
--------------------------------
commit dbbc93576e03fbe24b365fab0e901eb442237a8a upstream.
msi_domain_alloc_irqs() invokes irq_domain_activate_irq(), but msi_domain_free_irqs() does not enforce deactivation before tearing down the interrupts.
This happens when PCI/MSI interrupts are set up and never used before being torn down again, e.g. in error handling paths. The only place which cleans that up is the error handling path in msi_domain_alloc_irqs().
Move the cleanup from msi_domain_alloc_irqs() into msi_domain_free_irqs() to cure that.
Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Signed-off-by: Bixuan Cui cuibixuan@huawei.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210518033117.78104-1-cuibixuan@huawei.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/irq/msi.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 604974f2afb19..88269dd5a8cad 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -477,11 +477,6 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, return 0;
cleanup: - for_each_msi_vector(desc, i, dev) { - irq_data = irq_domain_get_irq_data(domain, i); - if (irqd_is_activated(irq_data)) - irq_domain_deactivate_irq(irq_data); - } msi_domain_free_irqs(domain, dev); return ret; } @@ -494,7 +489,15 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, */ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev) { + struct irq_data *irq_data; struct msi_desc *desc; + int i; + + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); + }
for_each_msi_entry(desc, dev) { /*
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 6aea847496c8c9a37a5df795c4fe42a0e5fcccc5
--------------------------------
commit 438553958ba19296663c6d6583d208dfb6792830 upstream.
The ordering of MSI-X enable in hardware is dysfunctional:
1) MSI-X is disabled in the control register
2) Various setup functions
3) pci_msi_setup_msi_irqs() is invoked which ends up accessing the MSI-X table entries
4) MSI-X is enabled and masked in the control register with the comment that enabling is required for some hardware to access the MSI-X table
Step #4 obviously contradicts #3. The history of this is an issue with the NIU hardware. When #4 was introduced, the table access actually happened in msix_program_entries(), which was invoked after enabling and masking MSI-X.
This was changed in commit d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") which removed the table write from msix_program_entries().
Interestingly enough, nobody noticed; either NIU still works or it never got any testing with a kernel 3.19 or later.
Nevertheless this is inconsistent and there is no reason why MSI-X can't be enabled and masked in the control register early on, i.e. move step #4 above to step #1. This preserves the NIU workaround and has no side effects on other hardware.
Fixes: d71d6432e105 ("PCI/MSI: Kill redundant call of irq_set_msi_desc() for MSI-X interrupts") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Ashok Raj ashok.raj@intel.com Reviewed-by: Marc Zyngier maz@kernel.org Acked-by: Bjorn Helgaas bhelgaas@google.com Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.344136412@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 873848b134357..7a3a66ee5476a 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -754,18 +754,25 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, u16 control; void __iomem *base;
- /* Ensure MSI-X is disabled while it is set up */ - pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + /* + * Some devices require MSI-X to be enabled before the MSI-X + * registers can be accessed. Mask all the vectors to prevent + * interrupts coming in before they're fully set up. + */ + pci_msix_clear_and_set_ctrl(dev, 0, PCI_MSIX_FLAGS_MASKALL | + PCI_MSIX_FLAGS_ENABLE);
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ base = msix_map_region(dev, msix_table_size(control)); - if (!base) - return -ENOMEM; + if (!base) { + ret = -ENOMEM; + goto out_disable; + }
ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) - return ret; + goto out_disable;
ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX); if (ret) @@ -776,14 +783,6 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free;
- /* - * Some devices require MSI-X to be enabled before we can touch the - * MSI-X registers. We need to mask all the vectors to prevent - * interrupts coming in before they're fully set up. - */ - pci_msix_clear_and_set_ctrl(dev, 0, - PCI_MSIX_FLAGS_MASKALL | PCI_MSIX_FLAGS_ENABLE); - msix_program_entries(dev, entries);
ret = populate_msi_sysfs(dev); @@ -818,6 +817,9 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, out_free: free_msi_irqs(dev);
+out_disable: + pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); + return ret; }
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 22f4a36d086d74f7abe9c4eaf65204048cd84f9c
--------------------------------
commit 361fd37397f77578735907341579397d5bed0a2d upstream.
msi_mask_irq() takes a mask and a flags argument. The mask argument is used to mask out bits from the cached mask and the flags argument to set bits.
Some places invoke it with a flags argument that sets bits which are not used by the device, e.g. when the device supports up to 8 vectors, a full unmask in some places sets the cached mask to 0xFFFFFF00. While devices probably do not care, it's still bad practice.
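A hedged worked example (a user-space mirror of the kernel's msi_mask() helper, shown for illustration only) of why ~mask is the wrong flags value for a full unmask:

    #include <stdint.h>
    #include <stdio.h>

    /* mirror of msi_mask(): mask bits for 2^x supported vectors */
    static uint32_t msi_mask(unsigned int x)
    {
            if (x >= 5)                    /* don't shift by >= type width */
                    return 0xffffffff;
            return (1u << (1u << x)) - 1;
    }

    int main(void)
    {
            uint32_t mask = msi_mask(3);   /* multi_cap = 3: 8 vectors */

            printf("mask        = 0x%08x\n", mask);        /* 0x000000ff */
            printf("flags ~mask = 0x%08x\n", ~mask);       /* 0xffffff00 */
            printf("flags 0     = 0x%08x\n", (uint32_t)0); /* clean unmask */
            return 0;
    }

With flags = ~mask, the 24 bits the device does not implement end up set in the cached mask; flags = 0 clears exactly the implemented mask bits and nothing else.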
Fixes: 7ba1930db02f ("PCI MSI: Unmask MSI if setup failed") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.568173099@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 7a3a66ee5476a..c1215310157cb 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -619,21 +619,21 @@ static int msi_capability_init(struct pci_dev *dev, int nvec, /* Configure MSI capability structure */ ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSI); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; }
ret = msi_verify_entries(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; }
ret = populate_msi_sysfs(dev); if (ret) { - msi_mask_irq(entry, mask, ~mask); + msi_mask_irq(entry, mask, 0); free_msi_irqs(dev); return ret; } @@ -908,7 +908,7 @@ static void pci_msi_shutdown(struct pci_dev *dev) /* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); /* Keep cached state to be restored */ - __pci_msi_desc_mask_irq(desc, mask, ~mask); + __pci_msi_desc_mask_irq(desc, mask, 0);
/* Restore dev->irq to its default pin-assertion irq */ dev->irq = desc->msi_attrib.default_irq;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit c5b223cd04706589e5e6840e2ab7c4f879323ed9
--------------------------------
commit 689e6b5351573c38ccf92a0dd8b3e2c2241e4aff upstream.
The comments about preserving the cached state in pci_msi[x]_shutdown() are misleading as the MSI descriptors are freed right after those functions return. So there is nothing to restore. Preparatory change.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.621609423@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index c1215310157cb..230238c47107f 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -907,7 +907,6 @@ static void pci_msi_shutdown(struct pci_dev *dev)
/* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); - /* Keep cached state to be restored */ __pci_msi_desc_mask_irq(desc, mask, 0);
/* Restore dev->irq to its default pin-assertion irq */ @@ -993,10 +992,8 @@ static void pci_msix_shutdown(struct pci_dev *dev) }
/* Return the device with MSI-X masked as initial states */ - for_each_pci_msi_entry(entry, dev) { - /* Keep cached states to be restored */ + for_each_pci_msi_entry(entry, dev) __pci_msix_desc_mask_irq(entry, 1); - }
pci_msix_clear_and_set_ctrl(dev, PCI_MSIX_FLAGS_ENABLE, 0); pci_intx_for_msi(dev, 1);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 1b36c30a9335db941423c05b49a8266a84a82f95
--------------------------------
commit d28d4ad2a1aef27458b3383725bb179beb8d015c upstream.
No point in using the raw write function from shutdown. Preparatory change to introduce proper serialization for the msi_desc::masked cache.
Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.674391354@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 230238c47107f..b043200cf3acc 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -907,7 +907,7 @@ static void pci_msi_shutdown(struct pci_dev *dev)
/* Return the device with MSI unmasked as initial states */ mask = msi_mask(desc->msi_attrib.multi_cap); - __pci_msi_desc_mask_irq(desc, mask, 0); + msi_mask_irq(desc, mask, 0);
/* Restore dev->irq to its default pin-assertion irq */ dev->irq = desc->msi_attrib.default_irq;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 3c9534778d4cc2bd01e20d4dcffc55df0962aa12
--------------------------------
commit 77e89afc25f30abd56e76a809ee2884d7c1b63ce upstream.
Multi-MSI uses a single MSI descriptor and there is a single mask register when the device supports per vector masking. To avoid reading back the mask register the value is cached in the MSI descriptor and updates are done by clearing and setting bits in the cache and writing it to the device.
But nothing protects msi_desc::masked and the mask register from being modified concurrently on two different CPUs for two different Linux interrupts which belong to the same multi-MSI descriptor.
Add a lock to struct device and protect any operation on the mask and the mask register with it.
This makes the update of msi_desc::masked unconditional, but there is no place which requires a modification of the hardware register without updating the masked cache.
msi_mask_irq() is now an empty wrapper which will be cleaned up in follow-up changes.

The problem goes way back to the initial support of multi-MSI, but picking the commit which introduced the mask cache is a valid cut-off point (2.6.30).
Fixes: f2440d9acbe8 ("PCI MSI: Refactor interrupt masking code") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.726833414@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/core.c | 1 + drivers/pci/msi.c | 19 ++++++++++--------- include/linux/device.h | 1 + include/linux/msi.h | 2 +- 4 files changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/base/core.c b/drivers/base/core.c index e196e6a67e63c..f6d8a4246adf9 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1682,6 +1682,7 @@ void device_initialize(struct device *dev) device_pm_init(dev); set_dev_node(dev, -1); #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spin_lock_init(&dev->msi_lock); INIT_LIST_HEAD(&dev->msi_list); #endif INIT_LIST_HEAD(&dev->links.consumers); diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b043200cf3acc..14dc5c1495836 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -170,24 +170,25 @@ static inline __attribute_const__ u32 msi_mask(unsigned x) * reliably as devices without an INTx disable bit will then generate a * level IRQ which will never be cleared. */ -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - u32 mask_bits = desc->masked; + raw_spinlock_t *lock = &desc->dev->msi_lock; + unsigned long flags;
if (pci_msi_ignore_mask || !desc->msi_attrib.maskbit) - return 0; + return;
- mask_bits &= ~mask; - mask_bits |= flag; + raw_spin_lock_irqsave(lock, flags); + desc->masked &= ~mask; + desc->masked |= flag; pci_write_config_dword(msi_desc_to_pci_dev(desc), desc->mask_pos, - mask_bits); - - return mask_bits; + desc->masked); + raw_spin_unlock_irqrestore(lock, flags); }
static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag) { - desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag); + __pci_msi_desc_mask_irq(desc, mask, flag); }
static void __iomem *pci_msix_desc_addr(struct msi_desc *desc) diff --git a/include/linux/device.h b/include/linux/device.h index d5c2c9dd1310b..954407693ea51 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1010,6 +1010,7 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ + raw_spinlock_t msi_lock; struct list_head msi_list; #endif
diff --git a/include/linux/msi.h b/include/linux/msi.h index 5dd171849a27e..62982e6afddfd 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -150,7 +150,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg); void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg);
u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag); -u32 __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); +void __pci_msi_desc_mask_irq(struct msi_desc *desc, u32 mask, u32 flag); void pci_msi_mask_irq(struct irq_data *data); void pci_msi_unmask_irq(struct irq_data *data);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 3b570884c868c12e3184627ce4b4a167e9d6f018
--------------------------------
commit 7d5ec3d3612396dc6d4b76366d20ab9fc06f399f upstream.
When MSI-X is enabled the ordering of calls is:
msix_map_region();
msix_setup_entries();
pci_msi_setup_msi_irqs();
msix_program_entries();
This has a few interesting issues:
1) msix_setup_entries() allocates the MSI descriptors and initializes them except for the msi_desc::masked member, which is left zero-initialized.
2) pci_msi_setup_msi_irqs() allocates the interrupt descriptors and sets up the MSI interrupts which ends up in pci_write_msi_msg() unless the interrupt chip provides its own irq_write_msi_msg() function.
3) msix_program_entries() does not do what the name suggests. It solely updates the entries array (if not NULL) and initializes the masked member for each MSI descriptor by reading the hardware state and then masks the entry.
Obviously this has some issues:
1) The uninitialized masked member of msi_desc prevents the enforcement of masking the entry in pci_write_msi_msg() depending on the cached masked bit. Aside of that, half-initialized data is a no-no in general.
2) msix_program_entries() only ensures that the actually allocated entries are masked. This is wrong as experimentation with crash testing and crash kernel kexec has shown.
This limited testing unearthed that when the production kernel had more entries in use (and unmasked) when it crashed, and the crash kernel allocated a smaller number of entries, then a full scan of all entries found unmasked entries which had been in use in the production kernel.
This is obviously a device or emulation issue as the device reset should mask all MSI-X table entries, but obviously that's just part of the paper specification.
Cure this by:
1) Masking all table entries in hardware
2) Initializing msi_desc::masked in msix_setup_entries()
3) Removing the mask dance in msix_program_entries()
4) Renaming msix_program_entries() to msix_update_entries() to reflect the purpose of that function.
As the masking of unused entries has never been done the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Acked-by: Bjorn Helgaas bhelgaas@google.com Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.403833459@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 14dc5c1495836..4d00f03d76dd8 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -686,6 +686,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, { struct cpumask *curmsk, *masks = NULL; struct msi_desc *entry; + void __iomem *addr; int ret, i;
if (affd) @@ -705,6 +706,7 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.is_msix = 1; entry->msi_attrib.is_64 = 1; + if (entries) entry->msi_attrib.entry_nr = entries[i].entry; else @@ -712,6 +714,10 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, entry->msi_attrib.default_irq = dev->irq; entry->mask_base = base;
+ addr = pci_msix_desc_addr(entry); + if (addr) + entry->masked = readl(addr + PCI_MSIX_ENTRY_VECTOR_CTRL); + list_add_tail(&entry->list, dev_to_msi_list(&dev->dev)); if (masks) curmsk++; @@ -722,21 +728,27 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base, return ret; }
-static void msix_program_entries(struct pci_dev *dev, - struct msix_entry *entries) +static void msix_update_entries(struct pci_dev *dev, struct msix_entry *entries) { struct msi_desc *entry; - int i = 0;
for_each_pci_msi_entry(entry, dev) { - if (entries) - entries[i++].vector = entry->irq; - entry->masked = readl(pci_msix_desc_addr(entry) + - PCI_MSIX_ENTRY_VECTOR_CTRL); - msix_mask_irq(entry, 1); + if (entries) { + entries->vector = entry->irq; + entries++; + } } }
+static void msix_mask_all(void __iomem *base, int tsize) +{ + u32 ctrl = PCI_MSIX_ENTRY_CTRL_MASKBIT; + int i; + + for (i = 0; i < tsize; i++, base += PCI_MSIX_ENTRY_SIZE) + writel(ctrl, base + PCI_MSIX_ENTRY_VECTOR_CTRL); +} + /** * msix_capability_init - configure device's MSI-X capability * @dev: pointer to the pci_dev data structure of MSI-X device function @@ -751,9 +763,9 @@ static void msix_program_entries(struct pci_dev *dev, static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, int nvec, const struct irq_affinity *affd) { - int ret; - u16 control; void __iomem *base; + int ret, tsize; + u16 control;
/* * Some devices require MSI-X to be enabled before the MSI-X @@ -765,12 +777,16 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
pci_read_config_word(dev, dev->msix_cap + PCI_MSIX_FLAGS, &control); /* Request & Map MSI-X table region */ - base = msix_map_region(dev, msix_table_size(control)); + tsize = msix_table_size(control); + base = msix_map_region(dev, tsize); if (!base) { ret = -ENOMEM; goto out_disable; }
+ /* Ensure that all table entries are masked. */ + msix_mask_all(base, tsize); + ret = msix_setup_entries(dev, base, entries, nvec, affd); if (ret) goto out_disable; @@ -784,7 +800,7 @@ static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries, if (ret) goto out_free;
- msix_program_entries(dev, entries); + msix_update_entries(dev, entries);
ret = populate_msi_sysfs(dev); if (ret)
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit b590b85fc91979a97cbb4ab1bcf888aa245cd5e3
--------------------------------
commit da181dc974ad667579baece33c2c8d2d1e4558d5 upstream.
The specification (PCIe r5.0, sec 6.1.4.5) states:
For MSI-X, a function is permitted to cache Address and Data values from unmasked MSI-X Table entries. However, anytime software unmasks a currently masked MSI-X Table entry either by clearing its Mask bit or by clearing the Function Mask bit, the function must update any Address or Data values that it cached from that entry. If software changes the Address or Data value of an entry while the entry is unmasked, the result is undefined.
The Linux kernel's MSI-X support never enforced that the entry is masked before the entry is modified hence the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Enforce the entry to be masked across the update.
There is no point in enforcing this to be handled at all possible call sites as this is just pointless code duplication and the common update function is the obvious place to enforce this.
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Reported-by: Kevin Tian kevin.tian@intel.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Acked-by: Bjorn Helgaas bhelgaas@google.com Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.462096385@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 4d00f03d76dd8..497b7219c7be9 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -303,10 +303,25 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) /* Don't touch the hardware now */ } else if (entry->msi_attrib.is_msix) { void __iomem *base = pci_msix_desc_addr(entry); + bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT); + + /* + * The specification mandates that the entry is masked + * when the message is modified: + * + * "If software changes the Address or Data value of an + * entry while the entry is unmasked, the result is + * undefined." + */ + if (unmasked) + __pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR); writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR); writel(msg->data, base + PCI_MSIX_ENTRY_DATA); + + if (unmasked) + __pci_msix_desc_mask_irq(entry, 0); } else { int pos = dev->msi_cap; u16 msgctl;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit 153cc7c9dfefe646c8b2a74eb925b6620b915154
--------------------------------
commit b9255a7cb51754e8d2645b65dd31805e282b4f3e upstream.
Nothing enforces the posted writes to be visible when the function returns. Flush them even if the flush might be redundant when the entry is already masked, as the unmask will flush as well. This is either setup or a rare affinity change event, so the extra flush is not the end of the world.
While this is more a theoretical issue especially the logic in the X86 specific msi_set_affinity() function relies on the assumption that the update has reached the hardware when the function returns.
Again, as this never has been enforced the Fixes tag refers to a commit in: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Fixes: f036d4ea5fa7 ("[PATCH] ia32 Message Signalled Interrupt support") Signed-off-by: Thomas Gleixner tglx@linutronix.de Tested-by: Marc Zyngier maz@kernel.org Reviewed-by: Marc Zyngier maz@kernel.org Acked-by: Bjorn Helgaas bhelgaas@google.com Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210729222542.515188147@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 497b7219c7be9..60f52a84fbaf2 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -322,6 +322,9 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (unmasked) __pci_msix_desc_mask_irq(entry, 0); + + /* Ensure that the writes are visible in the device */ + readl(base + PCI_MSIX_ENTRY_DATA); } else { int pos = dev->msi_cap; u16 msgctl; @@ -342,6 +345,8 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg) pci_write_config_word(dev, pos + PCI_MSI_DATA_32, msg->data); } + /* Ensure that the writes are visible in the device */ + pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl); } entry->msg = *msg; }
From: Saeed Mirzamohammadi saeed.mirzamohammadi@oracle.com
stable inclusion from linux-4.19.205 commit 6a9449e9568808930e7d4d83c33e320329113a67
--------------------------------
[ Upstream commit 327d5b2fee91c404a3956c324193892cf2cc9528 ]
The IOMMU driver calculates the guest addressability for a DMA request based on the value of the mgaw reported from the IOMMU. However, this is a fused value and, as mentioned in the spec, the guest width should be calculated based on the minimum of the supported adjusted guest address width (SAGAW) and the MGAW.
This is from specification: "Guest addressability for a given DMA request is limited to the minimum of the value reported through this field and the adjusted guest address width of the corresponding page-table structure. (Adjusted guest address widths supported by hardware are reported through the SAGAW field)."
This causes domain initialization to fail and following errors appear for EHCI PCI driver:
[ 2.486393] ehci-pci 0000:01:00.4: EHCI Host Controller
[ 2.486624] ehci-pci 0000:01:00.4: new USB bus registered, assigned bus number 1
[ 2.489127] ehci-pci 0000:01:00.4: DMAR: Allocating domain failed
[ 2.489350] ehci-pci 0000:01:00.4: DMAR: 32bit DMA uses non-identity mapping
[ 2.489359] ehci-pci 0000:01:00.4: can't setup: -12
[ 2.489531] ehci-pci 0000:01:00.4: USB bus 1 deregistered
[ 2.490023] ehci-pci 0000:01:00.4: init 0000:01:00.4 fail, -12
[ 2.490358] ehci-pci: probe of 0000:01:00.4 failed with error -12
This issue happens when the value of the sagaw corresponds to a 48-bit agaw. This fix updates the calculation of the agaw based on the minimum of the IOMMU's sagaw value and the MGAW.
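A hedged worked example of the new clamping (user-space arithmetic only; the MGAW/SAGAW values here are hypothetical):

    #include <stdio.h>

    #define LEVEL_STRIDE    9
    #define MAX_AGAW_WIDTH  64

    /* mirror of the kernel's agaw_to_width(): page-table AGAW -> bits */
    static int agaw_to_width(int agaw)
    {
            int width = 30 + agaw * LEVEL_STRIDE;

            return width < MAX_AGAW_WIDTH ? width : MAX_AGAW_WIDTH;
    }

    int main(void)
    {
            int mgaw = 57;        /* fused MGAW value (hypothetical) */
            int iommu_agaw = 2;   /* SAGAW supports only 4-level, 48-bit tables */
            int sagaw_width = agaw_to_width(iommu_agaw);
            int cap_width = sagaw_width < mgaw ? sagaw_width : mgaw;

            /* before the fix guest_width was clamped to mgaw (57) alone and
             * domain_init() failed against the 48-bit page-table structure */
            printf("guest width clamp: %d -> %d bits\n", mgaw, cap_width);
            return 0;
    }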
This issue happens on the code path of getting a private domain for a device. A private domain was needed when the domain of an iommu group couldn't meet the requirements of a device. The IOMMU core has since evolved to eliminate the need for private domains, hence this code path has already been removed upstream since commit 327d5b2fee91c ("iommu/vt-d: Allow 32bit devices to uses DMA domain"). Instead of backporting all patches that are required for removing the private domain, this simply fixes it in the affected stable kernels between v4.16 and v5.7.
[baolu: The original patch can be found here https://lore.kernel.org/linux-iommu/20210412202736.70765-1-saeed.mirzamohamm.... I added the commit message according to Greg's comments at https://lore.kernel.org/linux-iommu/YHZ%2FT9x7Xjf1r6fI@kroah.com/.]
Cc: Joerg Roedel joro@8bytes.org Cc: Ashok Raj ashok.raj@intel.com Cc: stable@vger.kernel.org #v4.16+ Signed-off-by: Saeed Mirzamohammadi saeed.mirzamohammadi@oracle.com Tested-by: Camille Lu camille.lu@hpe.com Signed-off-by: Lu Baolu baolu.lu@linux.intel.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/iommu/intel-iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 761185ac12c5e..7b0b790e1806d 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -1949,7 +1949,7 @@ static inline int guestwidth_to_adjustwidth(int gaw) static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, int guest_width) { - int adjust_width, agaw; + int adjust_width, agaw, cap_width; unsigned long sagaw; int err;
@@ -1963,8 +1963,9 @@ static int domain_init(struct dmar_domain *domain, struct intel_iommu *iommu, domain_reserve_special_ranges(domain);
/* calculate AGAW */ - if (guest_width > cap_mgaw(iommu->cap)) - guest_width = cap_mgaw(iommu->cap); + cap_width = min_t(int, cap_mgaw(iommu->cap), agaw_to_width(iommu->agaw)); + if (guest_width > cap_width) + guest_width = cap_width; domain->gaw = guest_width; adjust_width = guestwidth_to_adjustwidth(guest_width); agaw = width_to_agaw(adjust_width);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.205 commit e829367f47218de04587c2df3c4cb5ef87e35648
--------------------------------
commit f9dfb5e390fab2df9f7944bb91e7705aba14cd26 upstream.
The XSAVE init code initializes all enabled and supported components with XRSTOR(S) to init state. Then it XSAVEs the state of the components back into init_fpstate which is used in several places to fill in the init state of components.
This works correctly with XSAVE, but not with XSAVEOPT and XSAVES because those use the init optimization and skip writing state of components which are in init state. So init_fpstate.xsave still contains all zeroes after this operation.
There are two ways to solve that:
1) Use XSAVE unconditionally, but that requires reshuffling the buffer when XSAVES is enabled because XSAVES uses the compacted format.
2) Save the components which are known to have a non-zero init state by other means.
Looking deeper, #2 is the right thing to do because all components the kernel supports have all-zeroes init state except the legacy features (FP, SSE). Those cannot be hard coded because the states are not identical on all CPUs, but they can be saved with FXSAVE which avoids all conditionals.
Use FXSAVE to save the legacy FP/SSE components in init_fpstate along with a BUILD_BUG_ON() which reminds developers to validate that a newly added component has all zeroes init state. As a bonus remove the now unused copy_xregs_to_kernel_booting() crutch.
The XSAVE and reshuffle method can still be implemented in the unlikely case that components are added which have a non-zero init state and no other means to save them. For now, FXSAVE is just simple and good enough.
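A hedged user-space illustration (x86-64 only; offsets follow the FXSAVE area layout in the SDM; the expected values assume the default ABI start-up state) of why the legacy FP/SSE init state is not all zeroes:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            /* 512-byte FXSAVE area; the instruction requires 16-byte alignment */
            uint8_t area[512] __attribute__((aligned(16)));
            uint16_t fcw;
            uint32_t mxcsr;

            memset(area, 0, sizeof(area));
            /* put x87 into init state, then save legacy state with FXSAVE */
            __asm__ volatile("fninit\n\tfxsave (%0)" : : "r"(area) : "memory");

            memcpy(&fcw, &area[0], sizeof(fcw));       /* x87 FCW, offset 0 */
            memcpy(&mxcsr, &area[24], sizeof(mxcsr));  /* MXCSR, offset 24 */

            /* expected: FCW = 0x037f and MXCSR = 0x1f80, i.e. non-zero init
             * state, which XSAVEOPT/XSAVES would skip writing back */
            printf("FCW = 0x%04x, MXCSR = 0x%08x\n", fcw, mxcsr);
            return 0;
    }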
[ bp: Fix a typo or two in the text. ]
Fixes: 6bad06b76892 ("x86, xsave: Use xsaveopt in context-switch path when supported") Signed-off-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Borislav Petkov bp@suse.de Reviewed-by: Borislav Petkov bp@suse.de Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210618143444.587311343@linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/include/asm/fpu/internal.h | 30 ++++++----------------- arch/x86/kernel/fpu/xstate.c | 38 ++++++++++++++++++++++++++--- 2 files changed, 43 insertions(+), 25 deletions(-)
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index fa2c93cb42a27..3379a97f16642 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -214,6 +214,14 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) } }
+static inline void fxsave(struct fxregs_state *fx) +{ + if (IS_ENABLED(CONFIG_X86_32)) + asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); + else + asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); +} + /* These macros all use (%edi)/(%rdi) as the single memory argument. */ #define XSAVE ".byte " REX_PREFIX "0x0f,0xae,0x27" #define XSAVEOPT ".byte " REX_PREFIX "0x0f,0xae,0x37" @@ -278,28 +286,6 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu) : "D" (st), "m" (*st), "a" (lmask), "d" (hmask) \ : "memory")
-/* - * This function is called only during boot time when x86 caps are not set - * up and alternative can not be used yet. - */ -static inline void copy_xregs_to_kernel_booting(struct xregs_state *xstate) -{ - u64 mask = -1; - u32 lmask = mask; - u32 hmask = mask >> 32; - int err; - - WARN_ON(system_state != SYSTEM_BOOTING); - - if (static_cpu_has(X86_FEATURE_XSAVES)) - XSTATE_OP(XSAVES, xstate, lmask, hmask, err); - else - XSTATE_OP(XSAVE, xstate, lmask, hmask, err); - - /* We should never fault when copying to a kernel buffer: */ - WARN_ON_FPU(err); -} - /* * This function is called only during boot time when x86 caps are not set * up and alternative can not be used yet. diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 87a57b7642d36..78372d5a35ecb 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -404,6 +404,24 @@ static void __init print_xstate_offset_size(void) } }
+/* + * All supported features have either init state all zeros or are + * handled in setup_init_fpu() individually. This is an explicit + * feature list and does not use XFEATURE_MASK*SUPPORTED to catch + * newly added supported features at build time and make people + * actually look at the init state for the new feature. + */ +#define XFEATURES_INIT_FPSTATE_HANDLED \ + (XFEATURE_MASK_FP | \ + XFEATURE_MASK_SSE | \ + XFEATURE_MASK_YMM | \ + XFEATURE_MASK_OPMASK | \ + XFEATURE_MASK_ZMM_Hi256 | \ + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) + /* * setup the xstate image representing the init state */ @@ -411,6 +429,8 @@ static void __init setup_init_fpu_buf(void) { static int on_boot_cpu __initdata = 1;
+ BUILD_BUG_ON(XCNTXT_MASK != XFEATURES_INIT_FPSTATE_HANDLED); + WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0;
@@ -429,10 +449,22 @@ static void __init setup_init_fpu_buf(void) copy_kernel_to_xregs_booting(&init_fpstate.xsave);
/* - * Dump the init state again. This is to identify the init state - * of any feature which is not represented by all zero's. + * All components are now in init state. Read the state back so + * that init_fpstate contains all non-zero init state. This only + * works with XSAVE, but not with XSAVEOPT and XSAVES because + * those use the init optimization which skips writing data for + * components in init state. + * + * XSAVE could be used, but that would require to reshuffle the + * data when XSAVES is available because XSAVES uses xstate + * compaction. But doing so is a pointless exercise because most + * components have an all zeros init state except for the legacy + * ones (FP and SSE). Those can be saved with FXSAVE into the + * legacy area. Adding new features requires to ensure that init + * state is all zeroes or if not to add the necessary handling + * here. */ - copy_xregs_to_kernel_booting(&init_fpstate.xsave); + fxsave(&init_fpstate.fxsave); }
static int xfeature_uncompacted_offset(int xfeature_nr)
From: Ye Bin yebin10@huawei.com
stable inclusion from linux-4.19.205 commit e25e7495d72649cb50b42865356bfe272b3a2a6d
--------------------------------
[ Upstream commit bc546c0c9abb3bb2fb46866b3d1e6ade9695a5f6 ]
The following BUG_ON() was observed during RDAC scan:
[595952.944297] kernel BUG at drivers/scsi/device_handler/scsi_dh_rdac.c:427!
[595952.951143] Internal error: Oops - BUG: 0 [#1] SMP
......
[595953.251065] Call trace:
[595953.259054]  check_ownership+0xb0/0x118
[595953.269794]  rdac_bus_attach+0x1f0/0x4b0
[595953.273787]  scsi_dh_handler_attach+0x3c/0xe8
[595953.278211]  scsi_dh_add_device+0xc4/0xe8
[595953.282291]  scsi_sysfs_add_sdev+0x8c/0x2a8
[595953.286544]  scsi_probe_and_add_lun+0x9fc/0xd00
[595953.291142]  __scsi_scan_target+0x598/0x630
[595953.295395]  scsi_scan_target+0x120/0x130
[595953.299481]  fc_user_scan+0x1a0/0x1c0 [scsi_transport_fc]
[595953.304944]  store_scan+0xb0/0x108
[595953.308420]  dev_attr_store+0x44/0x60
[595953.312160]  sysfs_kf_write+0x58/0x80
[595953.315893]  kernfs_fop_write+0xe8/0x1f0
[595953.319888]  __vfs_write+0x60/0x190
[595953.323448]  vfs_write+0xac/0x1c0
[595953.326836]  ksys_write+0x74/0xf0
[595953.330221]  __arm64_sys_write+0x24/0x30
The crashing code is in check_ownership():
  list_for_each_entry_rcu(tmp, &h->ctlr->dh_list, node) {
          /* h->sdev should always be valid */
          BUG_ON(!tmp->sdev);
          tmp->sdev->access_state = access_state;
  }
  rdac_bus_attach
    initialize_controller
      list_add_rcu(&h->node, &h->ctlr->dh_list);
      h->sdev = sdev;

  rdac_bus_detach
    list_del_rcu(&h->node);
    h->sdev = NULL;
Fix the race between rdac_bus_attach() and rdac_bus_detach() where a reader could observe h->sdev == NULL while processing the RDAC attach: set h->sdev before linking the node into dh_list, and on detach free the node only after an RCU grace period instead of clearing h->sdev in place.
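The attach-side ordering rule can be sketched in user space, with C11 release/acquire standing in for the publication guarantee of list_add_rcu() and the reader's rcu_dereference(); all names here are illustrative:

  #include <stdatomic.h>
  #include <stdio.h>
  #include <stdlib.h>

  /* Publish-before-link: the node must be fully initialized (sdev set)
   * before it becomes reachable by lockless readers. */
  struct node { void *sdev; };

  static _Atomic(struct node *) dh_list;   /* one-element stand-in list */

  static void attach(struct node *n, void *sdev)
  {
      n->sdev = sdev;                                /* initialize first... */
      atomic_store_explicit(&dh_list, n,
                            memory_order_release);   /* ...then publish */
  }

  static void reader(void)
  {
      struct node *n = atomic_load_explicit(&dh_list,
                                            memory_order_acquire);
      if (n && !n->sdev)
          abort();            /* the BUG_ON(!tmp->sdev) from the report */
  }

  int main(void)
  {
      static struct node n;

      attach(&n, (void *)0x1);
      reader();
      puts("a published node always has a valid sdev");
      return 0;
  }

On the detach side, the added synchronize_rcu() before kfree() guarantees that no reader can still be traversing the node when it is freed, which is why h->sdev no longer needs to be cleared.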
Link: https://lore.kernel.org/r/20210113063103.2698953-1-yebin10@huawei.com Reviewed-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Ye Bin yebin10@huawei.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/device_handler/scsi_dh_rdac.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/device_handler/scsi_dh_rdac.c b/drivers/scsi/device_handler/scsi_dh_rdac.c index 6c629ef1bc4e3..b3c23edd4b6cb 100644 --- a/drivers/scsi/device_handler/scsi_dh_rdac.c +++ b/drivers/scsi/device_handler/scsi_dh_rdac.c @@ -453,8 +453,8 @@ static int initialize_controller(struct scsi_device *sdev, if (!h->ctlr) err = SCSI_DH_RES_TEMP_UNAVAIL; else { - list_add_rcu(&h->node, &h->ctlr->dh_list); h->sdev = sdev; + list_add_rcu(&h->node, &h->ctlr->dh_list); } spin_unlock(&list_lock); err = SCSI_DH_OK; @@ -779,11 +779,11 @@ static void rdac_bus_detach( struct scsi_device *sdev ) spin_lock(&list_lock); if (h->ctlr) { list_del_rcu(&h->node); - h->sdev = NULL; kref_put(&h->ctlr->kref, release_controller); } spin_unlock(&list_lock); sdev->handler_data = NULL; + synchronize_rcu(); kfree(h); }
From: Sreekanth Reddy sreekanth.reddy@broadcom.com
stable inclusion from linux-4.19.205 commit 460add3104945704bfd2b92441b48a30b2ee9f1e
--------------------------------
[ Upstream commit 70edd2e6f652f67d854981fd67f9ad0f1deaea92 ]
Avoid printing a 'target allocation failed' error if the driver target_alloc() callback function returns -ENXIO. This return value indicates that the corresponding H:C:T:L entry is empty.
Removing this error message reduces scan time when the user issues a SCAN_WILD_CARD scan operation through the sysfs parameter on a host with many empty H:C:T:L entries.
Avoiding the printk on -ENXIO matches the behavior of the other callback functions during scanning.
Link: https://lore.kernel.org/r/20210726115402.1936-1-sreekanth.reddy@broadcom.com Signed-off-by: Sreekanth Reddy sreekanth.reddy@broadcom.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi_scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 27610816ccb8d..af9ee2feb82df 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -462,7 +462,8 @@ static struct scsi_target *scsi_alloc_target(struct device *parent, error = shost->hostt->target_alloc(starget);
if(error) { - dev_printk(KERN_ERR, dev, "target allocation failed, error %d\n", error); + if (error != -ENXIO) + dev_err(dev, "target allocation failed, error %d\n", error); /* don't want scsi_target_reap to do the final * put because it will be under the host lock */ scsi_target_destroy(starget);
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.205 commit 2d349b0a69edc4f7f09c543abd098ab3c7fe31c3
--------------------------------
[ Upstream commit 5acce0bff2a0420ce87d4591daeb867f47d552c2 ]
The following commands:
  # echo 'read_max u64 size;' > synthetic_events
  # echo 'hist:keys=common_pid:count=count:onmax($count).trace(read_max,count)' > events/syscalls/sys_enter_read/trigger
Causes:
BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: 0000 [#1] PREEMPT SMP
CPU: 4 PID: 1763 Comm: bash Not tainted 5.14.0-rc2-test+ #155
Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
RIP: 0010:strcmp+0xc/0x20
Code: 75 f7 31 c0 0f b6 0c 06 88 0c 02 48 83 c0 01 84 c9 75 f1 4c 89 c0 c3 0f 1f 80 00 00 00 00 31 c0 eb 08 48 83 c0 01 84 d2 74 0f <0f> b6 14 07 3a 14 06 74 ef 19 c0 83 c8 01 c3 31 c0 c3 66 90 48 89
RSP: 0018:ffffb5fdc0963ca8 EFLAGS: 00010246
RAX: 0000000000000000 RBX: ffffffffb3a4e040 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff9714c0d0b640 RDI: 0000000000000000
RBP: 0000000000000000 R08: 00000022986b7cde R09: ffffffffb3a4dff8
R10: 0000000000000000 R11: 0000000000000000 R12: ffff9714c50603c8
R13: 0000000000000000 R14: ffff97143fdf9e48 R15: ffff9714c01a2210
FS:  00007f1fa6785740(0000) GS:ffff9714da400000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000000000000000 CR3: 000000002d863004 CR4: 00000000001706e0
Call Trace:
 __find_event_file+0x4e/0x80
 action_create+0x6b7/0xeb0
 ? kstrdup+0x44/0x60
 event_hist_trigger_func+0x1a07/0x2130
 trigger_process_regex+0xbd/0x110
 event_trigger_write+0x71/0xd0
 vfs_write+0xe9/0x310
 ksys_write+0x68/0xe0
 do_syscall_64+0x3b/0x90
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7f1fa6879e87
The problem was the "trace(read_max,count)" where the "count" should be "$count", as "onmax()" only handles variables (although it really should be able to figure out that "count" is a field of sys_enter_read). But there is a path that does not find the variable and ends up passing a NULL event, which then gets handed to "strcmp()".
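For reference, the corrected trigger, with the variable reference onmax() expects, would presumably be:

  # echo 'hist:keys=common_pid:count=count:onmax($count).trace(read_max,$count)' > events/syscalls/sys_enter_read/trigger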
Add a NULL check so that the command instead fails with an error:
  # cat error_log
  hist:syscalls:sys_enter_read: error: Couldn't create or find variable
    Command: hist:keys=common_pid:count=count:onmax($count).trace(read_max,count)
                                                                          ^
Link: https://lkml.kernel.org/r/20210808003011.4037f8d0@oasis.local.home
Cc: Masami Hiramatsu mhiramat@kernel.org Cc: stable@vger.kernel.org Fixes: 50450603ec9cb ("tracing: Add 'onmax' hist trigger action support") Reviewed-by: Tom Zanussi zanussi@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_events_hist.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 76813bc91e1e0..638566ea78393 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3679,6 +3679,8 @@ onmatch_create_field_var(struct hist_trigger_data *hist_data, event = data->onmatch.match_event; }
+ if (!event) + goto free; /* * At this point, we're looking at a field on another * event. Because we can't modify a hist trigger on
From: Sergey Marinkevich sergey.marinkevich@eltex-co.ru
stable inclusion from linux-4.19.205 commit 991158d680774ca5010fe6e15aa3a61aebfdf688
--------------------------------
[ Upstream commit 2e34328b396a69b73661ba38d47d92b7cf21c2c4 ]
I hit a problem on MIPS with big-endian enabled: every time netfilter tried to change the TCP MSS it bailed out because new.v16 was greater than old.v16. But the real MSS was 1460 and my rule was:
add rule table chain tcp option maxseg size set 1400
And 1400 is less than 1460, not greater.
Later I found that the root cause is the cast from u32 to __be16.
Debugging:
In this example MSS = 1400 (hex: 0x578). Below is the representation of each byte as laid out in memory, addresses increasing from left to right (i.e. [0x0 0x1 0x2 0x3]). LE is a little-endian system, BE a big-endian one; the left column gives the type.
                      LE             BE
  u32:             [78 05 00 00]  [00 00 05 78]
As you can see, the u32 will be cast to u16 from a different half of the 4-byte range depending on endianness. But nf_tables uses registers to store data of various sizes; the TCP MSS occupies only 2 bytes, while the registers are still defined as u32:
  struct nft_regs {
          union {
                  u32                     data[20];
                  struct nft_verdict      verdict;
          };
  };
So an access like regs->data[priv->sreg] is exactly a u32. According to the table above, the per-byte representation of a TCP MSS stored in a register is:
                        LE             BE
  (u32)regs->data[]: [78 05 00 00]  [05 78 00 00]
                      ^^ ^^          ^^ ^^
We see that the register uses only half of the u32, and the other 2 bytes may hold some other data. But nft_exthdr_tcp_set_eval() cast it as a plain u32 -> __be16:
new.v16 = src
But a u32 does not fit into a __be16, so the cast takes the 2 low-order bytes. For clarity, one more table (<xx xx> marks the bytes the cast selects):
                        LE               BE
  u32:               [<78 05> 00 00]  [00 00 <05 78>]
  (u32)regs->data[]: [<78 05> 00 00]  [05 78 <00 00>]
As you can see, nothing changes on little-endian, but on big-endian we take the wrong half. In my case those bytes held some other data instead of zeroes, so the new MSS came out wrongly greater.
To fix this bug I reused the solution already applied for port ranges: load the value with nft_reg_load16(). This patch does not affect little-endian systems.
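The effect can be reproduced with a small user-space sketch. Here reg_load16() is a hypothetical stand-in for the kernel's nft_reg_load16(): it reads the 16-bit value from the first two bytes of the register slot instead of truncating the u32:

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  /* A 16-bit value is stored in the low-address half of a u32 register
   * slot. Reading those two bytes back is endian-agnostic; a plain
   * (uint16_t) cast of the u32 is not. */
  static uint16_t reg_load16(const uint32_t *reg)
  {
      uint16_t v;

      memcpy(&v, reg, sizeof(v));    /* first 2 bytes of the slot */
      return v;
  }

  int main(void)
  {
      uint32_t reg;
      uint16_t mss = 1400;

      memset(&reg, 0xaa, sizeof(reg));    /* "some other data" */
      memcpy(&reg, &mss, sizeof(mss));    /* store the 2-byte MSS */

      printf("(u16)reg cast: %u\n", (uint16_t)reg);
      printf("reg_load16():  %u\n", reg_load16(&reg));
      /* Little-endian prints 1400 twice; big-endian prints 43690
       * (0xaaaa, the wrong half) for the cast and 1400 for the load. */
      return 0;
  }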
Signed-off-by: Sergey Marinkevich sergey.marinkevich@eltex-co.ru Acked-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nft_exthdr.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-)
diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index 64e69d6683cab..93fee41060192 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -137,7 +137,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, unsigned int i, optl, tcphdr_len, offset; struct tcphdr *tcph; u8 *opt; - u32 src;
tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); if (!tcph) @@ -146,7 +145,6 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, opt = (u8 *)tcph; for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { union { - u8 octet; __be16 v16; __be32 v32; } old, new; @@ -167,13 +165,13 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, if (!tcph) return;
- src = regs->data[priv->sreg]; offset = i + priv->offset;
switch (priv->len) { case 2: old.v16 = get_unaligned((u16 *)(opt + offset)); - new.v16 = src; + new.v16 = (__force __be16)nft_reg_load16( + &regs->data[priv->sreg]);
switch (priv->type) { case TCPOPT_MSS: @@ -191,7 +189,7 @@ static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, old.v16, new.v16, false); break; case 4: - new.v32 = src; + new.v32 = regs->data[priv->sreg]; old.v32 = get_unaligned((u32 *)(opt + offset));
if (old.v32 == new.v32)
From: Shreyansh Chouhan chouhan.shreyansh630@gmail.com
stable inclusion from linux-4.19.206 commit c33471daf2763c5aee2b7926202c74b75c365119
--------------------------------
[ Upstream commit 1d011c4803c72f3907eccfc1ec63caefb852fcbf ]
Validate csum_start in gre_handle_offloads() before we call __gre_xmit() so that we do not crash later when the csum_start value is used in the lco_csum() call.
This patch deals with the IPv4 code.
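As a rough user-space illustration of the added check (a toy structure, not the real sk_buff API):

  #include <stdint.h>
  #include <stdio.h>

  /* A checksum start that lies before the packet data is malformed and
   * would later lead lco_csum() astray, so reject it up front. */
  struct toy_skb {
      uint8_t buf[64];
      uint8_t *data;          /* start of packet data */
      uint8_t *csum_start;    /* where checksumming should begin */
  };

  static int handle_offloads(const struct toy_skb *skb, int csum)
  {
      if (csum && skb->csum_start < skb->data)
          return -22;         /* -EINVAL */
      return 0;               /* proceed with offload setup */
  }

  int main(void)
  {
      struct toy_skb skb;

      skb.data = skb.buf + 16;
      skb.csum_start = skb.buf + 8;    /* bogus: before the data */
      printf("handle_offloads: %d\n", handle_offloads(&skb, 1));  /* -22 */
      return 0;
  }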
Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Reported-by: syzbot+ff8e1b9f2f36481e2efc@syzkaller.appspotmail.com Signed-off-by: Shreyansh Chouhan chouhan.shreyansh630@gmail.com Reviewed-by: Willem de Bruijn willemb@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/ip_gre.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index de6f89511a216..a8a37d1128201 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -449,6 +449,8 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
static int gre_handle_offloads(struct sk_buff *skb, bool csum) { + if (csum && skb_checksum_start(skb) < skb->data) + return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); }
From: Parav Pandit parav@nvidia.com
stable inclusion from linux-4.19.206 commit 68208dc42dd906fe626224000d85e9513dbe5199
--------------------------------
[ Upstream commit 43bb40c5b92659966bdf4bfe584fde0a3575a049 ]
When a virtio PCI device undergoes surprise removal (aka async removal in the PCIe spec), mark the device as broken so that any upper-layer drivers can abort outstanding operations.
When a virtio-net PCI device that was in use by NetworkManager underwent surprise removal, the call trace below was observed.
kernel:watchdog: BUG: soft lockup - CPU#1 stuck for 26s! [kworker/1:1:27059]
watchdog: BUG: soft lockup - CPU#1 stuck for 52s! [kworker/1:1:27059]
CPU: 1 PID: 27059 Comm: kworker/1:1 Tainted: G S W I L 5.13.0-hotplug+ #8
Hardware name: Dell Inc. PowerEdge R640/0H28RR, BIOS 2.9.4 11/06/2020
Workqueue: events linkwatch_event
RIP: 0010:virtnet_send_command+0xfc/0x150 [virtio_net]
Call Trace:
 virtnet_set_rx_mode+0xcf/0x2a7 [virtio_net]
 ? __hw_addr_create_ex+0x85/0xc0
 __dev_mc_add+0x72/0x80
 igmp6_group_added+0xa7/0xd0
 ipv6_mc_up+0x3c/0x60
 ipv6_find_idev+0x36/0x80
 addrconf_add_dev+0x1e/0xa0
 addrconf_dev_config+0x71/0x130
 addrconf_notify+0x1f5/0xb40
 ? rtnl_is_locked+0x11/0x20
 ? __switch_to_asm+0x42/0x70
 ? finish_task_switch+0xaf/0x2c0
 ? raw_notifier_call_chain+0x3e/0x50
 raw_notifier_call_chain+0x3e/0x50
 netdev_state_change+0x67/0x90
 linkwatch_do_dev+0x3c/0x50
 __linkwatch_run_queue+0xd2/0x220
 linkwatch_event+0x21/0x30
 process_one_work+0x1c8/0x370
 worker_thread+0x30/0x380
 ? process_one_work+0x370/0x370
 kthread+0x118/0x140
 ? set_kthread_struct+0x40/0x40
 ret_from_fork+0x1f/0x30
Hence, add the ability to abort the command on surprise removal, which prevents the infinite loop and system lockup.
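A toy model of the resulting abort pattern (illustrative only, not virtio's actual API):

  #include <stdbool.h>
  #include <stdio.h>

  /* A command loop that would otherwise spin forever, as in the soft
   * lockup above, checks a "broken" flag that the remove path sets on
   * surprise removal. */
  struct toy_vq {
      bool broken;
      bool done;
  };

  static int send_command(struct toy_vq *vq)
  {
      while (!vq->done) {
          if (vq->broken)
              return -5;      /* -EIO: device is gone, bail out */
          /* poll the (now absent) device ... */
      }
      return 0;
  }

  int main(void)
  {
      struct toy_vq vq = { .broken = true };   /* surprise removal hit */

      printf("send_command: %d\n", send_command(&vq));   /* -5 */
      return 0;
  }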
Signed-off-by: Parav Pandit parav@nvidia.com Link: https://lore.kernel.org/r/20210721142648.1525924-5-parav@nvidia.com Signed-off-by: Michael S. Tsirkin mst@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/virtio/virtio_pci_common.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 45b04bc91f248..b7cc63f556eea 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -579,6 +579,13 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev); struct device *dev = get_device(&vp_dev->vdev.dev);
+ /* + * Device is marked broken on surprise removal so that virtio upper + * layers can abort any ongoing operation. + */ + if (!pci_device_is_present(pci_dev)) + virtio_break_device(&vp_dev->vdev); + pci_disable_sriov(pci_dev);
unregister_virtio_device(&vp_dev->vdev);
hulk inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------
Fix the KABI breakage in struct device introduced by 77e89afc25f3 ("PCI/MSI: Protect msi_desc::masked for multi-MSI").
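The trick used below can be shown in isolation: the new field is overlaid on a reserved slot through an anonymous union, so the structure's size and all field offsets stay unchanged (a sketch with made-up names):

  #include <stdio.h>

  /* Old layout: a plain reserved slot. */
  struct old_dev {
      int existing;
      unsigned long kabi_reserve1;
  };

  /* New layout: the added field shares storage with the reserved slot,
   * so sizeof() and offsets match the old layout exactly. */
  struct new_dev {
      int existing;
      union {
          unsigned long new_lock;        /* newly added field */
          unsigned long kabi_reserve1;   /* original placeholder */
      };
  };

  int main(void)
  {
      printf("old: %zu, new: %zu\n",
             sizeof(struct old_dev), sizeof(struct new_dev));  /* equal */
      return 0;
  }

The CONFIG_DEBUG_SPINLOCK/CONFIG_DEBUG_LOCK_ALLOC guards presumably exist because a debug raw_spinlock_t is larger than a single unsigned long and would no longer fit in the reserved slot.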
Reviewed-by: Xiongfeng Wang wangxiongfeng2@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/device.h | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/linux/device.h b/include/linux/device.h index 954407693ea51..44a3dd381f55c 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -1010,7 +1010,6 @@ struct device { struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ - raw_spinlock_t msi_lock; struct list_head msi_list; #endif
@@ -1062,7 +1061,23 @@ struct device { bool offline:1; bool of_node_reused:1;
+#ifdef CONFIG_GENERIC_MSI_IRQ +#if !defined(CONFIG_DEBUG_SPINLOCK) && !defined(CONFIG_DEBUG_LOCK_ALLOC) +#ifndef __GENKSYMS__ + union { + raw_spinlock_t msi_lock; + unsigned long kabi_reserve1; + }; +#else + KABI_RESERVE(1) +#endif +#else + raw_spinlock_t msi_lock; KABI_RESERVE(1) +#endif +#else + KABI_RESERVE(1) +#endif KABI_RESERVE(2) KABI_RESERVE(3) KABI_RESERVE(4)