From: Jia He justin.he@arm.com
stable inclusion from linux-4.19.149 commit 8579a0440381353e0a71dd6a4d4371be8457eac4
--------------------------------
[ Upstream commit 83d116c53058d505ddef051e90ab27f57015b025 ]
When we tested the pmdk unit test [1] vmmalloc_fork TEST3 on an arm64 guest, we hit a double page fault in __copy_from_user_inatomic() of cow_user_page().
To reproduce the bug, run the following command after you have deployed everything:

  make -C src/test/vmmalloc_fork/ TEST_TIME=60m check
Below call trace is from arm64 do_page_fault, for debugging purposes:

[  110.016195] Call trace:
[  110.016826]  do_page_fault+0x5a4/0x690
[  110.017812]  do_mem_abort+0x50/0xb0
[  110.018726]  el1_da+0x20/0xc4
[  110.019492]  __arch_copy_from_user+0x180/0x280
[  110.020646]  do_wp_page+0xb0/0x860
[  110.021517]  __handle_mm_fault+0x994/0x1338
[  110.022606]  handle_mm_fault+0xe8/0x180
[  110.023584]  do_page_fault+0x240/0x690
[  110.024535]  do_mem_abort+0x50/0xb0
[  110.025423]  el0_da+0x20/0x24
The pte info before __copy_from_user_inatomic is (PTE_AF is cleared):

[ffff9b007000] pgd=000000023d4f8003, pud=000000023da9b003,
               pmd=000000023d4b3003, pte=360000298607bd3
As told by Catalin: "On arm64 without hardware Access Flag, copying from user will fail because the pte is old and cannot be marked young. So we always end up with zeroed page after fork() + CoW for pfn mappings. We don't always have a hardware-managed access flag on arm64."
This patch fixes it by calling pte_mkyoung on the old pte. The parameters of cow_user_page() are also changed, because vmf needs to be passed in so that orig_pte and the page-table lock are available.

Add a WARN_ON_ONCE when __copy_from_user_inatomic() returns an error, in case there is some obscure use-case (suggested by Kirill).
[1] https://github.com/pmem/pmdk/tree/master/src/test/vmmalloc_fork
Signed-off-by: Jia He justin.he@arm.com
Reported-by: Yibo Cai Yibo.Cai@arm.com
Reviewed-by: Catalin Marinas catalin.marinas@arm.com
Acked-by: Kirill A. Shutemov kirill.shutemov@linux.intel.com
Signed-off-by: Catalin Marinas catalin.marinas@arm.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/memory.c | 104 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 89 insertions(+), 15 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c65d3993ac63..c4c667c0b89d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -117,6 +117,18 @@ int randomize_va_space __read_mostly = 2;
 #endif
 
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+	/*
+	 * Those arches which don't have hw access flag feature need to
+	 * implement their own helper. By default, "true" means pagefault
+	 * will be hit on old pte.
+	 */
+	return true;
+}
+#endif
+
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
@@ -2089,32 +2101,82 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 	return same;
 }
 
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline bool cow_user_page(struct page *dst, struct page *src,
+				 struct vm_fault *vmf)
 {
+	bool ret;
+	void *kaddr;
+	void __user *uaddr;
+	bool force_mkyoung;
+	struct vm_area_struct *vma = vmf->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = vmf->address;
+
 	debug_dma_assert_idle(src);
 
+	if (likely(src)) {
+		copy_user_highpage(dst, src, addr, vma);
+		return true;
+	}
+
 	/*
 	 * If the source page was a PFN mapping, we don't have
 	 * a "struct page" for it. We do a best-effort copy by
 	 * just copying from the original user address. If that
 	 * fails, we just zero-fill it. Live with it.
 	 */
-	if (unlikely(!src)) {
-		void *kaddr = kmap_atomic(dst);
-		void __user *uaddr = (void __user *)(va & PAGE_MASK);
+	kaddr = kmap_atomic(dst);
+	uaddr = (void __user *)(addr & PAGE_MASK);
+
+	/*
+	 * On architectures with software "accessed" bits, we would
+	 * take a double page fault, so mark it accessed here.
+	 */
+	force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
+	if (force_mkyoung) {
+		pte_t entry;
+
+		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+			/*
+			 * Other thread has already handled the fault
+			 * and we don't need to do anything. If it's
+			 * not the case, the fault will be triggered
+			 * again on the same address.
+			 */
+			ret = false;
+			goto pte_unlock;
+		}
+
+		entry = pte_mkyoung(vmf->orig_pte);
+		if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+			update_mmu_cache(vma, addr, vmf->pte);
+	}
+
+	/*
+	 * This really shouldn't fail, because the page is there
+	 * in the page tables. But it might just be unreadable,
+	 * in which case we just give up and fill the result with
+	 * zeroes.
+	 */
+	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
 		/*
-		 * This really shouldn't fail, because the page is there
-		 * in the page tables. But it might just be unreadable,
-		 * in which case we just give up and fill the result with
-		 * zeroes.
+		 * Give a warn in case there can be some obscure
+		 * use-case
 		 */
-		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
-			clear_page(kaddr);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(dst);
-	} else
-		copy_user_highpage(dst, src, va, vma);
+		WARN_ON_ONCE(1);
+		clear_page(kaddr);
+	}
+
+	ret = true;
+
+pte_unlock:
+	if (force_mkyoung)
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+	kunmap_atomic(kaddr);
+	flush_dcache_page(dst);
+
+	return ret;
 }
 
 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2268,7 +2330,19 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 							   vmf->address);
 		if (!new_page)
 			goto oom;
-		cow_user_page(new_page, old_page, vmf->address, vma);
+
+		if (!cow_user_page(new_page, old_page, vmf)) {
+			/*
+			 * COW failed, if the fault was solved by other,
+			 * it's fine. If not, userspace would re-fault on
+			 * the same address and we will handle the fault
+			 * from the second attempt.
+			 */
+			put_page(new_page);
+			if (old_page)
+				put_page(old_page);
+			return 0;
+		}
 	}
 
 	if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
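As background for the hook introduced above, the decision it drives can be modeled in a few lines of plain, runnable C. Everything below is illustrative; the real arm64 override of arch_faults_on_old_pte() lives in the architecture's pgtable headers and consults the hardware access-flag capability:

  #include <stdbool.h>
  #include <stdio.h>

  /* Mirrors the default added by this patch: assume the fault
   * handler must fix up old PTEs in software. */
  static bool arch_faults_on_old_pte(void)
  {
          return true;
  }

  int main(void)
  {
          bool pte_young = false; /* PTE_AF clear, as in the reported bug */

          if (arch_faults_on_old_pte() && !pte_young) {
                  /* models pte_mkyoung() + ptep_set_access_flags() */
                  pte_young = true;
                  puts("marked pte young before the in-kernel copy");
          }
          printf("copy %s\n", pte_young ? "succeeds" : "would zero-fill");
          return 0;
  }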
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.149 commit 08f4fc4c8543a5b94299664eec3b6fafadac3ed0
--------------------------------
[ Upstream commit 9ed498c6280a2f2b51d02df96df53037272ede49 ]
sk->sk_backlog.tail might be read without holding the socket spinlock, so we need to add proper READ_ONCE()/WRITE_ONCE() annotations to silence the warnings.
KCSAN reported:
BUG: KCSAN: data-race in tcp_add_backlog / tcp_recvmsg
write to 0xffff8881265109f8 of 8 bytes by interrupt on cpu 1:
 __sk_add_backlog include/net/sock.h:907 [inline]
 sk_add_backlog include/net/sock.h:938 [inline]
 tcp_add_backlog+0x476/0xce0 net/ipv4/tcp_ipv4.c:1759
 tcp_v4_rcv+0x1a70/0x1bd0 net/ipv4/tcp_ipv4.c:1947
 ip_protocol_deliver_rcu+0x4d/0x420 net/ipv4/ip_input.c:204
 ip_local_deliver_finish+0x110/0x140 net/ipv4/ip_input.c:231
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_local_deliver+0x133/0x210 net/ipv4/ip_input.c:252
 dst_input include/net/dst.h:442 [inline]
 ip_rcv_finish+0x121/0x160 net/ipv4/ip_input.c:413
 NF_HOOK include/linux/netfilter.h:305 [inline]
 NF_HOOK include/linux/netfilter.h:299 [inline]
 ip_rcv+0x18f/0x1a0 net/ipv4/ip_input.c:523
 __netif_receive_skb_one_core+0xa7/0xe0 net/core/dev.c:4929
 __netif_receive_skb+0x37/0xf0 net/core/dev.c:5043
 netif_receive_skb_internal+0x59/0x190 net/core/dev.c:5133
 napi_skb_finish net/core/dev.c:5596 [inline]
 napi_gro_receive+0x28f/0x330 net/core/dev.c:5629
 receive_buf+0x284/0x30b0 drivers/net/virtio_net.c:1061
 virtnet_receive drivers/net/virtio_net.c:1323 [inline]
 virtnet_poll+0x436/0x7d0 drivers/net/virtio_net.c:1428
 napi_poll net/core/dev.c:6311 [inline]
 net_rx_action+0x3ae/0xa90 net/core/dev.c:6379
 __do_softirq+0x115/0x33f kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0xbb/0xe0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 do_IRQ+0xa6/0x180 arch/x86/kernel/irq.c:263
 ret_from_intr+0x0/0x19
 native_safe_halt+0xe/0x10 arch/x86/kernel/paravirt.c:71
 arch_cpu_idle+0x1f/0x30 arch/x86/kernel/process.c:571
 default_idle_call+0x1e/0x40 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x1af/0x280 kernel/sched/idle.c:263
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:355
 start_secondary+0x208/0x260 arch/x86/kernel/smpboot.c:264
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:241
read to 0xffff8881265109f8 of 8 bytes by task 8057 on cpu 0:
 tcp_recvmsg+0x46e/0x1b40 net/ipv4/tcp.c:2050
 inet_recvmsg+0xbb/0x250 net/ipv4/af_inet.c:838
 sock_recvmsg_nosec net/socket.c:871 [inline]
 sock_recvmsg net/socket.c:889 [inline]
 sock_recvmsg+0x92/0xb0 net/socket.c:885
 sock_read_iter+0x15f/0x1e0 net/socket.c:967
 call_read_iter include/linux/fs.h:1889 [inline]
 new_sync_read+0x389/0x4f0 fs/read_write.c:414
 __vfs_read+0xb1/0xc0 fs/read_write.c:427
 vfs_read fs/read_write.c:461 [inline]
 vfs_read+0x143/0x2c0 fs/read_write.c:446
 ksys_read+0xd5/0x1b0 fs/read_write.c:587
 __do_sys_read fs/read_write.c:597 [inline]
 __se_sys_read fs/read_write.c:595 [inline]
 __x64_sys_read+0x4c/0x60 fs/read_write.c:595
 do_syscall_64+0xcc/0x370 arch/x86/entry/common.c:290
 entry_SYSCALL_64_after_hwframe+0x44/0xa9
Reported by Kernel Concurrency Sanitizer on:
CPU: 0 PID: 8057 Comm: syz-fuzzer Not tainted 5.4.0-rc6+ #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Signed-off-by: Eric Dumazet edumazet@google.com
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/crypto/chelsio/chtls/chtls_io.c | 10 +++++-----
 include/net/sock.h                      |  4 ++--
 net/ipv4/tcp.c                          |  2 +-
 net/llc/af_llc.c                        |  2 +-
 4 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/drivers/crypto/chelsio/chtls/chtls_io.c b/drivers/crypto/chelsio/chtls/chtls_io.c
index 1f3e6000a106..61b309c7b2fc 100644
--- a/drivers/crypto/chelsio/chtls/chtls_io.c
+++ b/drivers/crypto/chelsio/chtls/chtls_io.c
@@ -1445,7 +1445,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				    csk->wr_max_credits))
 			sk->sk_write_space(sk);
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
@@ -1478,7 +1478,7 @@ static int chtls_pt_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				break;
 			}
 		}
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			release_sock(sk);
 			lock_sock(sk);
 			chtls_cleanup_rbuf(sk, copied);
@@ -1623,7 +1623,7 @@ static int peekmsg(struct sock *sk, struct msghdr *msg,
 			break;
 		}
 
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			/* Do not sleep, just process backlog. */
 			release_sock(sk);
 			lock_sock(sk);
@@ -1755,7 +1755,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 				    csk->wr_max_credits))
 			sk->sk_write_space(sk);
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
@@ -1786,7 +1786,7 @@ int chtls_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 			}
 		}
 
-		if (sk->sk_backlog.tail) {
+		if (READ_ONCE(sk->sk_backlog.tail)) {
 			release_sock(sk);
 			lock_sock(sk);
 			chtls_cleanup_rbuf(sk, copied);
diff --git a/include/net/sock.h b/include/net/sock.h
index b5fb4d367406..0a26b42a7b2f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -918,11 +918,11 @@ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb)
 	skb_dst_force(skb);
 
 	if (!sk->sk_backlog.tail)
-		sk->sk_backlog.head = skb;
+		WRITE_ONCE(sk->sk_backlog.head, skb);
 	else
 		sk->sk_backlog.tail->next = skb;
 
-	sk->sk_backlog.tail = skb;
+	WRITE_ONCE(sk->sk_backlog.tail, skb);
 	skb->next = NULL;
 }
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 616ff2970f4f..4ce3397e6fcf 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -2038,7 +2038,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 
 		/* Well, if we have backlog, try to process it now yet. */
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 5abb7f9b7ee5..fa0f3c1543ba 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -784,7 +784,7 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
 		}
 		/* Well, if we have backlog, try to process it now yet. */
 
-		if (copied >= target && !sk->sk_backlog.tail)
+		if (copied >= target && !READ_ONCE(sk->sk_backlog.tail))
 			break;
 
 		if (copied) {
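The annotation pattern this patch applies can be seen in a tiny self-contained model. The READ_ONCE()/WRITE_ONCE() macros below are simplified volatile-cast rebuilds for demonstration, not the kernel's exact definitions:

  #include <stdio.h>

  #define READ_ONCE(x)       (*(const volatile __typeof__(x) *)&(x))
  #define WRITE_ONCE(x, val) (*(volatile __typeof__(x) *)&(x) = (val))

  struct backlog {
          void *head;
          void *tail;
  };

  int main(void)
  {
          struct backlog b = { 0 };
          int pkt;                        /* stand-in for an skb */

          WRITE_ONCE(b.tail, &pkt);       /* writer: single, untorn store */
          if (READ_ONCE(b.tail))          /* reader: single, untorn load  */
                  puts("backlog non-empty");
          return 0;
  }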
From: Al Viro viro@zeniv.linux.org.uk
stable inclusion from linux-4.19.149 commit 98e151baabddf8f26212739ad4f194591a1b22f6
--------------------------------
[ Upstream commit e84009336711d2bba885fc9cea66348ddfce3758 ]
We are overoptimistic about taking the fast path there; seeing the same value in ->d_parent after having grabbed a reference to that parent does *not* mean that it has remained our parent all along.
That wouldn't be a big deal (in the end it is our parent and we have grabbed the reference we are about to return), but... the situation with barriers is messed up.
We might have hit the following sequence:
d is a dentry of /tmp/a/b

CPU1:                                   CPU2:
parent = d->d_parent
  (i.e. dentry of /tmp/a)
                                        rename /tmp/a/b to /tmp/b
                                        rmdir /tmp/a, making its dentry negative
grab reference to parent,
end up with cached parent->d_inode (NULL)
                                        mkdir /tmp/a, rename /tmp/b to /tmp/a/b
recheck d->d_parent, which is back to original
decide that everything's fine and return
the reference we'd got.
The trouble is, caller (on CPU1) will observe dget_parent() returning an apparently negative dentry. It actually is positive, but CPU1 has stale ->d_inode cached.
Use d->d_seq to see if it has been moved instead of rechecking ->d_parent. NOTE: we are *NOT* going to retry on any kind of ->d_seq mismatch; we just go into the slow path in such case. We don't wait for ->d_seq to become even either - again, if we are racing with renames, we can bloody well go to slow path anyway.
Signed-off-by: Al Viro viro@zeniv.linux.org.uk
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 fs/dcache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 997c1efd5906..ef1641398eda 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -905,17 +905,19 @@ struct dentry *dget_parent(struct dentry *dentry)
 {
 	int gotref;
 	struct dentry *ret;
+	unsigned seq;
 
 	/*
 	 * Do optimistic parent lookup without any
 	 * locking.
 	 */
 	rcu_read_lock();
+	seq = raw_seqcount_begin(&dentry->d_seq);
 	ret = READ_ONCE(dentry->d_parent);
 	gotref = lockref_get_not_zero(&ret->d_lockref);
 	rcu_read_unlock();
 	if (likely(gotref)) {
-		if (likely(ret == READ_ONCE(dentry->d_parent)))
+		if (!read_seqcount_retry(&dentry->d_seq, seq))
 			return ret;
 		dput(ret);
 	}
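For reference, here is a minimal, runnable userspace sketch of the read-side seqcount pattern the fix switches to; the atomics and names are illustrative stand-ins for raw_seqcount_begin()/read_seqcount_retry():

  #include <stdatomic.h>
  #include <stdio.h>

  static _Atomic unsigned int seq;  /* bumped around every rename */
  static int parent = 1;            /* stand-in for d->d_parent */

  int main(void)
  {
          /* like raw_seqcount_begin(): mask off the low bit so an odd
           * (rename-in-progress) count always forces the slow path */
          unsigned int s = atomic_load(&seq) & ~1u;
          int p = parent;           /* speculative read */

          if (atomic_load(&seq) != s)
                  puts("d_seq moved: fall back to the slow path");
          else
                  printf("fast path, parent=%d\n", p);
          return 0;
  }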
From: Brian Foster bfoster@redhat.com
stable inclusion from linux-4.19.149 commit 03ad258887f66a51ece17e6604905616a5647c55
--------------------------------
[ Upstream commit 2a2b5932db67586bacc560cc065d62faece5b996 ]
The leaf format xattr addition helper xfs_attr3_leaf_add_work() adjusts the block freemap in a couple places. The first update drops the size of the freemap that the caller had already selected to place the xattr name/value data. Before the function returns, it also checks whether the entries array has encroached on a freemap range by virtue of the new entry addition. This is necessary because the entries array grows from the start of the block (but end of the block header) towards the end of the block while the name/value data grows from the end of the block in the opposite direction. If the associated freemap is already empty, however, size is zero and the subtraction underflows the field and causes corruption.
This is reproduced rarely by generic/070. The observed behavior is that a smaller sized freemap is aligned to the end of the entries list, several subsequent xattr additions land in larger freemaps and the entries list expands into the smaller freemap until it is fully consumed and then underflows. Note that it is not otherwise a corruption for the entries array to consume an empty freemap because the nameval list (i.e. the firstused pointer in the xattr header) starts beyond the end of the corrupted freemap.
Update the freemap size modification to account for the fact that the freemap entry can be empty and thus stale.
Signed-off-by: Brian Foster bfoster@redhat.com
Reviewed-by: Darrick J. Wong darrick.wong@oracle.com
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 fs/xfs/libxfs/xfs_attr_leaf.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 39df2c5425fa..880fadb19fc0 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -1440,7 +1440,9 @@ xfs_attr3_leaf_add_work(
 	for (i = 0; i < XFS_ATTR_LEAF_MAPSIZE; i++) {
 		if (ichdr->freemap[i].base == tmp) {
 			ichdr->freemap[i].base += sizeof(xfs_attr_leaf_entry_t);
-			ichdr->freemap[i].size -= sizeof(xfs_attr_leaf_entry_t);
+			ichdr->freemap[i].size -=
+				min_t(uint16_t, ichdr->freemap[i].size,
+				      sizeof(xfs_attr_leaf_entry_t));
 		}
 	}
 	ichdr->usedbytes += xfs_attr_leaf_entsize(leaf, args->index);
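The underflow is easy to show in isolation. The sketch below mirrors the clamped subtraction; min_t is redefined locally for the demo:

  #include <stdint.h>
  #include <stdio.h>

  #define min_t(type, a, b) ((type)(a) < (type)(b) ? (type)(a) : (type)(b))

  int main(void)
  {
          uint16_t size  = 0;   /* stale, already-empty freemap entry */
          uint16_t entsz = 8;   /* sizeof(xfs_attr_leaf_entry_t), say */

          uint16_t bad  = size - entsz;                         /* wraps to 65528 */
          uint16_t good = size - min_t(uint16_t, size, entsz);  /* stays 0 */

          printf("unclamped=%u clamped=%u\n", bad, good);
          return 0;
  }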
From: Joe Perches joe@perches.com
stable inclusion from linux-4.19.149 commit 489cee9124d53ce20fb3f989449efa2aa51a4b18
--------------------------------
[ Upstream commit 5e1aada08cd19ea652b2d32a250501d09b02ff2e ]
Initialization is not guaranteed to zero padding bytes so use an explicit memset instead to avoid leaking any kernel content in any possible padding bytes.
Link: http://lkml.kernel.org/r/dfa331c00881d61c8ee51577a082d8bebd61805c.camel@perc...
Signed-off-by: Joe Perches joe@perches.com
Cc: Dan Carpenter error27@gmail.com
Cc: Julia Lawall julia.lawall@lip6.fr
Cc: Thomas Gleixner tglx@linutronix.de
Cc: Kees Cook keescook@chromium.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/sys.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/sys.c b/kernel/sys.c
index 827289a34776..55e142077dcf 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1275,11 +1275,13 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
 
 SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
 {
-	struct oldold_utsname tmp = {};
+	struct oldold_utsname tmp;
 
 	if (!name)
 		return -EFAULT;
 
+	memset(&tmp, 0, sizeof(tmp));
+
 	down_read(&uts_sem);
 	memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
 	memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
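Why the explicit memset is preferred can be demonstrated with a standalone struct; whether padding actually follows 'c' is ABI-dependent, so the layout here is only illustrative:

  #include <stdio.h>
  #include <string.h>

  struct demo {
          char c;   /* on common ABIs, padding bytes follow */
          int  i;
  };

  int main(void)
  {
          struct demo d;

          /* memset covers every byte, padding included, so copying the
           * whole struct to user space cannot leak stale stack bytes */
          memset(&d, 0, sizeof(d));
          d.c = 'x';
          d.i = 42;
          printf("%zu bytes fully initialized\n", sizeof(d));
          return 0;
  }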
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.149 commit 576f57da9107056935364824ecd2d78a07d542e6
--------------------------------
[ Upstream commit b1de6fc7520fe12949c070af0e8c0e4044cd3420 ]
Omar Sandoval reported that a 4G fallocate on the realtime device causes filesystem shutdowns due to a log reservation overflow that happens when we log the rtbitmap updates. Factor the rtbitmap/rtsummary updates into the tr_write and tr_itruncate log reservation calculations.
"The following reproducer results in a transaction log overrun warning for me:
mkfs.xfs -f -r rtdev=/dev/vdc -d rtinherit=1 -m reflink=0 /dev/vdb
mount -o rtdev=/dev/vdc /dev/vdb /mnt
fallocate -l 4G /mnt/foo"
Reported-by: Omar Sandoval osandov@osandov.com
Tested-by: Omar Sandoval osandov@osandov.com
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com
Reviewed-by: Brian Foster bfoster@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 fs/xfs/libxfs/xfs_trans_resv.c | 96 +++++++++++++++++++++++++++-------
 1 file changed, 77 insertions(+), 19 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index f99a7aefe418..2b3cc5a8ced1 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -197,6 +197,24 @@ xfs_calc_inode_chunk_res(
 	return res;
 }
 
+/*
+ * Per-extent log reservation for the btree changes involved in freeing or
+ * allocating a realtime extent.  We have to be able to log as many rtbitmap
+ * blocks as needed to mark inuse MAXEXTLEN blocks' worth of realtime extents,
+ * as well as the realtime summary block.
+ */
+unsigned int
+xfs_rtalloc_log_count(
+	struct xfs_mount	*mp,
+	unsigned int		num_ops)
+{
+	unsigned int		blksz = XFS_FSB_TO_B(mp, 1);
+	unsigned int		rtbmp_bytes;
+
+	rtbmp_bytes = (MAXEXTLEN / mp->m_sb.sb_rextsize) / NBBY;
+	return (howmany(rtbmp_bytes, blksz) + 1) * num_ops;
+}
+
 /*
  * Various log reservation values.
  *
@@ -219,13 +237,21 @@ xfs_calc_inode_chunk_res(
 
 /*
  * In a write transaction we can allocate a maximum of 2
- * extents.  This gives:
+ * extents.  This gives (t1):
  *    the inode getting the new extents: inode size
  *    the inode's bmap btree: max depth * block size
 *    the agfs of the ags from which the extents are allocated: 2 * sector
 *    the superblock free block counter: sector size
 *    the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
- * And the bmap_finish transaction can free bmap blocks in a join:
+ * Or, if we're writing to a realtime file (t2):
+ *    the inode getting the new extents: inode size
+ *    the inode's bmap btree: max depth * block size
+ *    the agfs of the ags from which the extents are allocated: 2 * sector
+ *    the superblock free block counter: sector size
+ *    the realtime bitmap: ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ *    the realtime summary: 1 block
+ *    the allocation btrees: 2 trees * (2 * max depth - 1) * block size
+ * And the bmap_finish transaction can free bmap blocks in a join (t3):
 *    the agfs of the ags containing the blocks: 2 * sector size
 *    the agfls of the ags containing the blocks: 2 * sector size
 *    the super block free block counter: sector size
@@ -235,40 +261,72 @@ STATIC uint
 xfs_calc_write_reservation(
 	struct xfs_mount	*mp)
 {
-	return XFS_DQUOT_LOGRES(mp) +
-		max((xfs_calc_inode_res(mp, 1) +
+	unsigned int		t1, t2, t3;
+	unsigned int		blksz = XFS_FSB_TO_B(mp, 1);
+
+	t1 = xfs_calc_inode_res(mp, 1) +
+	     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), blksz) +
+	     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
+	     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+
+	if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+		t2 = xfs_calc_inode_res(mp, 1) +
 		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK),
-				     XFS_FSB_TO_B(mp, 1)) +
+				      blksz) +
 		     xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2),
-				      XFS_FSB_TO_B(mp, 1))));
+		     xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 1), blksz) +
+		     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 1), blksz);
+	} else {
+		t2 = 0;
+	}
+
+	t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+	     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+
+	return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
 }
 
 /*
- * In truncating a file we free up to two extents at once.  We can modify:
+ * In truncating a file we free up to two extents at once.  We can modify (t1):
 *    the inode being truncated: inode size
 *    the inode's bmap btree: (max depth + 1) * block size
- * And the bmap_finish transaction can free the blocks and bmap blocks:
+ * And the bmap_finish transaction can free the blocks and bmap blocks (t2):
 *    the agf for each of the ags: 4 * sector size
 *    the agfl for each of the ags: 4 * sector size
 *    the super block to reflect the freed blocks: sector size
 *    worst case split in allocation btrees per extent assuming 4 extents:
 *		4 exts * 2 trees * (2 * max depth - 1) * block size
+ * Or, if it's a realtime file (t3):
+ *    the agf for each of the ags: 2 * sector size
+ *    the agfl for each of the ags: 2 * sector size
+ *    the super block to reflect the freed blocks: sector size
+ *    the realtime bitmap: 2 exts * ((MAXEXTLEN / rtextsize) / NBBY) bytes
+ *    the realtime summary: 2 exts * 1 block
+ *    worst case split in allocation btrees per extent assuming 2 extents:
+ *		2 exts * 2 trees * (2 * max depth - 1) * block size
 */
 STATIC uint
 xfs_calc_itruncate_reservation(
 	struct xfs_mount	*mp)
 {
-	return XFS_DQUOT_LOGRES(mp) +
-		max((xfs_calc_inode_res(mp, 1) +
-		     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1,
-				      XFS_FSB_TO_B(mp, 1))),
-		    (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
-		     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4),
-				      XFS_FSB_TO_B(mp, 1))));
+	unsigned int		t1, t2, t3;
+	unsigned int		blksz = XFS_FSB_TO_B(mp, 1);
+
+	t1 = xfs_calc_inode_res(mp, 1) +
+	     xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz);
+
+	t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) +
+	     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 4), blksz);
+
+	if (xfs_sb_version_hasrealtime(&mp->m_sb)) {
+		t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) +
+		     xfs_calc_buf_res(xfs_rtalloc_log_count(mp, 2), blksz) +
+		     xfs_calc_buf_res(xfs_allocfree_log_count(mp, 2), blksz);
+	} else {
+		t3 = 0;
+	}
+
+	return XFS_DQUOT_LOGRES(mp) + max3(t1, t2, t3);
 }
/*
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.149 commit f2cd82a26fcf477d593c546d8a84fc7881ceaf27
--------------------------------
[ Upstream commit 1e3f9f073c47bee7c23e77316b07bc12338c5bba ]
If a seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
https://bugzilla.kernel.org/show_bug.cgi?id=206283

Signed-off-by: Vasily Averin vvs@virtuozzo.com
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/core/neighbour.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index bf738ec68cb5..6e890f51b7d8 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -2844,6 +2844,7 @@ static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		*pos = cpu+1;
 		return per_cpu_ptr(tbl->stats, cpu);
 	}
+	(*pos)++;
 	return NULL;
 }
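The contract being restored here (and in the two patches that follow), as a runnable model: a .next-style iterator must advance the position index even on the NULL return that ends the sequence. The names below are illustrative, not the seq_file API:

  #include <stdio.h>

  static int data[3] = { 10, 20, 30 };

  static void *demo_next(void *v, long *pos)
  {
          ++(*pos);            /* unconditionally, as in the fix */
          if (*pos < 3)
                  return &data[*pos];
          return NULL;         /* end of sequence, index still moved */
  }

  int main(void)
  {
          long pos = 0;

          for (void *v = &data[0]; v; v = demo_next(v, &pos))
                  printf("pos=%ld val=%d\n", pos, *(int *)v);
          return 0;
  }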
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.149 commit e10f2f29ba39f70d657f563d332405ea8f0cb349
--------------------------------
[ Upstream commit a3ea86739f1bc7e121d921842f0f4a8ab1af94d9 ]
If a seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
https://bugzilla.kernel.org/show_bug.cgi?id=206283

Signed-off-by: Vasily Averin vvs@virtuozzo.com
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/ipv4/route.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index b033fdd6437e..758acec80389 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -274,6 +274,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 		*pos = cpu+1;
 		return &per_cpu(rt_cache_stat, cpu);
 	}
+	(*pos)++;
 	return NULL;
 
 }
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.149 commit 24cb471708d57962df50de38547dac45f0879f76
--------------------------------
[ Upstream commit 4fc427e0515811250647d44de38d87d7b0e0790f ]
If a seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
https://bugzilla.kernel.org/show_bug.cgi?id=206283

Signed-off-by: Vasily Averin vvs@virtuozzo.com
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/ipv6/ip6_fib.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 05a206202e23..b924941b96a3 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -2377,14 +2377,13 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	struct net *net = seq_file_net(seq);
 	struct ipv6_route_iter *iter = seq->private;
 
+	++(*pos);
 	if (!v)
 		goto iter_table;
 
 	n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
-	if (n) {
-		++*pos;
+	if (n)
 		return n;
-	}
 
 iter_table:
 	ipv6_route_check_sernum(iter);
@@ -2392,8 +2391,6 @@ static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 	r = fib6_walk_continue(&iter->w);
 	spin_unlock_bh(&iter->tbl->tb6_lock);
 	if (r > 0) {
-		if (v)
-			++*pos;
 		return iter->w.leaf;
 	} else if (r < 0) {
 		fib6_walker_unlink(net, &iter->w);
From: Kevin Kou qdkevin.kou@gmail.com
stable inclusion from linux-4.19.149 commit 479468bef2fa4845cd894ad352181b619195fe70
--------------------------------
[ Upstream commit f643ee295c1c63bc117fb052d4da681354d6f732 ]
The original patch that brought in the "SCTP ACK tracking trace event" feature was committed on Dec 20, 2017. It replaced the jprobe usage with trace events and introduced two of them: TRACE_EVENT(sctp_probe) and TRACE_EVENT(sctp_probe_path). The original patch intended to trigger trace_sctp_probe_path inside TRACE_EVENT(sctp_probe), as in the code below,
+TRACE_EVENT(sctp_probe,
+
+	TP_PROTO(const struct sctp_endpoint *ep,
+		 const struct sctp_association *asoc,
+		 struct sctp_chunk *chunk),
+
+	TP_ARGS(ep, asoc, chunk),
+
+	TP_STRUCT__entry(
+		__field(__u64, asoc)
+		__field(__u32, mark)
+		__field(__u16, bind_port)
+		__field(__u16, peer_port)
+		__field(__u32, pathmtu)
+		__field(__u32, rwnd)
+		__field(__u16, unack_data)
+	),
+
+	TP_fast_assign(
+		struct sk_buff *skb = chunk->skb;
+
+		__entry->asoc = (unsigned long)asoc;
+		__entry->mark = skb->mark;
+		__entry->bind_port = ep->base.bind_addr.port;
+		__entry->peer_port = asoc->peer.port;
+		__entry->pathmtu = asoc->pathmtu;
+		__entry->rwnd = asoc->peer.rwnd;
+		__entry->unack_data = asoc->unack_data;
+
+		if (trace_sctp_probe_path_enabled()) {
+			struct sctp_transport *sp;
+
+			list_for_each_entry(sp, &asoc->peer.transport_addr_list,
+					    transports) {
+				trace_sctp_probe_path(sp, asoc);
+			}
+		}
+	),
But when I tested it, trace_sctp_probe_path produced no output. I finally found that there is a trace buffer lock operation (trace_event_buffer_reserve) in include/trace/trace_events.h:
static notrace void							\
trace_event_raw_event_##call(void *__data, proto)			\
{									\
	struct trace_event_file *trace_file = __data;			\
	struct trace_event_data_offsets_##call __maybe_unused __data_offsets;\
	struct trace_event_buffer fbuffer;				\
	struct trace_event_raw_##call *entry;				\
	int __data_size;						\
									\
	if (trace_trigger_soft_disabled(trace_file))			\
		return;							\
									\
	__data_size = trace_event_get_offsets_##call(&__data_offsets, args); \
									\
	entry = trace_event_buffer_reserve(&fbuffer, trace_file,	\
				 sizeof(*entry) + __data_size);		\
									\
	if (!entry)							\
		return;							\
									\
	tstruct								\
									\
	{ assign; }							\
									\
	trace_event_buffer_commit(&fbuffer);				\
}
The reason trace_sctp_probe_path produces no output is that it is written in the TP_fast_assign part of TRACE_EVENT(sctp_probe), so it is placed (as { assign; }) after the trace_event_buffer_reserve() when the compiler expands the macro:
	entry = trace_event_buffer_reserve(&fbuffer, trace_file,	\
				 sizeof(*entry) + __data_size);		\
									\
	if (!entry)							\
		return;							\
									\
	tstruct								\
									\
	{ assign; }							\
so trace_sctp_probe_path finally cannot acquire the trace_event_buffer and returns no output; that is to say, nesting of tracepoint entry functions is not allowed. The function call flow is:
trace_sctp_probe()
  -> trace_event_raw_event_sctp_probe()
    -> lock buffer
      -> trace_sctp_probe_path()
        -> trace_event_raw_event_sctp_probe_path()  -- nested
          -> buffer has been locked and return no output.
This patch removes trace_sctp_probe_path from the TP_fast_assign part of TRACE_EVENT(sctp_probe) to avoid nesting the entry functions, and instead triggers trace_sctp_probe_path in sctp_outq_sack.
After this patch, you can enable both events individually:

  # cd /sys/kernel/debug/tracing
  # echo 1 > events/sctp/sctp_probe/enable
  # echo 1 > events/sctp/sctp_probe_path/enable
Or, you can enable all the events under sctp.
# echo 1 > events/sctp/enable
Signed-off-by: Kevin Kou qdkevin.kou@gmail.com
Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/trace/events/sctp.h | 9 ---------
 net/sctp/outqueue.c         | 6 ++++++
 2 files changed, 6 insertions(+), 9 deletions(-)
diff --git a/include/trace/events/sctp.h b/include/trace/events/sctp.h
index 7475c7be165a..d4aac3436595 100644
--- a/include/trace/events/sctp.h
+++ b/include/trace/events/sctp.h
@@ -75,15 +75,6 @@ TRACE_EVENT(sctp_probe,
 		__entry->pathmtu = asoc->pathmtu;
 		__entry->rwnd = asoc->peer.rwnd;
 		__entry->unack_data = asoc->unack_data;
-
-		if (trace_sctp_probe_path_enabled()) {
-			struct sctp_transport *sp;
-
-			list_for_each_entry(sp, &asoc->peer.transport_addr_list,
-					    transports) {
-				trace_sctp_probe_path(sp, asoc);
-			}
-		}
 	),
 
 	TP_printk("asoc=%#llx mark=%#x bind_port=%d peer_port=%d pathmtu=%d "
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7bb8e5603298..d6e83a37a1ad 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -51,6 +51,7 @@
 #include <net/sctp/sctp.h>
 #include <net/sctp/sm.h>
 #include <net/sctp/stream_sched.h>
+#include <trace/events/sctp.h>
 
 /* Declare internal functions here. */
 static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn);
@@ -1257,6 +1258,11 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
 	/* Grab the association's destination address list. */
 	transport_list = &asoc->peer.transport_addr_list;
 
+	/* SCTP path tracepoint for congestion control debugging. */
+	list_for_each_entry(transport, transport_list, transports) {
+		trace_sctp_probe_path(transport, asoc);
+	}
+
 	sack_ctsn = ntohl(sack->cum_tsn_ack);
 	gap_ack_blocks = ntohs(sack->num_gap_ack_blocks);
 	asoc->stats.gapcnt += gap_ack_blocks;
From: "Rafael J. Wysocki" rafael.j.wysocki@intel.com
stable inclusion from linux-4.19.149 commit 4913d773d113b1f61620baeadaa9d8ef3a4400c1
--------------------------------
[ Upstream commit 3df663a147fe077a6ee8444ec626738946e65547 ]
There is a race condition in acpi_ec_get_query_handler() that theoretically allows query handlers to go away before they are reference-counted.
In order to avoid it, call kref_get() on query handlers under ec->mutex.
Also simplify the code a bit while at it.
Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/acpi/ec.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)
diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c
index 49e16f009095..9415a0041aaf 100644
--- a/drivers/acpi/ec.c
+++ b/drivers/acpi/ec.c
@@ -1080,29 +1080,21 @@ void acpi_ec_dispatch_gpe(void)
 /* --------------------------------------------------------------------------
                                 Event Management
    -------------------------------------------------------------------------- */
-static struct acpi_ec_query_handler *
-acpi_ec_get_query_handler(struct acpi_ec_query_handler *handler)
-{
-	if (handler)
-		kref_get(&handler->kref);
-	return handler;
-}
-
 static struct acpi_ec_query_handler *
 acpi_ec_get_query_handler_by_value(struct acpi_ec *ec, u8 value)
 {
 	struct acpi_ec_query_handler *handler;
-	bool found = false;
 
 	mutex_lock(&ec->mutex);
 	list_for_each_entry(handler, &ec->list, node) {
 		if (value == handler->query_bit) {
-			found = true;
-			break;
+			kref_get(&handler->kref);
+			mutex_unlock(&ec->mutex);
+			return handler;
 		}
 	}
 	mutex_unlock(&ec->mutex);
-	return found ? acpi_ec_get_query_handler(handler) : NULL;
+	return NULL;
 }
 
 static void acpi_ec_query_handler_release(struct kref *kref)
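The shape of the fix, modeled with a pthread mutex and a plain counter standing in for the kref (illustrative names, not the kernel API):

  #include <pthread.h>
  #include <stdio.h>

  struct handler {
          int refs;
  };

  static struct handler h = { .refs = 1 };
  static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

  static struct handler *get_handler(void)
  {
          struct handler *found;

          pthread_mutex_lock(&list_lock);
          found = &h;          /* list walk elided */
          found->refs++;       /* kref_get() while still on the list */
          pthread_mutex_unlock(&list_lock);
          return found;        /* pinned before the lock was dropped */
  }

  int main(void)
  {
          printf("refs=%d\n", get_handler()->refs);
          return 0;
  }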
From: Josef Bacik jbacik@fb.com
stable inclusion from linux-4.19.149 commit 72913876dc5fe6ae97963b5674c1797b19f6efcd
--------------------------------
[ Upstream commit cbc3b92ce037f5e7536f6db157d185cd8b8f615c ]
I noticed when trying to use the trace-cmd python interface that reading the raw buffer wasn't working for kernel_stack events. This is because it uses a stubbed version of __dynamic_array that doesn't do the __data_loc trick and encode the length of the array into the field. Instead it just shows up as a size of 0. So change this to __array and set the len to FTRACE_STACK_ENTRIES since this is what we actually do in practice and matches how user_stack_trace works.
Link: http://lkml.kernel.org/r/1411589652-1318-1-git-send-email-jbacik@fb.com
Signed-off-by: Josef Bacik jbacik@fb.com
[ Pulled from the archeological digging of my INBOX ]
Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/trace/trace_entries.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 06bb2fd9a56c..a97aad105d36 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -179,7 +179,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
 
 	F_STRUCT(
 		__field(	int,		size	)
-		__dynamic_array(unsigned long,	caller	)
+		__array(	unsigned long,	caller,	FTRACE_STACK_ENTRIES	)
 	),
 
 	F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.149 commit 52f5a09ab7583ed497fc4b331311d71b7d8a6e12
--------------------------------
[ Upstream commit 10c8d69f314d557d94d74ec492575ae6a4f1eb1c ]
If a seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
In Aug 2018, NeilBrown noticed this in commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and interface"): "Some ->next functions do not increment *pos when they return NULL... Note that such ->next functions are buggy and should be fixed. A simple demonstration is
dd if=/proc/swaps bs=1000 skip=1
Choose any block size larger than the size of /proc/swaps. This will always show the whole last line of /proc/swaps"
The described problem is still present. If you lseek into the middle of the last output line, the following read will output the end of the last line and then the whole last line once again.
$ dd if=/proc/swaps bs=1  # usual output
Filename        Type            Size    Used    Priority
/dev/dm-0       partition       4194812 97536   -2
104+0 records in
104+0 records out
104 bytes copied

$ dd if=/proc/swaps bs=40 skip=1  # last line was generated twice
dd: /proc/swaps: cannot skip to specified offset
v/dm-0          partition       4194812 97536   -2
/dev/dm-0       partition       4194812 97536   -2
3+1 records in
3+1 records out
131 bytes copied
https://bugzilla.kernel.org/show_bug.cgi?id=206283
Link: http://lkml.kernel.org/r/bd8cfd7b-ac95-9b91-f9e7-e8438bd5047d@virtuozzo.com
Signed-off-by: Vasily Averin vvs@virtuozzo.com
Reviewed-by: Andrew Morton akpm@linux-foundation.org
Cc: Jann Horn jannh@google.com
Cc: Alexander Viro viro@zeniv.linux.org.uk
Cc: Kees Cook keescook@chromium.org
Cc: Hugh Dickins hughd@google.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/swapfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 41400ae4e7c1..765fca3451d2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2843,10 +2843,10 @@ static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
 	else
 		type = si->type + 1;
 
+	++(*pos);
 	for (; (si = swap_type_to_swap_info(type)); type++) {
 		if (!(si->flags & SWP_USED) || !si->swap_map)
 			continue;
-		++*pos;
 		return si;
 	}
From: Qian Cai cai@lca.pw
stable inclusion from linux-4.19.149 commit 1fea0234984c39748386a8b2eebaf8a1561e3075
--------------------------------
[ Upstream commit 86b18aaa2b5b5bb48e609cd591b3d2d0fdbe0442 ]
sk_buff.qlen can be accessed concurrently as noticed by KCSAN,
BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_dgram_sendmsg
read to 0xffff8a1b1d8a81c0 of 4 bytes by task 5371 on cpu 96:
 unix_dgram_sendmsg+0x9a9/0xb70 include/linux/skbuff.h:1821 net/unix/af_unix.c:1761
 ____sys_sendmsg+0x33e/0x370
 ___sys_sendmsg+0xa6/0xf0
 __sys_sendmsg+0x69/0xf0
 __x64_sys_sendmsg+0x51/0x70
 do_syscall_64+0x91/0xb47
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
write to 0xffff8a1b1d8a81c0 of 4 bytes by task 1 on cpu 99:
 __skb_try_recv_from_queue+0x327/0x410 include/linux/skbuff.h:2029
 __skb_try_recv_datagram+0xbe/0x220
 unix_dgram_recvmsg+0xee/0x850
 ____sys_recvmsg+0x1fb/0x210
 ___sys_recvmsg+0xa2/0xf0
 __sys_recvmsg+0x66/0xf0
 __x64_sys_recvmsg+0x51/0x70
 do_syscall_64+0x91/0xb47
 entry_SYSCALL_64_after_hwframe+0x49/0xbe
Since only the read is operating as lockless, it could introduce a logic bug in unix_recvq_full() due to the load tearing. Fix it by adding a lockless variant of skb_queue_len() and unix_recvq_full() where READ_ONCE() is on the read while WRITE_ONCE() is on the write similar to the commit d7d16a89350a ("net: add skb_queue_empty_lockless()").
Signed-off-by: Qian Cai cai@lca.pw
Signed-off-by: David S. Miller davem@davemloft.net
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 include/linux/skbuff.h | 14 +++++++++++++-
 net/unix/af_unix.c     | 11 +++++++++--
 2 files changed, 22 insertions(+), 3 deletions(-)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index cbc0294f3989..703ce71caeac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1688,6 +1688,18 @@ static inline __u32 skb_queue_len(const struct sk_buff_head *list_)
 	return list_->qlen;
 }
 
+/**
+ *	skb_queue_len_lockless	- get queue length
+ *	@list_: list to measure
+ *
+ *	Return the length of an &sk_buff queue.
+ *	This variant can be used in lockless contexts.
+ */
+static inline __u32 skb_queue_len_lockless(const struct sk_buff_head *list_)
+{
+	return READ_ONCE(list_->qlen);
+}
+
 /**
  *	__skb_queue_head_init - initialize non-spinlock portions of sk_buff_head
  *	@list: queue to initialize
@@ -1895,7 +1907,7 @@ static inline void __skb_unlink(struct sk_buff *skb, struct sk_buff_head *list)
 {
 	struct sk_buff *next, *prev;
 
-	list->qlen--;
+	WRITE_ONCE(list->qlen, list->qlen - 1);
 	next = skb->next;
 	prev = skb->prev;
 	skb->next = skb->prev = NULL;
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 2318e2e2748f..2020306468af 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -192,11 +192,17 @@ static inline int unix_may_send(struct sock *sk, struct sock *osk)
 	return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 }
 
-static inline int unix_recvq_full(struct sock const *sk)
+static inline int unix_recvq_full(const struct sock *sk)
 {
 	return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 }
 
+static inline int unix_recvq_full_lockless(const struct sock *sk)
+{
+	return skb_queue_len_lockless(&sk->sk_receive_queue) >
+		READ_ONCE(sk->sk_max_ack_backlog);
+}
+
 struct sock *unix_peer_get(struct sock *s)
 {
 	struct sock *peer;
@@ -1788,7 +1794,8 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
 	 *	- unix_peer(sk) == sk by time of get but disconnected before lock
 	 */
 	if (other != sk &&
-	    unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+	    unlikely(unix_peer(other) != sk &&
+		     unix_recvq_full_lockless(other))) {
 		if (timeo) {
 			timeo = unix_wait_for_peer(other, timeo);
From: Steve Grubb sgrubb@redhat.com
stable inclusion from linux-4.19.149 commit 67fd417f961254a409c2e64e026b9f2b41434c40
--------------------------------
[ Upstream commit 70b3eeed49e8190d97139806f6fbaf8964306cdb ]
Common Criteria calls out for any action that modifies the audit trail to be recorded. That usually is interpreted to mean insertion or removal of rules. It is not required to log modification of the inode information since the watch is still in effect. Additionally, if the rule is a never rule and the underlying file is one they do not want events for, they get an event for this bookkeeping update against their wishes.
Since no device/inode info is logged at insertion and no device/inode information is logged on update, there is nothing meaningful being communicated to the admin by the CONFIG_CHANGE updated_rules event. One can assume that the rule was not "modified" because it is still watching the intended target. If the device or inode cannot be resolved, then audit_panic is called which is sufficient.
The correct resolution is to drop logging config_update events since the watch is still in effect but just on another unknown inode.
Signed-off-by: Steve Grubb sgrubb@redhat.com
Signed-off-by: Paul Moore paul@paul-moore.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/audit_watch.c | 2 --
 1 file changed, 2 deletions(-)
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 4f7262eba73d..50952d6d8120 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -317,8 +317,6 @@ static void audit_update_watch(struct audit_parent *parent,
 			if (oentry->rule.exe)
 				audit_remove_mark(oentry->rule.exe);
 
-			audit_watch_log_rule_change(r, owatch, "updated_rules");
-
 			call_rcu(&oentry->rcu, audit_free_rule_rcu);
 		}
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.149 commit 64e0f9e159fe6b592e0fe26cfc1ce03f79d2a9db
--------------------------------
[ Upstream commit 8d269a8e2a8f0bca89022f4ec98de460acb90365 ]
If a seq_file .next function does not change the position index, a read after some lseek can generate unexpected output.
$ dd if=/sys/fs/selinux/avc/cache_stats  # usual output
lookups hits misses allocations reclaims frees
817223 810034 7189 7189 6992 7037
1934894 1926896 7998 7998 7632 7683
1322812 1317176 5636 5636 5456 5507
1560571 1551548 9023 9023 9056 9115
0+1 records in
0+1 records out
189 bytes copied, 5,1564e-05 s, 3,7 MB/s
$ # read after lseek to the middle of the last line
$ dd if=/sys/fs/selinux/avc/cache_stats bs=180 skip=1
dd: /sys/fs/selinux/avc/cache_stats: cannot skip to specified offset
056 9115                            <<<< end of last line
1560571 1551548 9023 9023 9056 9115 <<<< whole last line once again
0+1 records in
0+1 records out
45 bytes copied, 8,7221e-05 s, 516 kB/s
$ # read after lseek beyond the end of the file
$ dd if=/sys/fs/selinux/avc/cache_stats bs=1000 skip=1
dd: /sys/fs/selinux/avc/cache_stats: cannot skip to specified offset
1560571 1551548 9023 9023 9056 9115 <<<< generates whole last line
0+1 records in
0+1 records out
36 bytes copied, 9,0934e-05 s, 396 kB/s
https://bugzilla.kernel.org/show_bug.cgi?id=206283
Signed-off-by: Vasily Averin vvs@virtuozzo.com
Acked-by: Stephen Smalley sds@tycho.nsa.gov
Signed-off-by: Paul Moore paul@paul-moore.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 security/selinux/selinuxfs.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index f3a5a138a096..60b3f16bb5c7 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1509,6 +1509,7 @@ static struct avc_cache_stats *sel_avc_get_stat_idx(loff_t *idx)
 		*idx = cpu + 1;
 		return &per_cpu(avc_cache_stats, cpu);
 	}
+	(*idx)++;
 	return NULL;
 }
From: Qian Cai cai@lca.pw
stable inclusion from linux-4.19.149 commit dca75ae683c1acc4dde1d1a5b65d3fe00d03013a
--------------------------------
[ Upstream commit e00d996a4317aff5351c4338dd97d390225412c2 ]
Fields in "struct timer_rand_state" could be accessed concurrently. Lockless plain reads and writes result in data races. Fix them by adding pairs of READ|WRITE_ONCE(). The data races were reported by KCSAN,
BUG: KCSAN: data-race in add_timer_randomness / add_timer_randomness
write to 0xffff9f320a0a01d0 of 8 bytes by interrupt on cpu 22:
 add_timer_randomness+0x100/0x190
 add_timer_randomness at drivers/char/random.c:1152
 add_disk_randomness+0x85/0x280
 scsi_end_request+0x43a/0x4a0
 scsi_io_completion+0xb7/0x7e0
 scsi_finish_command+0x1ed/0x2a0
 scsi_softirq_done+0x1c9/0x1d0
 blk_done_softirq+0x181/0x1d0
 __do_softirq+0xd9/0x57c
 irq_exit+0xa2/0xc0
 do_IRQ+0x8b/0x190
 ret_from_intr+0x0/0x42
 cpuidle_enter_state+0x15e/0x980
 cpuidle_enter+0x69/0xc0
 call_cpuidle+0x23/0x40
 do_idle+0x248/0x280
 cpu_startup_entry+0x1d/0x1f
 start_secondary+0x1b2/0x230
 secondary_startup_64+0xb6/0xc0
no locks held by swapper/22/0.
irq event stamp: 32871382
 _raw_spin_unlock_irqrestore+0x53/0x60
 _raw_spin_lock_irqsave+0x21/0x60
 _local_bh_enable+0x21/0x30
 irq_exit+0xa2/0xc0
read to 0xffff9f320a0a01d0 of 8 bytes by interrupt on cpu 2:
 add_timer_randomness+0xe8/0x190
 add_disk_randomness+0x85/0x280
 scsi_end_request+0x43a/0x4a0
 scsi_io_completion+0xb7/0x7e0
 scsi_finish_command+0x1ed/0x2a0
 scsi_softirq_done+0x1c9/0x1d0
 blk_done_softirq+0x181/0x1d0
 __do_softirq+0xd9/0x57c
 irq_exit+0xa2/0xc0
 do_IRQ+0x8b/0x190
 ret_from_intr+0x0/0x42
 cpuidle_enter_state+0x15e/0x980
 cpuidle_enter+0x69/0xc0
 call_cpuidle+0x23/0x40
 do_idle+0x248/0x280
 cpu_startup_entry+0x1d/0x1f
 start_secondary+0x1b2/0x230
 secondary_startup_64+0xb6/0xc0
no locks held by swapper/2/0.
irq event stamp: 37846304
 _raw_spin_unlock_irqrestore+0x53/0x60
 _raw_spin_lock_irqsave+0x21/0x60
 _local_bh_enable+0x21/0x30
 irq_exit+0xa2/0xc0
Reported by Kernel Concurrency Sanitizer on:
Hardware name: HP ProLiant BL660c Gen9, BIOS I38 10/17/2018
Link: https://lore.kernel.org/r/1582648024-13111-1-git-send-email-cai@lca.pw
Signed-off-by: Qian Cai cai@lca.pw
Signed-off-by: Theodore Ts'o tytso@mit.edu
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/char/random.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)
diff --git a/drivers/char/random.c b/drivers/char/random.c
index a9850bc2006a..c210a6f4551a 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1150,14 +1150,14 @@ static void add_timer_randomness(struct timer_rand_state *state, unsigned num)
 	 * We take into account the first, second and third-order deltas
 	 * in order to make our estimate.
 	 */
-	delta = sample.jiffies - state->last_time;
-	state->last_time = sample.jiffies;
+	delta = sample.jiffies - READ_ONCE(state->last_time);
+	WRITE_ONCE(state->last_time, sample.jiffies);
 
-	delta2 = delta - state->last_delta;
-	state->last_delta = delta;
+	delta2 = delta - READ_ONCE(state->last_delta);
+	WRITE_ONCE(state->last_delta, delta);
 
-	delta3 = delta2 - state->last_delta2;
-	state->last_delta2 = delta2;
+	delta3 = delta2 - READ_ONCE(state->last_delta2);
+	WRITE_ONCE(state->last_delta2, delta2);
 
 	if (delta < 0)
 		delta = -delta;
From: John Garry john.garry@huawei.com
stable inclusion from linux-4.19.149 commit 2002c630a95be88a7c4a8fc9a2ef31ac01f900d6
--------------------------------
[ Upstream commit 3f5777fbaf04c58d940526a22a2e0c813c837936 ]
The memory behind the global 'mapfile' pointer is never freed during normal program execution, so let's free it on exit from the main function, as good programming practice.
A stray blank line is also removed.
Reported-by: Jiri Olsa jolsa@redhat.com
Signed-off-by: John Garry john.garry@huawei.com
Cc: Alexander Shishkin alexander.shishkin@linux.intel.com
Cc: Andi Kleen ak@linux.intel.com
Cc: James Clark james.clark@arm.com
Cc: Joakim Zhang qiangqing.zhang@nxp.com
Cc: Mark Rutland mark.rutland@arm.com
Cc: Namhyung Kim namhyung@kernel.org
Cc: Peter Zijlstra peterz@infradead.org
Cc: Will Deacon will@kernel.org
Cc: linuxarm@huawei.com
Link: http://lore.kernel.org/lkml/1583406486-154841-2-git-send-email-john.garry@hu...
Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 tools/perf/pmu-events/jevents.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)
diff --git a/tools/perf/pmu-events/jevents.c b/tools/perf/pmu-events/jevents.c
index c17e59404171..6631970f9683 100644
--- a/tools/perf/pmu-events/jevents.c
+++ b/tools/perf/pmu-events/jevents.c
@@ -1064,10 +1064,9 @@ static int process_one_file(const char *fpath, const struct stat *sb,
  */
 int main(int argc, char *argv[])
 {
-	int rc;
+	int rc, ret = 0;
 	int maxfds;
 	char ldirname[PATH_MAX];
-
 	const char *arch;
 	const char *output_file;
 	const char *start_dirname;
@@ -1138,7 +1137,8 @@ int main(int argc, char *argv[])
 		/* Make build fail */
 		fclose(eventsfp);
 		free_arch_std_events();
-		return 1;
+		ret = 1;
+		goto out_free_mapfile;
 	} else if (rc) {
 		goto empty_map;
 	}
@@ -1156,14 +1156,17 @@ int main(int argc, char *argv[])
 		/* Make build fail */
 		fclose(eventsfp);
 		free_arch_std_events();
-		return 1;
+		ret = 1;
 	}
 
-	return 0;
+
+	goto out_free_mapfile;
 
 empty_map:
 	fclose(eventsfp);
 	create_empty_mapping(output_file);
 	free_arch_std_events();
-	return 0;
+out_free_mapfile:
+	free(mapfile);
+	return ret;
 }
From: "Kirill A. Shutemov" kirill@shutemov.name
stable inclusion from linux-4.19.149 commit 2b294ac325c7ce3f36854b74d0d1d89dc1d1d8b8
--------------------------------
[ Upstream commit c3e5ea6ee574ae5e845a40ac8198de1fb63bb3ab ]
Jeff Moyer has reported that one of xfstests triggers a warning when run on DAX-enabled filesystem:
WARNING: CPU: 76 PID: 51024 at mm/memory.c:2317 wp_page_copy+0xc40/0xd50
...
 wp_page_copy+0x98c/0xd50 (unreliable)
 do_wp_page+0xd8/0xad0
 __handle_mm_fault+0x748/0x1b90
 handle_mm_fault+0x120/0x1f0
 __do_page_fault+0x240/0xd70
 do_page_fault+0x38/0xd0
 handle_page_fault+0x10/0x30
The warning happens on failed __copy_from_user_inatomic() which tries to copy data into a CoW page.
This happens because of race between MADV_DONTNEED and CoW page fault:
CPU0                                    CPU1
 handle_mm_fault()
   do_wp_page()
     wp_page_copy()
       do_wp_page()
                                        madvise(MADV_DONTNEED)
                                          zap_page_range()
                                            zap_pte_range()
                                              ptep_get_and_clear_full()
                                              <TLB flush>
 __copy_from_user_inatomic()
 sees empty PTE and fails
 WARN_ON_ONCE(1)
 clear_page()
The solution is to re-try __copy_from_user_inatomic() under the PTL after checking that the PTE matches orig_pte.

The second copy attempt can still fail, e.g. due to a non-readable PTE, but there's nothing reasonable we can do about it, except clearing the CoW page.
Reported-by: Jeff Moyer jmoyer@redhat.com
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Kirill A. Shutemov kirill.shutemov@linux.intel.com
Tested-by: Jeff Moyer jmoyer@redhat.com
Cc: stable@vger.kernel.org
Cc: Justin He Justin.He@arm.com
Cc: Dan Williams dan.j.williams@intel.com
Link: http://lkml.kernel.org/r/20200218154151.13349-1-kirill.shutemov@linux.intel....
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 mm/memory.c | 35 +++++++++++++++++++++++++++--------
 1 file changed, 27 insertions(+), 8 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index c4c667c0b89d..d146d4231686 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2107,7 +2107,7 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
 	bool ret;
 	void *kaddr;
 	void __user *uaddr;
-	bool force_mkyoung;
+	bool locked = false;
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr = vmf->address;
@@ -2132,11 +2132,11 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
 	 * On architectures with software "accessed" bits, we would
 	 * take a double page fault, so mark it accessed here.
 	 */
-	force_mkyoung = arch_faults_on_old_pte() && !pte_young(vmf->orig_pte);
-	if (force_mkyoung) {
+	if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
 		pte_t entry;
 
 		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+		locked = true;
 		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
 			/*
 			 * Other thread has already handled the fault
@@ -2160,18 +2160,37 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
 	 * zeroes.
 	 */
 	if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+		if (locked)
+			goto warn;
+
+		/* Re-validate under PTL if the page is still mapped */
+		vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+		locked = true;
+		if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+			/* The PTE changed under us. Retry page fault. */
+			ret = false;
+			goto pte_unlock;
+		}
+
 		/*
-		 * Give a warn in case there can be some obscure
-		 * use-case
+		 * The same page can be mapped back since last copy attempt.
+		 * Try to copy again under PTL.
 		 */
-		WARN_ON_ONCE(1);
-		clear_page(kaddr);
+		if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+			/*
+			 * Give a warn in case there can be some obscure
+			 * use-case
+			 */
+warn:
+			WARN_ON_ONCE(1);
+			clear_page(kaddr);
+		}
 	}
 
 	ret = true;
 
 pte_unlock:
-	if (force_mkyoung)
+	if (locked)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
 	kunmap_atomic(kaddr);
 	flush_dcache_page(dst);
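The retry logic can be modeled in userspace: the mutex stands in for the PTL and still_mapped for the pte_same() check. All names are illustrative:

  #include <pthread.h>
  #include <stdio.h>
  #include <string.h>

  static pthread_mutex_t ptl = PTHREAD_MUTEX_INITIALIZER;
  static int still_mapped = 1;    /* stand-in for pte_same() */

  static int copy_attempt(char *dst, const char *src, size_t n)
  {
          if (!still_mapped)      /* models a faulting copy */
                  return -1;
          memcpy(dst, src, n);
          return 0;
  }

  int main(void)
  {
          char dst[8], src[8] = "payload";

          if (copy_attempt(dst, src, sizeof(src)) != 0) {
                  pthread_mutex_lock(&ptl);       /* take the "PTL" */
                  if (!still_mapped) {            /* PTE changed: retry fault */
                          pthread_mutex_unlock(&ptl);
                          return 1;
                  }
                  if (copy_attempt(dst, src, sizeof(src)) != 0)
                          memset(dst, 0, sizeof(dst));  /* clear_page() */
                  pthread_mutex_unlock(&ptl);
          }
          printf("%s\n", dst);
          return 0;
  }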
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.149 commit 6ab959f1299512f9986db48347fff434ce7d33b8
--------------------------------
[ Upstream commit 1cb5deb5bc095c070c09a4540c45f9c9ba24be43 ]
If we decide that a directory free block is corrupt, we must take care not to leak a buffer pointer to the caller. After xfs_trans_brelse returns, the buffer can be freed or reused, which means that we have to set *bpp back to NULL.
Callers are supposed to notice the nonzero return value and not use the buffer pointer, but we should code more defensively, even if all current callers handle this situation correctly.
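In outline, the defensive pattern looks like this (a minimal sketch; block_is_corrupt() is a hypothetical check standing in for the verifier logic, not the actual XFS code):

    int read_free_block(struct xfs_trans *tp, struct xfs_buf **bpp)
    {
            if (block_is_corrupt(*bpp)) {           /* hypothetical check */
                    xfs_trans_brelse(tp, *bpp);     /* buffer may be freed or reused now */
                    *bpp = NULL;                    /* never hand back a dangling pointer */
                    return -EFSCORRUPTED;
            }
            return 0;
    }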
Fixes: de14c5f541e7 ("xfs: verify free block header fields") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/libxfs/xfs_dir2_node.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index f1bb3434f51c..01e99806b941 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -214,6 +214,7 @@ __xfs_dir3_free_read( if (fa) { xfs_verifier_error(*bpp, -EFSCORRUPTED, fa); xfs_trans_brelse(tp, *bpp); + *bpp = NULL; return -EFSCORRUPTED; }
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.149 commit 7fff3f7fe9a8643ebfd40ab8ed4ff67dd8879fbc
--------------------------------
[ Upstream commit 2e107cf869eecc770e3f630060bb4e5f547d0fd8 ]
In xchk_dir_actor, we attempt to validate the directory hash structures by performing a directory entry lookup by (hashed) name. If the lookup returns ENOENT, that means that the hash information is corrupt. The _process_error functions don't catch this, so we have to add that explicitly.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Dave Chinner dchinner@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/scrub/dir.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c index cd3e4d768a18..33dfcba72c7a 100644 --- a/fs/xfs/scrub/dir.c +++ b/fs/xfs/scrub/dir.c @@ -156,6 +156,9 @@ xchk_dir_actor( xname.type = XFS_DIR3_FT_UNKNOWN;
error = xfs_dir_lookup(sdc->sc->tp, ip, &xname, &lookup_ino, NULL); + /* ENOENT means the hash lookup failed and the dir is corrupt */ + if (error == -ENOENT) + error = -EFSCORRUPTED; if (!xchk_fblock_process_error(sdc->sc, XFS_DATA_FORK, offset, &error)) goto out;
From: Ian Rogers irogers@google.com
stable inclusion from linux-4.19.149 commit a0100a363098c33fc1f89fdd778a2bdf91379ed7
--------------------------------
[ Upstream commit d4953f7ef1a2e87ef732823af35361404d13fea8 ]
Reproducible with a clang ASan build by running 'perf test', in particular 'Parse event definition strings'.
Signed-off-by: Ian Rogers irogers@google.com Acked-by: Jiri Olsa jolsa@redhat.com Cc: Adrian Hunter adrian.hunter@intel.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Andi Kleen ak@linux.intel.com Cc: Leo Yan leo.yan@linaro.org Cc: Mark Rutland mark.rutland@arm.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Stephane Eranian eranian@google.com Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20200314170356.62914-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/evsel.c | 1 + tools/perf/util/parse-events.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 53062438e49f..c37594d57ab6 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1290,6 +1290,7 @@ void perf_evsel__exit(struct perf_evsel *evsel) thread_map__put(evsel->threads); zfree(&evsel->group_name); zfree(&evsel->name); + zfree(&evsel->pmu_name); perf_evsel__object.fini(evsel); }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 95043cae5774..6d087d9acd5e 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1261,7 +1261,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, attr.type = pmu->type; evsel = __add_event(list, &parse_state->idx, &attr, NULL, pmu, NULL, auto_merge_stats); if (evsel) { - evsel->pmu_name = name; + evsel->pmu_name = name ? strdup(name) : NULL; evsel->use_uncore_alias = use_uncore_alias; return 0; } else { @@ -1302,7 +1302,7 @@ int parse_events_add_pmu(struct parse_events_state *parse_state, evsel->snapshot = info.snapshot; evsel->metric_expr = info.metric_expr; evsel->metric_name = info.metric_name; - evsel->pmu_name = name; + evsel->pmu_name = name ? strdup(name) : NULL; evsel->use_uncore_alias = use_uncore_alias; }
From: Vignesh Raghavendra vigneshr@ti.com
stable inclusion from linux-4.19.149 commit 20191760203e3d0d9d840764891ea83854a63ef8
--------------------------------
[ Upstream commit f19c3f6c8109b8bab000afd35580929958e087a9 ]
When a port's throttle callback is called, it should stop pushing any more data into the TTY buffer to avoid buffer overflow. This means the driver has to stop the HW from receiving more data and assert HW flow control. For UARTs with auto HW flow control (such as 8250_omap), manual assertion of the flow control line is not possible, and the only way is to allow the RX FIFO to fill up, thus triggering the auto HW flow control logic.
Therefore make sure that the generic 8250 IRQ handler does not drain data when the port is stopped (i.e. UART_LSR_DR is unset in read_status_mask). Leaving the RX FIFO unserviced triggers auto HW flow control when FIFO occupancy reaches the preset threshold, thus halting RX. Since error conditions in the UART_LSR register are cleared just by reading the register, data still has to be drained when there are FIFO errors, else the error information would be lost.
Signed-off-by: Vignesh Raghavendra vigneshr@ti.com Link: https://lore.kernel.org/r/20200319103230.16867-2-vigneshr@ti.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/8250/8250_port.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-)
diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 368904825d5b..65bc08a97044 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1861,6 +1861,7 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir) unsigned char status; unsigned long flags; struct uart_8250_port *up = up_to_u8250p(port); + bool skip_rx = false;
if (iir & UART_IIR_NO_INT) return 0; @@ -1869,7 +1870,20 @@ int serial8250_handle_irq(struct uart_port *port, unsigned int iir)
status = serial_port_in(port, UART_LSR);
- if (status & (UART_LSR_DR | UART_LSR_BI)) { + /* + * If port is stopped and there are no error conditions in the + * FIFO, then don't drain the FIFO, as this may lead to TTY buffer + * overflow. Not servicing, RX FIFO would trigger auto HW flow + * control when FIFO occupancy reaches preset threshold, thus + * halting RX. This only works when auto HW flow control is + * available. + */ + if (!(status & (UART_LSR_FIFOE | UART_LSR_BRK_ERROR_BITS)) && + (port->status & (UPSTAT_AUTOCTS | UPSTAT_AUTORTS)) && + !(port->read_status_mask & UART_LSR_DR)) + skip_rx = true; + + if (status & (UART_LSR_DR | UART_LSR_BI) && !skip_rx) { if (!up->dma || handle_rx_dma(up, iir)) status = serial8250_rx_chars(up, status); }
From: Peter Ujfalusi peter.ujfalusi@ti.com
stable inclusion from linux-4.19.149 commit 10aa90fed8aafbfe14e32552742d48eb8f806311
--------------------------------
[ Upstream commit 4ce35a3617c0ac758c61122b2218b6c8c9ac9398 ]
When booting j721e the following bug is printed:
[ 1.154821] BUG: sleeping function called from invalid context at kernel/sched/completion.c:99 [ 1.154827] in_atomic(): 0, irqs_disabled(): 128, non_block: 0, pid: 12, name: kworker/0:1 [ 1.154832] 3 locks held by kworker/0:1/12: [ 1.154836] #0: ffff000840030728 ((wq_completion)events){+.+.}, at: process_one_work+0x1d4/0x6e8 [ 1.154852] #1: ffff80001214fdd8 (deferred_probe_work){+.+.}, at: process_one_work+0x1d4/0x6e8 [ 1.154860] #2: ffff00084060b170 (&dev->mutex){....}, at: __device_attach+0x38/0x138 [ 1.154872] irq event stamp: 63096 [ 1.154881] hardirqs last enabled at (63095): [<ffff800010b74318>] _raw_spin_unlock_irqrestore+0x70/0x78 [ 1.154887] hardirqs last disabled at (63096): [<ffff800010b740d8>] _raw_spin_lock_irqsave+0x28/0x80 [ 1.154893] softirqs last enabled at (62254): [<ffff800010080c88>] _stext+0x488/0x564 [ 1.154899] softirqs last disabled at (62247): [<ffff8000100fdb3c>] irq_exit+0x114/0x140 [ 1.154906] CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.6.0-rc6-next-20200318-00094-g45e4089b0bd3 #221 [ 1.154911] Hardware name: Texas Instruments K3 J721E SoC (DT) [ 1.154917] Workqueue: events deferred_probe_work_func [ 1.154923] Call trace: [ 1.154928] dump_backtrace+0x0/0x190 [ 1.154933] show_stack+0x14/0x20 [ 1.154940] dump_stack+0xe0/0x148 [ 1.154946] ___might_sleep+0x150/0x1f0 [ 1.154952] __might_sleep+0x4c/0x80 [ 1.154957] wait_for_completion_timeout+0x40/0x140 [ 1.154964] ti_sci_set_device_state+0xa0/0x158 [ 1.154969] ti_sci_cmd_get_device_exclusive+0x14/0x20 [ 1.154977] ti_sci_dev_start+0x34/0x50 [ 1.154984] genpd_runtime_resume+0x78/0x1f8 [ 1.154991] __rpm_callback+0x3c/0x140 [ 1.154996] rpm_callback+0x20/0x80 [ 1.155001] rpm_resume+0x568/0x758 [ 1.155007] __pm_runtime_resume+0x44/0xb0 [ 1.155013] omap8250_probe+0x2b4/0x508 [ 1.155019] platform_drv_probe+0x50/0xa0 [ 1.155023] really_probe+0xd4/0x318 [ 1.155028] driver_probe_device+0x54/0xe8 [ 1.155033] __device_attach_driver+0x80/0xb8 [ 1.155039] bus_for_each_drv+0x74/0xc0 [ 1.155044] __device_attach+0xdc/0x138 [ 1.155049] device_initial_probe+0x10/0x18 [ 1.155053] bus_probe_device+0x98/0xa0 [ 1.155058] deferred_probe_work_func+0x74/0xb0 [ 1.155063] process_one_work+0x280/0x6e8 [ 1.155068] worker_thread+0x48/0x430 [ 1.155073] kthread+0x108/0x138 [ 1.155079] ret_from_fork+0x10/0x18
To fix the bug, we need to call pm_runtime_enable() first, prior to any other pm_runtime calls.
Reported-by: Tomi Valkeinen tomi.valkeinen@ti.com Signed-off-by: Peter Ujfalusi peter.ujfalusi@ti.com Link: https://lore.kernel.org/r/20200320125200.6772-1-peter.ujfalusi@ti.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/8250/8250_omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index a019286f8bb6..a7e555e413a6 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -1227,11 +1227,11 @@ static int omap8250_probe(struct platform_device *pdev) spin_lock_init(&priv->rx_dma_lock);
device_init_wakeup(&pdev->dev, true); + pm_runtime_enable(&pdev->dev); pm_runtime_use_autosuspend(&pdev->dev); pm_runtime_set_autosuspend_delay(&pdev->dev, -1);
pm_runtime_irq_safe(&pdev->dev); - pm_runtime_enable(&pdev->dev);
pm_runtime_get_sync(&pdev->dev);
From: Vignesh Raghavendra vigneshr@ti.com
stable inclusion from linux-4.19.149 commit 69077bd8f19a34a5a3bb05fd8bc032aa7983ef80
--------------------------------
[ Upstream commit 7cf4df30a98175033e9849f7f16c46e96ba47f41 ]
Terminate and flush DMA internal buffers before pushing RX data to the higher layer. Otherwise this leads to data corruption, as the driver would end up pushing stale buffer data to the higher layer while the actual data is still stuck inside the DMA hardware and has not yet arrived in memory. While at it, replace the deprecated dmaengine_terminate_all() with dmaengine_terminate_async().
Signed-off-by: Vignesh Raghavendra vigneshr@ti.com Link: https://lore.kernel.org/r/20200319110344.21348-2-vigneshr@ti.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/8250/8250_omap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/drivers/tty/serial/8250/8250_omap.c b/drivers/tty/serial/8250/8250_omap.c index a7e555e413a6..cbd006fb7fbb 100644 --- a/drivers/tty/serial/8250/8250_omap.c +++ b/drivers/tty/serial/8250/8250_omap.c @@ -781,7 +781,10 @@ static void __dma_rx_do_complete(struct uart_8250_port *p) dmaengine_tx_status(dma->rxchan, dma->rx_cookie, &state);
count = dma->rx_size - state.residue; - + if (count < dma->rx_size) + dmaengine_terminate_async(dma->rxchan); + if (!count) + goto unlock; ret = tty_insert_flip_string(tty_port, dma->rx_buf, count);
p->port.icount.rx += ret; @@ -843,7 +846,6 @@ static void omap_8250_rx_dma_flush(struct uart_8250_port *p) spin_unlock_irqrestore(&priv->rx_dma_lock, flags);
__dma_rx_do_complete(p); - dmaengine_terminate_all(dma->rxchan); }
static int omap_8250_rx_dma(struct uart_8250_port *p)
From: Christophe JAILLET christophe.jaillet@wanadoo.fr
stable inclusion from linux-4.19.149 commit 9a1d2d2eadeb4886610c2c310c8f39d106608e17
--------------------------------
[ Upstream commit d74b181a028bb5a468f0c609553eff6a8fdf4887 ]
'snprintf' returns the number of characters which would be generated for the given input.
If the returned value is *greater than* or equal to the buffer size, it means that the output has been truncated.
Fix the overflow test accordingly.
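As a small userspace illustration of the corrected test (build_path() is a hypothetical stand-in for the perf helpers):

    #include <stdio.h>

    /* snprintf returns the length the full output would have had, so
     * any return value >= the buffer size (not just == it) means the
     * output was truncated. */
    static int build_path(char *path, size_t size, const char *mnt)
    {
            int ret = snprintf(path, size, "%s/devices/system/cpu/possible", mnt);

            if (ret < 0 || (size_t)ret >= size)     /* error or truncated */
                    return -1;
            return 0;
    }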
Fixes: 7780c25bae59f ("perf tools: Allow ability to map cpus to nodes easily") Fixes: 92a7e1278005b ("perf cpumap: Add cpu__max_present_cpu()") Signed-off-by: Christophe JAILLET christophe.jaillet@wanadoo.fr Suggested-by: David Laight David.Laight@ACULAB.COM Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Don Zickus dzickus@redhat.com Cc: He Zhe zhe.he@windriver.com Cc: Jan Stancek jstancek@redhat.com Cc: Jiri Olsa jolsa@redhat.com Cc: Kan Liang kan.liang@linux.intel.com Cc: Mark Rutland mark.rutland@arm.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: kernel-janitors@vger.kernel.org Link: http://lore.kernel.org/lkml/20200324070319.10901-1-christophe.jaillet@wanado... Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/cpumap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/tools/perf/util/cpumap.c b/tools/perf/util/cpumap.c index f93846edc1e0..827d844f4efb 100644 --- a/tools/perf/util/cpumap.c +++ b/tools/perf/util/cpumap.c @@ -462,7 +462,7 @@ static void set_max_cpu_num(void)
/* get the highest possible cpu number for a sparse allocation */ ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/possible", mnt); - if (ret == PATH_MAX) { + if (ret >= PATH_MAX) { pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); goto out; } @@ -473,7 +473,7 @@ static void set_max_cpu_num(void)
/* get the highest present cpu number for a sparse allocation */ ret = snprintf(path, PATH_MAX, "%s/devices/system/cpu/present", mnt); - if (ret == PATH_MAX) { + if (ret >= PATH_MAX) { pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); goto out; } @@ -501,7 +501,7 @@ static void set_max_node_num(void)
/* get the highest possible cpu number for a sparse allocation */ ret = snprintf(path, PATH_MAX, "%s/devices/system/node/possible", mnt); - if (ret == PATH_MAX) { + if (ret >= PATH_MAX) { pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); goto out; } @@ -586,7 +586,7 @@ int cpu__setup_cpunode_map(void) return 0;
n = snprintf(path, PATH_MAX, "%s/devices/system/node", mnt); - if (n == PATH_MAX) { + if (n >= PATH_MAX) { pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); return -1; } @@ -601,7 +601,7 @@ int cpu__setup_cpunode_map(void) continue;
n = snprintf(buf, PATH_MAX, "%s/%s", path, dent1->d_name); - if (n == PATH_MAX) { + if (n >= PATH_MAX) { pr_err("sysfs path crossed PATH_MAX(%d) size\n", PATH_MAX); continue; }
From: John Meneghini johnm@netapp.com
stable inclusion from linux-4.19.149 commit b3dc81c1987f687dfa9c30b87c78dd0a2e603c56
--------------------------------
[ Upstream commit 764e9332098c0e60251386a507fe46ac91276120 ]
The nvme multipath error handling defaults to controller reset if the error is unknown. There are, however, no existing nvme status codes that indicate a reset should be used, and resetting causes unnecessary disruption to the rest of IO.
Change nvme's error handling to first check if failover should happen. If not, let the normal error handling take over rather than reset the controller.
Based-on-a-patch-by: Christoph Hellwig hch@lst.de Reviewed-by: Hannes Reinecke hare@suse.de Signed-off-by: John Meneghini johnm@netapp.com Signed-off-by: Keith Busch kbusch@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 5 +---- drivers/nvme/host/multipath.c | 21 +++++++++------------ drivers/nvme/host/nvme.h | 5 +++-- 3 files changed, 13 insertions(+), 18 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 704fa4f0da5d..9c0836106460 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -261,11 +261,8 @@ void nvme_complete_rq(struct request *req) trace_nvme_complete_rq(req);
if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) { - if ((req->cmd_flags & REQ_NVME_MPATH) && - blk_path_error(status)) { - nvme_failover_req(req); + if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req)) return; - }
if (!blk_queue_dying(req->q)) { nvme_req(req)->retries++; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 838ee58d80cd..2b0479c5fdd8 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -72,17 +72,12 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, } }
-void nvme_failover_req(struct request *req) +bool nvme_failover_req(struct request *req) { struct nvme_ns *ns = req->q->queuedata; u16 status = nvme_req(req)->status; unsigned long flags;
- spin_lock_irqsave(&ns->head->requeue_lock, flags); - blk_steal_bios(&ns->head->requeue_list, req); - spin_unlock_irqrestore(&ns->head->requeue_lock, flags); - blk_mq_end_request(req, 0); - switch (status & 0x7ff) { case NVME_SC_ANA_TRANSITION: case NVME_SC_ANA_INACCESSIBLE: @@ -110,15 +105,17 @@ void nvme_failover_req(struct request *req) nvme_mpath_clear_current_path(ns); break; default: - /* - * Reset the controller for any non-ANA error as we don't know - * what caused the error. - */ - nvme_reset_ctrl(ns->ctrl); - break; + /* This was a non-ANA error so follow the normal error path. */ + return false; }
+ spin_lock_irqsave(&ns->head->requeue_lock, flags); + blk_steal_bios(&ns->head->requeue_list, req); + spin_unlock_irqrestore(&ns->head->requeue_lock, flags); + blk_mq_end_request(req, 0); + kblockd_schedule_work(&ns->head->requeue_work); + return true; }
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index cc4273f11989..31c1496f938f 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -477,7 +477,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys); void nvme_mpath_start_freeze(struct nvme_subsystem *subsys); void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, struct nvme_ctrl *ctrl, int *flags); -void nvme_failover_req(struct request *req); +bool nvme_failover_req(struct request *req); void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl); int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head); void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id); @@ -521,8 +521,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns, sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance); }
-static inline void nvme_failover_req(struct request *req) +static inline bool nvme_failover_req(struct request *req) { + return false; } static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) {
From: Christophe JAILLET christophe.jaillet@wanadoo.fr
stable inclusion from linux-4.19.149 commit 38c46471f998067e7eff81b04d7238427ae40975
--------------------------------
[ Upstream commit b25b60d7bfb02a74bc3c2d998e09aab159df8059 ]
'maxlen' is the total size of the destination buffer. There is only one caller and this value is 256.
When we compute the size already used and the size we would like to add to the buffer, the trailing NULL character is not taken into account. However, this trailing character will be added by the 'strcat' once we have checked that there is enough space.

So there is an off-by-one issue, and 1 byte of the stack could be erroneously overwritten.

Take the trailing NULL into account when checking if there is enough space in the destination buffer.

While at it, also replace a 'sprintf' with a safer 'snprintf', check for output truncation and avoid a superfluous 'strlen'; a minimal sketch of the corrected bounds check follows.
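A minimal userspace sketch of the corrected bounds check (append_line() is a hypothetical stand-in for the sunrpc loop body):

    #include <stdio.h>
    #include <string.h>

    /* len counts characters already in buf and slen those to append,
     * but strcat() also writes a terminating NUL. With "len + slen >
     * maxlen" the case len + slen == maxlen passes the check and the
     * NUL lands one byte past the buffer; ">=" leaves room for it. */
    static int append_line(char *buf, size_t maxlen, const char *name, int payload)
    {
            char tmp[32];
            size_t len = strlen(buf);
            int slen = snprintf(tmp, sizeof(tmp), "%s %d\n", name, payload);

            if (slen < 0 || (size_t)slen >= sizeof(tmp) || len + slen >= maxlen)
                    return -1;      /* truncated, or no room incl. the NUL */
            strcat(buf, tmp);
            return 0;
    }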
Fixes: dc9a16e49dbba ("svc: Add /proc/sys/sunrpc/transport files") Signed-off-by: Christophe JAILLET christophe.jaillet@wanadoo.fr [ cel: very minor fix to documenting comment Signed-off-by: Chuck Lever chuck.lever@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sunrpc/svc_xprt.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-)
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index cf9ab055b1c2..2c8c3193dca7 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -103,8 +103,17 @@ void svc_unreg_xprt_class(struct svc_xprt_class *xcl) } EXPORT_SYMBOL_GPL(svc_unreg_xprt_class);
-/* - * Format the transport list for printing +/** + * svc_print_xprts - Format the transport list for printing + * @buf: target buffer for formatted address + * @maxlen: length of target buffer + * + * Fills in @buf with a string containing a list of transport names, each name + * terminated with '\n'. If the buffer is too small, some entries may be + * missing, but it is guaranteed that all lines in the output buffer are + * complete. + * + * Returns positive length of the filled-in string. */ int svc_print_xprts(char *buf, int maxlen) { @@ -117,9 +126,9 @@ int svc_print_xprts(char *buf, int maxlen) list_for_each_entry(xcl, &svc_xprt_class_list, xcl_list) { int slen;
- sprintf(tmpstr, "%s %d\n", xcl->xcl_name, xcl->xcl_max_payload); - slen = strlen(tmpstr); - if (len + slen > maxlen) + slen = snprintf(tmpstr, sizeof(tmpstr), "%s %d\n", + xcl->xcl_name, xcl->xcl_max_payload); + if (slen >= sizeof(tmpstr) || len + slen >= maxlen) break; len += slen; strcat(buf, tmpstr);
From: Stuart Hayes stuart.w.hayes@gmail.com
stable inclusion from linux-4.19.149 commit a8cc52270f3d8e8f4faf01ffd6c4a95bbfb55ba4
--------------------------------
[ Upstream commit 8edf5332c39340b9583cf9cba659eb7ec71f75b5 ]
Without this commit, a PCIe hotplug port can stop generating interrupts on hotplug events, so device adds and removals will not be seen:
The pciehp interrupt handler pciehp_isr() reads the Slot Status register and then writes back to it to clear the bits that caused the interrupt. If a different interrupt event bit gets set between the read and the write, pciehp_isr() returns without having cleared all of the interrupt event bits. If this happens when the MSI isn't masked (which by default it isn't in handle_edge_irq(), and which it will never be when MSI per-vector masking is not supported), we won't get any more hotplug interrupts from that device.
That is expected behavior, according to the PCIe Base Spec r5.0, section 6.7.3.4, "Software Notification of Hot-Plug Events".
Because the Presence Detect Changed and Data Link Layer State Changed event bits can both get set at nearly the same time when a device is added or removed, this is more likely to happen than it might seem. The issue was found (and can be reproduced rather easily) by connecting and disconnecting an NVMe storage device on at least one system model where the NVMe devices were being connected to an AMD PCIe port (PCI device 0x1022/0x1483).
Fix the issue by modifying pciehp_isr() to loop back and re-read the Slot Status register immediately after writing to it, until it sees that all of the event status bits have been cleared.
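In outline, the loop looks like this (read_slot_status(), write_slot_status(), EVENT_BITS and handle_events() are hypothetical helpers, not the pciehp API; the real code folds this into pciehp_isr() with a goto and only re-reads in MSI mode):

    for (;;) {
            u16 events = read_slot_status() & EVENT_BITS;   /* hypothetical */

            if (!events)
                    break;                  /* all bits clear: the port can signal again */
            write_slot_status(events);      /* RW1C: clear exactly what we saw */
            handle_events(events);          /* hypothetical dispatch */
    }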
[lukas: drop loop count limitation, write "events" instead of "status", don't loop back in INTx and poll modes, tweak code comment & commit msg] Link: https://lore.kernel.org/r/78b4ced5072bfe6e369d20e8b47c279b8c7af12e.158212161... Tested-by: Stuart Hayes stuart.w.hayes@gmail.com Signed-off-by: Stuart Hayes stuart.w.hayes@gmail.com Signed-off-by: Lukas Wunner lukas@wunner.de Signed-off-by: Bjorn Helgaas bhelgaas@google.com Reviewed-by: Joerg Roedel jroedel@suse.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/hotplug/pciehp_hpc.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-)
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 7ec78081cc28..7a3cf281ae06 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -539,7 +539,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) struct controller *ctrl = (struct controller *)dev_id; struct pci_dev *pdev = ctrl_dev(ctrl); struct device *parent = pdev->dev.parent; - u16 status, events; + u16 status, events = 0;
/* * Interrupts only occur in D3hot or shallower and only if enabled @@ -564,6 +564,7 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) } }
+read_status: pcie_capability_read_word(pdev, PCI_EXP_SLTSTA, &status); if (status == (u16) ~0) { ctrl_info(ctrl, "%s: no response from device\n", __func__); @@ -576,24 +577,37 @@ static irqreturn_t pciehp_isr(int irq, void *dev_id) * Slot Status contains plain status bits as well as event * notification bits; right now we only want the event bits. */ - events = status & (PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | - PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC | - PCI_EXP_SLTSTA_DLLSC); + status &= PCI_EXP_SLTSTA_ABP | PCI_EXP_SLTSTA_PFD | + PCI_EXP_SLTSTA_PDC | PCI_EXP_SLTSTA_CC | + PCI_EXP_SLTSTA_DLLSC;
/* * If we've already reported a power fault, don't report it again * until we've done something to handle it. */ if (ctrl->power_fault_detected) - events &= ~PCI_EXP_SLTSTA_PFD; + status &= ~PCI_EXP_SLTSTA_PFD;
+ events |= status; if (!events) { if (parent) pm_runtime_put(parent); return IRQ_NONE; }
- pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events); + if (status) { + pcie_capability_write_word(pdev, PCI_EXP_SLTSTA, events); + + /* + * In MSI mode, all event bits must be zero before the port + * will send a new interrupt (PCIe Base Spec r5.0 sec 6.7.3.4). + * So re-read the Slot Status register in case a bit was set + * between read and write. + */ + if (pci_dev_msi_enabled(pdev) && !pciehp_poll_mode) + goto read_status; + } + ctrl_dbg(ctrl, "pending interrupts %#06x from Slot Status\n", events); if (parent) pm_runtime_put(parent);
From: Trond Myklebust trond.myklebust@hammerspace.com
stable inclusion from linux-4.19.149 commit 1f39a7cc5d07a58c53f3054b177bad93c243d3f9
--------------------------------
[ Upstream commit 08ca8b21f760c0ed5034a5c122092eec22ccf8f4 ]
When a subrequest is being detached from the subgroup, we want to ensure that it is not holding the group lock, or in the process of waiting for the group lock.
Fixes: 5b2b5187fa85 ("NFS: Fix nfs_page_group_destroy() and nfs_lock_and_join_requests() race cases") Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/pagelist.c | 67 +++++++++++++++++++++++++++------------- fs/nfs/write.c | 10 ++++-- include/linux/nfs_page.h | 2 ++ 3 files changed, 55 insertions(+), 24 deletions(-)
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 5dae7c85d9b6..2c7d76b4c5e1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -132,47 +132,70 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/* - * nfs_page_group_lock - lock the head of the page group - * @req - request in group that is to be locked + * nfs_page_set_headlock - set the request PG_HEADLOCK + * @req: request that is to be locked * - * this lock must be held when traversing or modifying the page - * group list + * this lock must be held when modifying req->wb_head * * return 0 on success, < 0 on error */ int -nfs_page_group_lock(struct nfs_page *req) +nfs_page_set_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - - if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags)) + if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags)) return 0;
- set_bit(PG_CONTENDED1, &head->wb_flags); + set_bit(PG_CONTENDED1, &req->wb_flags); smp_mb__after_atomic(); - return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, + return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK, TASK_UNINTERRUPTIBLE); }
/* - * nfs_page_group_unlock - unlock the head of the page group - * @req - request in group that is to be unlocked + * nfs_page_clear_headlock - clear the request PG_HEADLOCK + * @req: request that is to be locked */ void -nfs_page_group_unlock(struct nfs_page *req) +nfs_page_clear_headlock(struct nfs_page *req) { - struct nfs_page *head = req->wb_head; - - WARN_ON_ONCE(head != head->wb_head); - smp_mb__before_atomic(); - clear_bit(PG_HEADLOCK, &head->wb_flags); + clear_bit(PG_HEADLOCK, &req->wb_flags); smp_mb__after_atomic(); - if (!test_bit(PG_CONTENDED1, &head->wb_flags)) + if (!test_bit(PG_CONTENDED1, &req->wb_flags)) return; - wake_up_bit(&head->wb_flags, PG_HEADLOCK); + wake_up_bit(&req->wb_flags, PG_HEADLOCK); +} + +/* + * nfs_page_group_lock - lock the head of the page group + * @req: request in group that is to be locked + * + * this lock must be held when traversing or modifying the page + * group list + * + * return 0 on success, < 0 on error + */ +int +nfs_page_group_lock(struct nfs_page *req) +{ + int ret; + + ret = nfs_page_set_headlock(req); + if (ret || req->wb_head == req) + return ret; + return nfs_page_set_headlock(req->wb_head); +} + +/* + * nfs_page_group_unlock - unlock the head of the page group + * @req: request in group that is to be unlocked + */ +void +nfs_page_group_unlock(struct nfs_page *req) +{ + if (req != req->wb_head) + nfs_page_clear_headlock(req->wb_head); + nfs_page_clear_headlock(req); }
/* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0f0d2ddd939c..4a5728acd6de 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -416,22 +416,28 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list, destroy_list = (subreq->wb_this_page == old_head) ? NULL : subreq->wb_this_page;
+ /* Note: lock subreq in order to change subreq->wb_head */ + nfs_page_set_headlock(subreq); WARN_ON_ONCE(old_head != subreq->wb_head);
/* make sure old group is not used */ subreq->wb_this_page = subreq; + subreq->wb_head = subreq;
clear_bit(PG_REMOVE, &subreq->wb_flags);
/* Note: races with nfs_page_group_destroy() */ if (!kref_read(&subreq->wb_kref)) { /* Check if we raced with nfs_page_group_destroy() */ - if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) + if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) { + nfs_page_clear_headlock(subreq); nfs_free_request(subreq); + } else + nfs_page_clear_headlock(subreq); continue; } + nfs_page_clear_headlock(subreq);
- subreq->wb_head = subreq; nfs_release_request(old_head);
if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) { diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index ad69430fd0eb..5162fc1533c2 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -142,6 +142,8 @@ extern void nfs_unlock_and_release_request(struct nfs_page *); extern int nfs_page_group_lock(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern int nfs_page_set_headlock(struct nfs_page *req); +extern void nfs_page_clear_headlock(struct nfs_page *req); extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
/*
From: Nathan Chancellor natechancellor@gmail.com
stable inclusion from linux-4.19.149 commit afe001488e7e8e1108a2d9fcac3757713ffae503
--------------------------------
[ Upstream commit b0d14fc43d39203ae025f20ef4d5d25d9ccf4be1 ]
Clang warns:
mm/kmemleak.c:1955:28: warning: array comparison always evaluates to a constant [-Wtautological-compare]
        if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
                                  ^
mm/kmemleak.c:1955:60: warning: array comparison always evaluates to a constant [-Wtautological-compare]
        if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
These are not true arrays; they are linker-defined symbols, which are just addresses. Using the address-of operator silences the warning and does not change the resulting assembly with either clang/ld.lld or gcc/ld (tested with diff + objdump -Dr).
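The shape of the fix, in isolation (declarations as in the kernel's asm-generic/sections.h, where section boundaries are extern char arrays; the wrapper function is illustrative only):

    extern char __start_ro_after_init[], __end_ro_after_init[];
    extern char _sdata[], _edata[];

    static int ro_after_init_outside_data(void)
    {
            /* Comparing the array names is tautological to clang;
             * comparing their addresses expresses the same runtime
             * check without the warning. */
            return &__start_ro_after_init < &_sdata ||
                   &__end_ro_after_init > &_edata;
    }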
Suggested-by: Nick Desaulniers ndesaulniers@google.com Signed-off-by: Nathan Chancellor natechancellor@gmail.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Catalin Marinas catalin.marinas@arm.com Link: https://github.com/ClangBuiltLinux/linux/issues/895 Link: http://lkml.kernel.org/r/20200220051551.44000-1-natechancellor@gmail.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 5eeabece0c17..f54734abf946 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2039,7 +2039,7 @@ void __init kmemleak_init(void) create_object((unsigned long)__bss_start, __bss_stop - __bss_start, KMEMLEAK_GREY, GFP_ATOMIC); /* only register .data..ro_after_init if not within .data */ - if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata) + if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata) create_object((unsigned long)__start_ro_after_init, __end_ro_after_init - __start_ro_after_init, KMEMLEAK_GREY, GFP_ATOMIC);
From: Xianting Tian xianting_tian@126.com
stable inclusion from linux-4.19.149 commit cebefe4f6fc0cf5721d443b91e8f43a66766fb06
--------------------------------
[ Upstream commit faffdfa04fa11ccf048cebdde73db41ede0679e0 ]
A mount failure issue happens under the following scenario: an application forked dozens of threads to mount the same number of cramfs images separately in docker, but several mounts failed with high probability. The mount failed because the page (read from the superblock of the loop dev) was not uptodate when checked after wait_on_page_locked(page) returned in cramfs_read:
wait_on_page_locked(page);
if (!PageUptodate(page)) {
        ...
}
The reason the page is not uptodate: systemd-udevd reads the loopX dev before the mount. Because the status of loopX is Lo_unbound at this time, loop_make_request directly triggers the io_end handler end_buffer_async_read, which calls SetPageError(page). This prevents the page from being set uptodate in end_buffer_async_read:
if (page_uptodate && !PageError(page)) {
        SetPageUptodate(page);
}
Then the mount operation is performed using the same page that was just accessed by systemd-udevd above. Because this page is not uptodate, it launches an actual read via submit_bh and then waits on the page by calling wait_on_page_locked(page). When the I/O of the page is done, the io_end handler end_buffer_async_read is called. Because no one cleared the page error (set by the systemd-udevd read) during the whole read path of the mount, the page is still in "PageError" status and cannot be set uptodate in end_buffer_async_read, so the mount fails.
But sometimes the mount succeeds even though systemd-udevd read the loopX dev just before. The reason is that systemd-udevd launched another loopX read between steps 3.1 and 3.2 below:
1. loopX dev default status is Lo_unbound;
2. systemd-udevd reads loopX dev (page is set to PageError);
3. mount operation
   1) set loopX status to Lo_bound;
   ==> systemd-udevd reads loopX dev <==
   2) read loopX dev (page has no error)
   3) mount succeeds
As the loopX dev status is set to Lo_bound after step 3.1, the other loopX read issued by systemd-udevd goes through the whole I/O stack; part of the call trace is below:
SYS_read
  vfs_read
    do_sync_read
      blkdev_aio_read
        generic_file_aio_read
          do_generic_file_read:
            ClearPageError(page);
            mapping->a_ops->readpage(filp, page);
Here, mapping->a_ops->readpage() is blkdev_readpage. In the latest kernel some function names have changed; the call trace is as below:
blkdev_read_iter
  generic_file_read_iter
    generic_file_buffered_read:
      /*
       * A previous I/O error may have been due to temporary
       * failures, e.g. multipath errors.
       * PG_error will be set again if readpage fails.
       */
      ClearPageError(page);
      /* Start the actual read. The read will unlock the page. */
      error = mapping->a_ops->readpage(filp, page);
We can see that ClearPageError(page) is called before the actual read, so the read in step 3.2 succeeds.
This patch adds a call to ClearPageError just before the actual read in the read path of the cramfs mount. Without the patch, the call trace when performing a cramfs mount is:
do_mount
  cramfs_read
    cramfs_blkdev_read
      read_cache_page
        do_read_cache_page:
          filler(data, page);
          or
          mapping->a_ops->readpage(data, page);
With the patch, the call trace when performing the mount is:
do_mount
  cramfs_read
    cramfs_blkdev_read
      read_cache_page:
        do_read_cache_page:
          ClearPageError(page); <== new add
          filler(data, page);
          or
          mapping->a_ops->readpage(data, page);
With the patch, the mount operation triggers the call to ClearPageError(page) before the actual read, so the page has no error if no additional page error happens when the I/O is done.
Signed-off-by: Xianting Tian xianting_tian@126.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Matthew Wilcox (Oracle) willy@infradead.org Cc: Jan Kara jack@suse.cz Cc: yubin@h3c.com Link: http://lkml.kernel.org/r/1583318844-22971-1-git-send-email-xianting_tian@126... Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/filemap.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/mm/filemap.c b/mm/filemap.c index 8a8bf78d3ac0..52e888f8de49 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2981,6 +2981,14 @@ static struct page *do_read_cache_page(struct address_space *mapping, unlock_page(page); goto out; } + + /* + * A previous I/O error may have been due to temporary + * failures. + * Clear page error before actual read, PG_error will be + * set again if read page fails. + */ + ClearPageError(page); goto filler;
out:
From: Qian Cai cai@lca.pw
stable inclusion from linux-4.19.149 commit b73c744019721ea47340b37440a7f6a263beea54
--------------------------------
[ Upstream commit 5644e1fbbfe15ad06785502bbfe5751223e5841d ]
pgdat->kswapd_classzone_idx can be accessed concurrently in wakeup_kswapd(). Plain writes and reads without any lock protection result in data races. Fix them by adding a pair of READ_ONCE()/WRITE_ONCE(), which also saves a branch (compilers might well optimize the original code in an unintentional way anyway). While at it, also take care of pgdat->kswapd_order and non-kswapd threads in allow_direct_reclaim(). The data races were reported by KCSAN; a sketch of the marked-access pattern follows the reports below.
BUG: KCSAN: data-race in wakeup_kswapd / wakeup_kswapd
write to 0xffff9f427ffff2dc of 4 bytes by task 7454 on cpu 13: wakeup_kswapd+0xf1/0x400 wakeup_kswapd at mm/vmscan.c:3967 wake_all_kswapds+0x59/0xc0 wake_all_kswapds at mm/page_alloc.c:4241 __alloc_pages_slowpath+0xdcc/0x1290 __alloc_pages_slowpath at mm/page_alloc.c:4512 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x16e/0x6f0 __handle_mm_fault+0xcd5/0xd40 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40
1 lock held by mtest01/7454: #0: ffff9f425afe8808 (&mm->mmap_sem#2){++++}, at: do_page_fault+0x143/0x6f9 do_user_addr_fault at arch/x86/mm/fault.c:1405 (inlined by) do_page_fault at arch/x86/mm/fault.c:1539 irq event stamp: 6944085 count_memcg_event_mm+0x1a6/0x270 count_memcg_event_mm+0x119/0x270 __do_softirq+0x34c/0x57c irq_exit+0xa2/0xc0
read to 0xffff9f427ffff2dc of 4 bytes by task 7472 on cpu 38: wakeup_kswapd+0xc8/0x400 wake_all_kswapds+0x59/0xc0 __alloc_pages_slowpath+0xdcc/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x16e/0x6f0 __handle_mm_fault+0xcd5/0xd40 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40
1 lock held by mtest01/7472: #0: ffff9f425a9ac148 (&mm->mmap_sem#2){++++}, at: do_page_fault+0x143/0x6f9 irq event stamp: 6793561 count_memcg_event_mm+0x1a6/0x270 count_memcg_event_mm+0x119/0x270 __do_softirq+0x34c/0x57c irq_exit+0xa2/0xc0
BUG: KCSAN: data-race in kswapd / wakeup_kswapd
write to 0xffff90973ffff2dc of 4 bytes by task 820 on cpu 6: kswapd+0x27c/0x8d0 kthread+0x1e0/0x200 ret_from_fork+0x27/0x50
read to 0xffff90973ffff2dc of 4 bytes by task 6299 on cpu 0: wakeup_kswapd+0xf3/0x450 wake_all_kswapds+0x59/0xc0 __alloc_pages_slowpath+0xdcc/0x1290 __alloc_pages_nodemask+0x3bb/0x450 alloc_pages_vma+0x8a/0x2c0 do_anonymous_page+0x170/0x700 __handle_mm_fault+0xc9f/0xd00 handle_mm_fault+0xfc/0x2f0 do_page_fault+0x263/0x6f9 page_fault+0x34/0x40
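For illustration, a minimal sketch of the marked-access pattern the fix applies (a simplified fragment modeled on the diff below, not the complete function):

    #include <linux/compiler.h>     /* READ_ONCE() / WRITE_ONCE() */
    #include <linux/mmzone.h>       /* pg_data_t */

    /*
     * Read the shared field once for the comparison and publish the
     * new value with a single marked store. Without the annotations,
     * the compiler may tear, refetch or reorder the plain accesses,
     * which is exactly what KCSAN flags as a data race.
     */
    static void note_kswapd_order(pg_data_t *pgdat, unsigned int order)
    {
            if (READ_ONCE(pgdat->kswapd_order) < order)
                    WRITE_ONCE(pgdat->kswapd_order, order);
    }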
Signed-off-by: Qian Cai cai@lca.pw Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Marco Elver elver@google.com Cc: Matthew Wilcox willy@infradead.org Link: http://lkml.kernel.org/r/1582749472-5171-1-git-send-email-cai@lca.pw Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/vmscan.c | 45 ++++++++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 19 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c index d8b6ba5bd29e..ac6f4cbef5ea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3113,8 +3113,9 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */ if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { - pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx, - (enum zone_type)ZONE_NORMAL); + if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL) + WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); }
@@ -3635,9 +3636,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, enum zone_type prev_classzone_idx) { - if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) - return prev_classzone_idx; - return pgdat->kswapd_classzone_idx; + enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + + return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx; }
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, @@ -3681,8 +3682,11 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); - pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); + WRITE_ONCE(pgdat->kswapd_classzone_idx, + kswapd_classzone_idx(pgdat, classzone_idx)); + + if (READ_ONCE(pgdat->kswapd_order) < reclaim_order) + WRITE_ONCE(pgdat->kswapd_order, reclaim_order); }
finish_wait(&pgdat->kswapd_wait, &wait); @@ -3764,12 +3768,12 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable();
- pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = MAX_NR_ZONES; + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES); for ( ; ; ) { bool ret;
- alloc_order = reclaim_order = pgdat->kswapd_order; + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
kswapd_try_sleep: @@ -3777,10 +3781,10 @@ static int kswapd(void *p) classzone_idx);
/* Read the new order and classzone_idx */ - alloc_order = reclaim_order = pgdat->kswapd_order; + alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order); classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); - pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = MAX_NR_ZONES; + WRITE_ONCE(pgdat->kswapd_order, 0); + WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
ret = try_to_freeze(); if (kthread_should_stop()) @@ -3825,20 +3829,23 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; + enum zone_type curr_idx;
if (!managed_zone(zone)) return;
if (!cpuset_zone_allowed(zone, gfp_flags)) return; + pgdat = zone->zone_pgdat; + curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx); + + if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx) + WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx); + + if (READ_ONCE(pgdat->kswapd_order) < order) + WRITE_ONCE(pgdat->kswapd_order, order);
- if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) - pgdat->kswapd_classzone_idx = classzone_idx; - else - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, - classzone_idx); - pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return;
From: Jaewon Kim jaewon31.kim@samsung.com
stable inclusion from linux-4.19.149 commit 6bee7991f63e6ae8faba0c704f4d98575bb0312f
--------------------------------
[ Upstream commit 09ef5283fd96ac424ef0e569626f359bf9ab86c9 ]
When passing requirements to vm_unmapped_area(), arch_get_unmapped_area() and arch_get_unmapped_area_topdown() did not set align_offset. Internally, in both unmapped_area() and unmapped_area_topdown(), if info->align_mask is 0 then info->align_offset is meaningless.
But commit df529cabb7a2 ("mm: mmap: add trace point of vm_unmapped_area") always prints info->align_offset even though it is uninitialized.
Fix this uninitialized value issue by setting it to 0 explicitly; see the sketch after the before/after traces below.
Before: vm_unmapped_area: addr=0x755b155000 err=0 total_vm=0x15aaf0 flags=0x1 len=0x109000 lo=0x8000 hi=0x75eed48000 mask=0x0 ofs=0x4022
After: vm_unmapped_area: addr=0x74a4ca1000 err=0 total_vm=0x168ab1 flags=0x1 len=0x9000 lo=0x8000 hi=0x753d94b000 mask=0x0 ofs=0x0
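As an aside, a hedged sketch of an initialization style that avoids this class of bug entirely (len and mm are stand-ins for the surrounding context; field names are those of struct vm_unmapped_area_info):

    struct vm_unmapped_area_info info = {}; /* every field, incl. align_offset, starts at 0 */

    info.length = len;                      /* stand-in request length */
    info.low_limit = mm->mmap_base;
    info.high_limit = TASK_SIZE;
    /* align_mask and align_offset are already zero, so the
     * tracepoint can never observe stack garbage */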
Signed-off-by: Jaewon Kim jaewon31.kim@samsung.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Andrew Morton akpm@linux-foundation.org Cc: Matthew Wilcox (Oracle) willy@infradead.org Cc: Michel Lespinasse walken@google.com Cc: Borislav Petkov bp@suse.de Link: http://lkml.kernel.org/r/20200409094035.19457-1-jaewon31.kim@samsung.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Conflict: mm/mmap.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/mmap.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/mm/mmap.c b/mm/mmap.c index 7239d81acfd4..00702331afc1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2128,6 +2128,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; info.align_mask = 0; + info.align_offset = 0;
if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags); @@ -2177,6 +2178,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base; info.align_mask = 0; + info.align_offset = 0;
if (enable_mmap_dvpp) dvpp_mmap_get_area(&info, flags);
From: Douglas Anderson dianders@chromium.org
stable inclusion from linux-4.19.149 commit b6256c2966706c279f54bdd2c6582c7c370e9467
--------------------------------
[ Upstream commit b849dd84b6ccfe32622988b79b7b073861fcf9f7 ]
While trying to "dd" to the block device for a USB stick, I encountered a hung task warning (blocked for > 120 seconds). I managed to come up with an easy way to reproduce this on my system (where /dev/sdb is the block device for my USB stick) with:
while true; do dd if=/dev/zero of=/dev/sdb bs=4M; done
With my reproduction here are the relevant bits from the hung task detector:
INFO: task udevd:294 blocked for more than 122 seconds. ... udevd D 0 294 1 0x00400008 Call trace: ... mutex_lock_nested+0x40/0x50 __blkdev_get+0x7c/0x3d4 blkdev_get+0x118/0x138 blkdev_open+0x94/0xa8 do_dentry_open+0x268/0x3a0 vfs_open+0x34/0x40 path_openat+0x39c/0xdf4 do_filp_open+0x90/0x10c do_sys_open+0x150/0x3c8 ...
... Showing all locks held in the system: ... 1 lock held by dd/2798: #0: ffffff814ac1a3b8 (&bdev->bd_mutex){+.+.}, at: __blkdev_put+0x50/0x204 ... dd D 0 2798 2764 0x00400208 Call trace: ... schedule+0x8c/0xbc io_schedule+0x1c/0x40 wait_on_page_bit_common+0x238/0x338 __lock_page+0x5c/0x68 write_cache_pages+0x194/0x500 generic_writepages+0x64/0xa4 blkdev_writepages+0x24/0x30 do_writepages+0x48/0xa8 __filemap_fdatawrite_range+0xac/0xd8 filemap_write_and_wait+0x30/0x84 __blkdev_put+0x88/0x204 blkdev_put+0xc4/0xe4 blkdev_close+0x28/0x38 __fput+0xe0/0x238 ____fput+0x1c/0x28 task_work_run+0xb0/0xe4 do_notify_resume+0xfc0/0x14bc work_pending+0x8/0x14
The problem appears related to the fact that my USB disk is terribly slow and that I have a lot of RAM in my system to cache things. Specifically my writes seem to be happening at ~15 MB/s and I've got ~4 GB of RAM in my system that can be used for buffering. To write 4 GB of buffer to disk thus takes ~4000 MB / ~15 MB/s = ~267 seconds.
The 267 second number is a problem because in __blkdev_put() we call sync_blockdev() while holding the bd_mutex. Any other callers who want the bd_mutex will be blocked for the whole time.
The problem is made worse because I believe blkdev_put() specifically tells other tasks (namely udev) to go try to access the device right around the same time we're going to hold the mutex for a long time.
Putting some traces around this (after disabling the hung task detector), I could confirm:

dd:    437.608600: __blkdev_put() right before sync_blockdev() for sdb
udevd: 437.623901: blkdev_open() right before blkdev_get() for sdb
dd:    661.468451: __blkdev_put() right after sync_blockdev() for sdb
udevd: 663.820426: blkdev_open() right after blkdev_get() for sdb
A simple fix for this is to realize that sync_blockdev() works fine if you're not holding the mutex. Also, it's not the end of the world if you sync a little early (though it can have performance impacts). Thus we can make a guess that we're going to need to do the sync and then do it without holding the mutex. We still do one last sync with the mutex but it should be much, much faster.
With this, my hung task warnings for my test case are gone.
Signed-off-by: Douglas Anderson dianders@chromium.org Reviewed-by: Guenter Roeck groeck@chromium.org Reviewed-by: Christoph Hellwig hch@lst.de Signed-off-by: Jens Axboe axboe@kernel.dk Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/block_dev.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/fs/block_dev.c b/fs/block_dev.c index 6168c61acb2c..2db79b6d5e6b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1874,6 +1874,16 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL;
+ /* + * Sync early if it looks like we're the last one. If someone else + * opens the block device between now and the decrement of bd_openers + * then we did a sync that we didn't need to, but that's not the end + * of the world and we want to avoid long (could be several minute) + * syncs while holding the mutex. + */ + if (bdev->bd_openers == 1) + sync_blockdev(bdev); + mutex_lock_nested(&bdev->bd_mutex, for_part); if (mode & FMODE_WRITE) bdev->bd_write_openers--;
From: Ian Rogers irogers@google.com
stable inclusion from linux-4.19.149 commit 318af7241223eea9fc16413b04a6915518ab1e9c
--------------------------------
[ Upstream commit 266150c94c69429cf6d18e130237224a047f5061 ]
Realloc of size zero is a free, not an error; avoid this causing a double free. Caught by clang's address sanitizer (a small illustration of the pitfall follows the report below):
==2634==ERROR: AddressSanitizer: attempting double-free on 0x6020000015f0 in thread T0: #0 0x5649659297fd in free llvm/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:123:3 #1 0x5649659e9251 in __zfree tools/lib/zalloc.c:13:2 #2 0x564965c0f92c in mem2node__exit tools/perf/util/mem2node.c:114:2 #3 0x564965a08b4c in perf_c2c__report tools/perf/builtin-c2c.c:2867:2 #4 0x564965a0616a in cmd_c2c tools/perf/builtin-c2c.c:2989:10 #5 0x564965944348 in run_builtin tools/perf/perf.c:312:11 #6 0x564965943235 in handle_internal_command tools/perf/perf.c:364:8 #7 0x5649659440c4 in run_argv tools/perf/perf.c:408:2 #8 0x564965942e41 in main tools/perf/perf.c:538:3
0x6020000015f0 is located 0 bytes inside of 1-byte region [0x6020000015f0,0x6020000015f1) freed by thread T0 here: #0 0x564965929da3 in realloc third_party/llvm/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:164:3 #1 0x564965c0f55e in mem2node__init tools/perf/util/mem2node.c:97:16 #2 0x564965a08956 in perf_c2c__report tools/perf/builtin-c2c.c:2803:8 #3 0x564965a0616a in cmd_c2c tools/perf/builtin-c2c.c:2989:10 #4 0x564965944348 in run_builtin tools/perf/perf.c:312:11 #5 0x564965943235 in handle_internal_command tools/perf/perf.c:364:8 #6 0x5649659440c4 in run_argv tools/perf/perf.c:408:2 #7 0x564965942e41 in main tools/perf/perf.c:538:3
previously allocated by thread T0 here: #0 0x564965929c42 in calloc third_party/llvm/llvm-project/compiler-rt/lib/asan/asan_malloc_linux.cpp:154:3 #1 0x5649659e9220 in zalloc tools/lib/zalloc.c:8:9 #2 0x564965c0f32d in mem2node__init tools/perf/util/mem2node.c:61:12 #3 0x564965a08956 in perf_c2c__report tools/perf/builtin-c2c.c:2803:8 #4 0x564965a0616a in cmd_c2c tools/perf/builtin-c2c.c:2989:10 #5 0x564965944348 in run_builtin tools/perf/perf.c:312:11 #6 0x564965943235 in handle_internal_command tools/perf/perf.c:364:8 #7 0x5649659440c4 in run_argv tools/perf/perf.c:408:2 #8 0x564965942e41 in main tools/perf/perf.c:538:3
v2: add a WARN_ON_ONCE when the free condition arises.
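For context, a minimal userspace illustration of the realloc pitfall (a hypothetical example, not the perf code; glibc frees the old block and returns NULL for a zero-size realloc, though the C standard leaves the exact behavior implementation-defined):

    #include <stdlib.h>

    int main(void)
    {
            char *entries = malloc(16);
            char *tmp = realloc(entries, 0);    /* frees entries, returns NULL */

            if (tmp)                /* buggy: NULL treated as "keep the old pointer" */
                    entries = tmp;

            free(entries);          /* double free: already released by realloc */
            return 0;
    }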
Signed-off-by: Ian Rogers irogers@google.com Acked-by: Jiri Olsa jolsa@redhat.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Mark Rutland mark.rutland@arm.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Stephane Eranian eranian@google.com Cc: clang-built-linux@googlegroups.com Link: http://lore.kernel.org/lkml/20200320182347.87675-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/mem2node.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/mem2node.c b/tools/perf/util/mem2node.c index c6fd81c02586..81c5a2e438b7 100644 --- a/tools/perf/util/mem2node.c +++ b/tools/perf/util/mem2node.c @@ -1,5 +1,6 @@ #include <errno.h> #include <inttypes.h> +#include <asm/bug.h> #include <linux/bitmap.h> #include "mem2node.h" #include "util.h" @@ -92,7 +93,7 @@ int mem2node__init(struct mem2node *map, struct perf_env *env)
/* Cut unused entries, due to merging. */ tmp_entries = realloc(entries, sizeof(*entries) * j); - if (tmp_entries) + if (tmp_entries || WARN_ON_ONCE(j == 0)) entries = tmp_entries;
for (i = 0; i < j; i++) {
From: Miklos Szeredi mszeredi@redhat.com
stable inclusion from linux-4.19.149 commit 59da76a1713f7fd82d9c18ec72be99085b557027
--------------------------------
[ Upstream commit 32f98877c57bee6bc27f443a96f49678a2cd6a50 ]
page_count() is unstable. Unless there has been an RCU grace period between when the page was removed from the page cache and now, a speculative reference may exist from the page cache.
Reported-by: Matthew Wilcox willy@infradead.org Signed-off-by: Miklos Szeredi mszeredi@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/fuse/dev.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 01e6ea11822b..c51c9a6881e4 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -831,7 +831,6 @@ static int fuse_check_page(struct page *page) { if (page_mapcount(page) || page->mapping != NULL || - page_count(page) != 1 || (page->flags & PAGE_FLAGS_CHECK_AT_PREP & ~(1 << PG_locked | 1 << PG_referenced |
From: Anshuman Khandual anshuman.khandual@arm.com
stable inclusion from linux-4.19.149 commit e682e0d53c390467100dadd0cebcf8f4f0b9498e
--------------------------------
[ Upstream commit 1ed1b90a0594c8c9d31e8bb8be25a2b37717dc9e ]
The ID_DFR0-based TraceFilt feature should not be exposed to guests. Hence let's drop it.
Cc: Catalin Marinas catalin.marinas@arm.com Cc: Will Deacon will@kernel.org Cc: Marc Zyngier maz@kernel.org Cc: Mark Rutland mark.rutland@arm.com Cc: James Morse james.morse@arm.com Cc: Suzuki K Poulose suzuki.poulose@arm.com Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org
Suggested-by: Mark Rutland mark.rutland@arm.com Signed-off-by: Anshuman Khandual anshuman.khandual@arm.com Reviewed-by: Suzuki K Poulose suzuki.poulose@arm.com Link: https://lore.kernel.org/r/1589881254-10082-3-git-send-email-anshuman.khandua... Signed-off-by: Will Deacon will@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/kernel/cpufeature.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 3c45f3fa7f98..6a00aa3cc876 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -305,7 +305,7 @@ static const struct arm64_ftr_bits ftr_id_pfr0[] = { };
static const struct arm64_ftr_bits ftr_id_dfr0[] = { - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 28, 4, 0), + /* [31:28] TraceFilt */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 24, 4, 0xf), /* PerfMon */ ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 20, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, 16, 4, 0),
From: Shreyas Joshi shreyas.joshi@biamp.com
stable inclusion from linux-4.19.149 commit c6a9585611a538466c8ad2421035c0ffa7fabc77
--------------------------------
[ Upstream commit 48021f98130880dd74286459a1ef48b5e9bc374f ]
If uboot passes a blank string to console_setup(), it results in trashed memory. Ultimately, the kernel crashes while freeing that memory.

This fix checks whether a blank parameter is being passed to console_setup() from uboot. If it detects that the console parameter is blank, it doesn't set up the serial device and gracefully exits.
Link: https://lore.kernel.org/r/20200522065306.83-1-shreyas.joshi@biamp.com Signed-off-by: Shreyas Joshi shreyas.joshi@biamp.com Acked-by: Sergey Senozhatsky sergey.senozhatsky@gmail.com [pmladek@suse.com: Better format the commit message and code, remove unnecessary brackets.] Signed-off-by: Petr Mladek pmladek@suse.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/printk/printk.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c93b9cef8148..79158fc8e578 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2172,6 +2172,9 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx;
+ if (str[0] == 0) + return 1; + if (_braille_console_setup(&str, &brl_options)) return 1;
From: Qian Cai cai@lca.pw
stable inclusion from linux-4.19.149 commit b7e24664cc816717ca2a45b773d950a9188fb5c1
--------------------------------
[ Upstream commit 1518ac272e789cae8c555d69951b032a275b7602 ]
Finishing a qemu-kvm run (-device vfio-pci,host=0001:01:00.0) triggers a few memory leaks after a while, because vfio_pci_set_ctx_trigger_single() calls eventfd_ctx_fdget() without a matching eventfd_ctx_put() later. Fix it by calling eventfd_ctx_put() on those contexts in vfio_pci_release() before vfio_device_release().
unreferenced object 0xebff008981cc2b00 (size 128):
  comm "qemu-kvm", pid 4043, jiffies 4294994816 (age 9796.310s)
  hex dump (first 32 bytes):
    01 00 00 00 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  ....kkkk.....N..
    ff ff ff ff 6b 6b 6b 6b ff ff ff ff ff ff ff ff  ....kkkk........
  backtrace:
    [<00000000917e8f8d>] slab_post_alloc_hook+0x74/0x9c
    [<00000000df0f2aa2>] kmem_cache_alloc_trace+0x2b4/0x3d4
    [<000000005fcec025>] do_eventfd+0x54/0x1ac
    [<0000000082791a69>] __arm64_sys_eventfd2+0x34/0x44
    [<00000000b819758c>] do_el0_svc+0x128/0x1dc
    [<00000000b244e810>] el0_sync_handler+0xd0/0x268
    [<00000000d495ef94>] el0_sync+0x164/0x180
unreferenced object 0x29ff008981cc4180 (size 128):
  comm "qemu-kvm", pid 4043, jiffies 4294994818 (age 9796.290s)
  hex dump (first 32 bytes):
    01 00 00 00 6b 6b 6b 6b 00 00 00 00 ad 4e ad de  ....kkkk.....N..
    ff ff ff ff 6b 6b 6b 6b ff ff ff ff ff ff ff ff  ....kkkk........
  backtrace:
    [<00000000917e8f8d>] slab_post_alloc_hook+0x74/0x9c
    [<00000000df0f2aa2>] kmem_cache_alloc_trace+0x2b4/0x3d4
    [<000000005fcec025>] do_eventfd+0x54/0x1ac
    [<0000000082791a69>] __arm64_sys_eventfd2+0x34/0x44
    [<00000000b819758c>] do_el0_svc+0x128/0x1dc
    [<00000000b244e810>] el0_sync_handler+0xd0/0x268
    [<00000000d495ef94>] el0_sync+0x164/0x180
Signed-off-by: Qian Cai cai@lca.pw Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/pci/vfio_pci.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 9f72a6ee13b5..86cd8bdfa9f2 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -409,6 +409,10 @@ static void vfio_pci_release(void *device_data) if (!(--vdev->refcnt)) { vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); + if (vdev->err_trigger) + eventfd_ctx_put(vdev->err_trigger); + if (vdev->req_trigger) + eventfd_ctx_put(vdev->req_trigger); }
mutex_unlock(&driver_lock);
From: Ian Rogers irogers@google.com
stable inclusion from linux-4.19.149 commit 56540590ce7c316947d6740edc0403182a1e1ade
--------------------------------
[ Upstream commit 3efc899d9afb3d03604f191a0be9669eabbfc4aa ]
If allocated, per_pkg_mask and metric_events need freeing.
Signed-off-by: Ian Rogers irogers@google.com Reviewed-by: Andi Kleen ak@linux.intel.com Cc: Adrian Hunter adrian.hunter@intel.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Jiri Olsa jolsa@redhat.com Cc: Mark Rutland mark.rutland@arm.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Stephane Eranian eranian@google.com Link: http://lore.kernel.org/lkml/20200512235918.10732-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/evsel.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c37594d57ab6..4390a8d8ba0b 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1291,6 +1291,8 @@ void perf_evsel__exit(struct perf_evsel *evsel) zfree(&evsel->group_name); zfree(&evsel->name); zfree(&evsel->pmu_name); + zfree(&evsel->per_pkg_mask); + zfree(&evsel->metric_events); perf_evsel__object.fini(evsel); }
From: Jiri Olsa jolsa@kernel.org
stable inclusion from linux-4.19.149 commit d911653688c588c22bdbc83459f87961c9d4399e
--------------------------------
[ Upstream commit ea9eb1f456a08c18feb485894185f7a4e31cc8a4 ]
Joakim reported a wrong duration_time value for intervals bigger than 4000 [1].

The problem is in the interval value we pass to the update_stats() function: it is typed as 'unsigned int', so the multiplication overflows once the result exceeds 2^32 (which happens between intervals 4000 and 5000).

Retype the passed value to unsigned long long.
[1] https://www.spinics.net/lists/linux-perf-users/msg11777.html
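For illustration, a minimal userspace sketch of the overflow (the variable name mirrors stat_config.interval; everything else is hypothetical):

  #include <stdio.h>

  int main(void)
  {
      unsigned int interval = 5000;   /* ms, mirrors stat_config.interval */

      /* both operands are 32-bit, so the product wraps around 2^32 */
      unsigned long long wrong = interval * 1000000;
      /* promoting one operand to 64 bits keeps the full value */
      unsigned long long right = interval * 1000000ULL;

      printf("wrong: %llu\nright: %llu\n", wrong, right);
      return 0;
  }

This prints 705032704 for the 32-bit product and 5000000000 for the 64-bit one, matching the reported symptom between intervals 4000 and 5000.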
Fixes: b90f1333ef08 ("perf stat: Update walltime_nsecs_stats in interval mode") Reported-by: Joakim Zhang qiangqing.zhang@nxp.com Signed-off-by: Jiri Olsa jolsa@kernel.org Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Andi Kleen ak@linux.intel.com Cc: Michael Petlan mpetlan@redhat.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Link: http://lore.kernel.org/lkml/20200518131445.3745083-1-jolsa@kernel.org Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6aae10ff954c..adabe9d4dc86 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -422,7 +422,7 @@ static void process_interval(void) }
init_stats(&walltime_nsecs_stats); - update_stats(&walltime_nsecs_stats, stat_config.interval * 1000000); + update_stats(&walltime_nsecs_stats, stat_config.interval * 1000000ULL); print_counters(&rs, 0, NULL); }
From: Xie XiuQi xiexiuqi@huawei.com
stable inclusion from linux-4.19.149 commit dd155a48a0c9b53404b30f6f92ccf9f8160378c1
--------------------------------
[ Upstream commit 07e9a6f538cbeecaf5c55b6f2991416f873cdcbd ]
We need to free "str" before returning when asprintf() fails, to avoid a memory leak.
Signed-off-by: Xie XiuQi xiexiuqi@huawei.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Hongbo Yao yaohongbo@huawei.com Cc: Jiri Olsa jolsa@redhat.com Cc: Li Bin huawei.libin@huawei.com Cc: Mark Rutland mark.rutland@arm.com Cc: Namhyung Kim namhyung@kernel.org Link: http://lore.kernel.org/lkml/20200521133218.30150-4-liwei391@huawei.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 46daa22b86e3..85ff4f68adc0 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -2690,7 +2690,7 @@ static char *prefix_if_not_in(const char *pre, char *str) return str;
if (asprintf(&n, "%s,%s", pre, str) < 0) - return NULL; + n = NULL;
free(str); return n;
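The resulting error-path shape, as a standalone sketch (prefix_string() is a hypothetical stand-in for prefix_if_not_in(); the point is that the error path falls through so the input string is always freed):

  #define _GNU_SOURCE    /* for asprintf() */
  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>

  static char *prefix_string(const char *pre, char *str)
  {
      char *n;

      if (asprintf(&n, "%s,%s", pre, str) < 0)
          n = NULL;    /* fall through so str is still freed */

      free(str);       /* consumed on success and on failure */
      return n;
  }

  int main(void)
  {
      char *dup = strdup("dso");

      if (!dup)
          return 1;
      char *s = prefix_string("cpu", dup);
      if (s) {
          puts(s);
          free(s);
      }
      return 0;
  }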
From: Ian Rogers irogers@google.com
stable inclusion from linux-4.19.149 commit cc6ae85020035734eb13597fd6e8b0074897b837
--------------------------------
[ Upstream commit a159e2fe89b4d1f9fb54b0ae418b961e239bf617 ]
Avoid a simple memory leak.
Signed-off-by: Ian Rogers irogers@google.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Alexei Starovoitov ast@kernel.org Cc: Andi Kleen ak@linux.intel.com Cc: Andrii Nakryiko andriin@fb.com Cc: Cong Wang xiyou.wangcong@gmail.com Cc: Daniel Borkmann daniel@iogearbox.net Cc: Jin Yao yao.jin@linux.intel.com Cc: Jiri Olsa jolsa@redhat.com Cc: John Fastabend john.fastabend@gmail.com Cc: John Garry john.garry@huawei.com Cc: Kajol Jain kjain@linux.ibm.com Cc: Kan Liang kan.liang@linux.intel.com Cc: Kim Phillips kim.phillips@amd.com Cc: Mark Rutland mark.rutland@arm.com Cc: Martin KaFai Lau kafai@fb.com Cc: Namhyung Kim namhyung@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Song Liu songliubraving@fb.com Cc: Stephane Eranian eranian@google.com Cc: Vince Weaver vincent.weaver@maine.edu Cc: Yonghong Song yhs@fb.com Cc: bpf@vger.kernel.org Cc: kp singh kpsingh@chromium.org Cc: netdev@vger.kernel.org Link: http://lore.kernel.org/lkml/20200508053629.210324-10-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/metricgroup.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/tools/perf/util/metricgroup.c b/tools/perf/util/metricgroup.c index 8b3dafe3fac3..6dcc6e1182a5 100644 --- a/tools/perf/util/metricgroup.c +++ b/tools/perf/util/metricgroup.c @@ -171,6 +171,7 @@ static int metricgroup__setup_events(struct list_head *groups, if (!evsel) { pr_debug("Cannot resolve %s: %s\n", eg->metric_name, eg->metric_expr); + free(metric_events); continue; } for (i = 0; i < eg->idnum; i++) @@ -178,11 +179,13 @@ static int metricgroup__setup_events(struct list_head *groups, me = metricgroup__lookup(metric_events_list, evsel, true); if (!me) { ret = -ENOMEM; + free(metric_events); break; } expr = malloc(sizeof(struct metric_expr)); if (!expr) { ret = -ENOMEM; + free(metric_events); break; } expr->metric_expr = eg->metric_expr;
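More generally, leaks like these are why kernel code tends to funnel every failure through one cleanup label. The patch above frees at each failure site because the loop continues to the next group; where control flow allows, the single-label idiom avoids the repetition. A hedged sketch of that idiom (all names hypothetical):

  #include <stdlib.h>

  static int step_one(void) { return 0; }     /* hypothetical stages */
  static int step_two(void) { return -1; }

  static int setup_events(void)
  {
      int ret = -1;
      void **metric_events = calloc(8, sizeof(*metric_events));

      if (!metric_events)
          return -1;

      if (step_one() < 0)
          goto out;    /* no per-site free() to forget */
      if (step_two() < 0)
          goto out;

      ret = 0;         /* ownership handed off here */
  out:
      if (ret)
          free(metric_events);
      return ret;
  }

  int main(void)
  {
      return setup_events() ? 1 : 0;
  }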
From: Adrian Hunter adrian.hunter@intel.com
stable inclusion from linux-4.19.149 commit a63689c06a6dd5c0cf2a9221927b9b1b2b2bb9c1
--------------------------------
[ Upstream commit 61f82e3fb697a8e85f22fdec786528af73dc36d1 ]
In the absence of any modules, no "modules" map is created, but there are other executable pages to map, due to the eBPF JIT, kprobes or ftrace. Map them by recognizing that the first "module" symbol is not necessarily from a module, and adjust the map accordingly.
Signed-off-by: Adrian Hunter adrian.hunter@intel.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Borislav Petkov bp@alien8.de Cc: H. Peter Anvin hpa@zytor.com Cc: Jiri Olsa jolsa@redhat.com Cc: Leo Yan leo.yan@linaro.org Cc: Mark Rutland mark.rutland@arm.com Cc: Masami Hiramatsu mhiramat@kernel.org Cc: Mathieu Poirier mathieu.poirier@linaro.org Cc: Peter Zijlstra peterz@infradead.org Cc: Steven Rostedt (VMware) rostedt@goodmis.org Cc: x86@kernel.org Link: http://lore.kernel.org/lkml/20200512121922.8997-10-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/symbol-elf.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index a701a8a48f00..166c621e0223 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -1421,6 +1421,7 @@ struct kcore_copy_info { u64 first_symbol; u64 last_symbol; u64 first_module; + u64 first_module_symbol; u64 last_module_symbol; size_t phnum; struct list_head phdrs; @@ -1497,6 +1498,8 @@ static int kcore_copy__process_kallsyms(void *arg, const char *name, char type, return 0;
if (strchr(name, '[')) { + if (!kci->first_module_symbol || start < kci->first_module_symbol) + kci->first_module_symbol = start; if (start > kci->last_module_symbol) kci->last_module_symbol = start; return 0; @@ -1694,6 +1697,10 @@ static int kcore_copy__calc_maps(struct kcore_copy_info *kci, const char *dir, kci->etext += page_size; }
+ if (kci->first_module_symbol && + (!kci->first_module || kci->first_module_symbol < kci->first_module)) + kci->first_module = kci->first_module_symbol; + kci->first_module = round_down(kci->first_module, page_size);
if (kci->last_module_symbol) {
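A toy sketch of the resulting map-start computation (the addresses are made up; round_down_u64() stands in for the kernel's round_down()):

  #include <stdint.h>
  #include <stdio.h>

  static uint64_t round_down_u64(uint64_t v, uint64_t align)
  {
      return v & ~(align - 1);    /* align must be a power of two */
  }

  int main(void)
  {
      uint64_t page_size = 4096;
      uint64_t first_module = 0xffffffffc0000000ULL;        /* hypothetical first real module */
      uint64_t first_module_symbol = 0xffffffffa0002468ULL; /* hypothetical BPF JIT symbol */

      /* mirror the patch: the map must start at the earlier of the two */
      if (first_module_symbol &&
          (!first_module || first_module_symbol < first_module))
          first_module = first_module_symbol;

      printf("map starts at %#llx\n",
             (unsigned long long)round_down_u64(first_module, page_size));
      return 0;
  }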
From: Alex Williamson alex.williamson@redhat.com
stable inclusion from linux-4.19.149 commit 41a77298809e7be112f91972d794aa231fbe27aa
--------------------------------
[ Upstream commit 5c5866c593bbd444d0339ede6a8fb5f14ff66d72 ]
The next use of the device will generate an underflow from the stale reference.
Cc: Qian Cai cai@lca.pw Fixes: 1518ac272e78 ("vfio/pci: fix memory leaks of eventfd ctx") Reported-by: Daniel Wagner dwagner@suse.de Reviewed-by: Cornelia Huck cohuck@redhat.com Tested-by: Daniel Wagner dwagner@suse.de Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/pci/vfio_pci.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 86cd8bdfa9f2..94fad366312f 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -409,10 +409,14 @@ static void vfio_pci_release(void *device_data) if (!(--vdev->refcnt)) { vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); - if (vdev->err_trigger) + if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); - if (vdev->req_trigger) + vdev->err_trigger = NULL; + } + if (vdev->req_trigger) { eventfd_ctx_put(vdev->req_trigger); + vdev->req_trigger = NULL; + } }
mutex_unlock(&driver_lock);
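The idiom here is the usual put-and-clear: drop the reference and poison the pointer in the same place, so a later release cycle cannot put it a second time. A userspace toy of the pattern (free() stands in for eventfd_ctx_put(); the struct is hypothetical):

  #include <stdlib.h>

  struct dev {
      int *err_trigger;    /* toy stand-in for an eventfd ctx */
  };

  static void dev_release(struct dev *d)
  {
      if (d->err_trigger) {
          free(d->err_trigger);    /* "put" the reference */
          d->err_trigger = NULL;   /* clear it so the next release is a no-op */
      }
  }

  int main(void)
  {
      struct dev d = { .err_trigger = malloc(sizeof(int)) };

      dev_release(&d);
      dev_release(&d);    /* harmless now; a double put before the fix */
      return 0;
  }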
From: Zhang Xiaoxu zhangxiaoxu5@huawei.com
stable inclusion from linux-4.19.149 commit 5f7ca306c7db558fc81d9b1a45d59d5e1332a8a0
--------------------------------
[ Upstream commit 95a3d8f3af9b0d63b43f221b630beaab9739d13a ]
When running xfstests generic/451, there is a BUG at mm/memcontrol.c:

page:ffffea000560f2c0 refcount:2 mapcount:0 mapping:000000008544e0ea index:0xf
mapping->aops:cifs_addr_ops dentry name:"tst-aio-dio-cycle-write.451"
flags: 0x2fffff80000001(locked)
raw: 002fffff80000001 ffffc90002023c50 ffffea0005280088 ffff88815cda0210
raw: 000000000000000f 0000000000000000 00000002ffffffff ffff88817287d000
page dumped because: VM_BUG_ON_PAGE(page->mem_cgroup)
page->mem_cgroup:ffff88817287d000
------------[ cut here ]------------
kernel BUG at mm/memcontrol.c:2659!
invalid opcode: 0000 [#1] SMP
CPU: 2 PID: 2038 Comm: xfs_io Not tainted 5.8.0-rc1 #44
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS ?-20190727_073836-buildvm-ppc64le-16.ppc.4
RIP: 0010:commit_charge+0x35/0x50
Code: 0d 48 83 05 54 b2 02 05 01 48 89 77 38 c3 48 c7 c6 78 4a ea ba 48 83 05 38 b2 02 05 01 e8 63 0d9
RSP: 0018:ffffc90002023a50 EFLAGS: 00010202
RAX: 0000000000000000 RBX: ffff88817287d000 RCX: 0000000000000000
RDX: 0000000000000000 RSI: ffff88817ac97ea0 RDI: ffff88817ac97ea0
RBP: ffffea000560f2c0 R08: 0000000000000203 R09: 0000000000000005
R10: 0000000000000030 R11: ffffc900020237a8 R12: 0000000000000000
R13: 0000000000000001 R14: 0000000000000001 R15: ffff88815a1272c0
FS:  00007f5071ab0800(0000) GS:ffff88817ac80000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 000055efcd5ca000 CR3: 000000015d312000 CR4: 00000000000006e0
DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
Call Trace:
  mem_cgroup_charge+0x166/0x4f0
  __add_to_page_cache_locked+0x4a9/0x710
  add_to_page_cache_locked+0x15/0x20
  cifs_readpages+0x217/0x1270
  read_pages+0x29a/0x670
  page_cache_readahead_unbounded+0x24f/0x390
  __do_page_cache_readahead+0x3f/0x60
  ondemand_readahead+0x1f1/0x470
  page_cache_async_readahead+0x14c/0x170
  generic_file_buffered_read+0x5df/0x1100
  generic_file_read_iter+0x10c/0x1d0
  cifs_strict_readv+0x139/0x170
  new_sync_read+0x164/0x250
  __vfs_read+0x39/0x60
  vfs_read+0xb5/0x1e0
  ksys_pread64+0x85/0xf0
  __x64_sys_pread64+0x22/0x30
  do_syscall_64+0x69/0x150
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x7f5071fcb1af
Code: Bad RIP value.
RSP: 002b:00007ffde2cdb8e0 EFLAGS: 00000293 ORIG_RAX: 0000000000000011
RAX: ffffffffffffffda RBX: 00007ffde2cdb990 RCX: 00007f5071fcb1af
RDX: 0000000000001000 RSI: 000055efcd5ca000 RDI: 0000000000000003
RBP: 0000000000000003 R08: 0000000000000000 R09: 0000000000000000
R10: 0000000000001000 R11: 0000000000000293 R12: 0000000000000001
R13: 000000000009f000 R14: 0000000000000000 R15: 0000000000001000
Modules linked in:
---[ end trace 725fa14a3e1af65c ]---
Since commit 3fea5a499d57 ("mm: memcontrol: convert page cache to a new mem_cgroup_charge() API") does not cancel the page charge, the pages may be added to the page cache twice:

  thread1                                  | thread2
  cifs_readpages                           |
    readpages_get_pages                    |
      add_to_page_cache_locked(head, index=n) = 0
                                           | readpages_get_pages
                                           |   add_to_page_cache_locked(head, index=n+1) = 0
      add_to_page_cache_locked(head, index=n+1) = -EEXIST

Then the next loop runs with the list head page's index=n+1 and page->mapping not NULL:

  readpages_get_pages
    add_to_page_cache_locked(head, index=n+1)
      commit_charge
        VM_BUG_ON_PAGE
So, we should not do the next loop when any page add to page cache failed.
Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Zhang Xiaoxu zhangxiaoxu5@huawei.com Signed-off-by: Steve French stfrench@microsoft.com Acked-by: Ronnie Sahlberg lsahlber@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/cifs/file.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 23b10982a0f4..35729722ac59 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -3812,7 +3812,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, break;
__SetPageLocked(page); - if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { + rc = add_to_page_cache_locked(page, mapping, page->index, gfp); + if (rc) { __ClearPageLocked(page); break; } @@ -3828,6 +3829,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, struct list_head *page_list, unsigned num_pages) { int rc; + int err = 0; struct list_head tmplist; struct cifsFileInfo *open_file = file->private_data; struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); @@ -3872,7 +3874,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, * the order of declining indexes. When we put the pages in * the rdata->pages, then we want them in increasing order. */ - while (!list_empty(page_list)) { + while (!list_empty(page_list) && !err) { unsigned int i, nr_pages, bytes, rsize; loff_t offset; struct page *page, *tpage; @@ -3896,9 +3898,10 @@ static int cifs_readpages(struct file *file, struct address_space *mapping, return 0; }
- rc = readpages_get_pages(mapping, page_list, rsize, &tmplist, + nr_pages = 0; + err = readpages_get_pages(mapping, page_list, rsize, &tmplist, &nr_pages, &offset, &bytes); - if (rc) { + if (!nr_pages) { add_credits_and_wake_if(server, credits, 0); break; }
From: Sagi Grimberg sagi@grimberg.me
stable inclusion from linux-4.19.149 commit 03dfb191acea76e6f92379abdbb5335139b28ffa
--------------------------------
[ Upstream commit 3b4b19721ec652ad2c4fe51dfbe5124212b5f581 ]
Revert fab7772bfbcf ("nvme-multipath: revalidate nvme_ns_head gendisk in nvme_validate_ns")
When adding a new namespace to the head disk (via nvme_mpath_set_live) we will see a partition scan, which triggers I/O on the mpath device node. This process will usually be triggered from the scan_work, which holds the scan_lock. If I/O blocks (e.g. after an ANA change we currently have only available paths, but none are accessible), this can deadlock on the head disk bd_mutex, as both the partition scan I/O takes it and the head disk revalidation takes it to check for resize (also triggered from scan_work on a different path). See trace [1].
The mpath disk revalidation was originally added to detect online disk size change, but this is no longer needed since commit cb224c3af4df ("nvme: Convert to use set_capacity_revalidate_and_notify") which already updates resize info without unnecessarily revalidating the disk (the mpath disk doesn't even implement .revalidate_disk fop).
[1]:
Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 9c0836106460..801c5b082782 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1591,7 +1591,6 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); - revalidate_disk(ns->head->disk); } #endif }
From: Zeng Tao prime.zeng@hisilicon.com
stable inclusion from linux-4.19.149 commit 0d1682ca6d1314c27d07afacda4dd51baf5fcd94
--------------------------------
[ Upstream commit b872d0640840018669032b20b6375a478ed1f923 ]
The vfio_pci_release call will free and clear the error and request eventfd ctxs, while these ctxs could be in use at the same time in functions like vfio_pci_request. They are expected to be protected by the vdev->igate mutex, which is missing in vfio_pci_release.

This issue was introduced by commit 1518ac272e78 ("vfio/pci: fix memory leaks of eventfd ctx"), and since commit 5c5866c593bb ("vfio/pci: Clear error and request eventfd ctx after releasing") it is very easy to trigger a kernel panic like this:
[ 9513.904346] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000008 [ 9513.913091] Mem abort info: [ 9513.915871] ESR = 0x96000006 [ 9513.918912] EC = 0x25: DABT (current EL), IL = 32 bits [ 9513.924198] SET = 0, FnV = 0 [ 9513.927238] EA = 0, S1PTW = 0 [ 9513.930364] Data abort info: [ 9513.933231] ISV = 0, ISS = 0x00000006 [ 9513.937048] CM = 0, WnR = 0 [ 9513.940003] user pgtable: 4k pages, 48-bit VAs, pgdp=0000007ec7d12000 [ 9513.946414] [0000000000000008] pgd=0000007ec7d13003, p4d=0000007ec7d13003, pud=0000007ec728c003, pmd=0000000000000000 [ 9513.956975] Internal error: Oops: 96000006 [#1] PREEMPT SMP [ 9513.962521] Modules linked in: vfio_pci vfio_virqfd vfio_iommu_type1 vfio hclge hns3 hnae3 [last unloaded: vfio_pci] [ 9513.972998] CPU: 4 PID: 1327 Comm: bash Tainted: G W 5.8.0-rc4+ #3 [ 9513.980443] Hardware name: Huawei TaiShan 2280 V2/BC82AMDC, BIOS 2280-V2 CS V3.B270.01 05/08/2020 [ 9513.989274] pstate: 80400089 (Nzcv daIf +PAN -UAO BTYPE=--) [ 9513.994827] pc : _raw_spin_lock_irqsave+0x48/0x88 [ 9513.999515] lr : eventfd_signal+0x6c/0x1b0 [ 9514.003591] sp : ffff800038a0b960 [ 9514.006889] x29: ffff800038a0b960 x28: ffff007ef7f4da10 [ 9514.012175] x27: ffff207eefbbfc80 x26: ffffbb7903457000 [ 9514.017462] x25: ffffbb7912191000 x24: ffff007ef7f4d400 [ 9514.022747] x23: ffff20be6e0e4c00 x22: 0000000000000008 [ 9514.028033] x21: 0000000000000000 x20: 0000000000000000 [ 9514.033321] x19: 0000000000000008 x18: 0000000000000000 [ 9514.038606] x17: 0000000000000000 x16: ffffbb7910029328 [ 9514.043893] x15: 0000000000000000 x14: 0000000000000001 [ 9514.049179] x13: 0000000000000000 x12: 0000000000000002 [ 9514.054466] x11: 0000000000000000 x10: 0000000000000a00 [ 9514.059752] x9 : ffff800038a0b840 x8 : ffff007ef7f4de60 [ 9514.065038] x7 : ffff007fffc96690 x6 : fffffe01faffb748 [ 9514.070324] x5 : 0000000000000000 x4 : 0000000000000000 [ 9514.075609] x3 : 0000000000000000 x2 : 0000000000000001 [ 9514.080895] x1 : ffff007ef7f4d400 x0 : 0000000000000000 [ 9514.086181] Call trace: [ 9514.088618] _raw_spin_lock_irqsave+0x48/0x88 [ 9514.092954] eventfd_signal+0x6c/0x1b0 [ 9514.096691] vfio_pci_request+0x84/0xd0 [vfio_pci] [ 9514.101464] vfio_del_group_dev+0x150/0x290 [vfio] [ 9514.106234] vfio_pci_remove+0x30/0x128 [vfio_pci] [ 9514.111007] pci_device_remove+0x48/0x108 [ 9514.115001] device_release_driver_internal+0x100/0x1b8 [ 9514.120200] device_release_driver+0x28/0x38 [ 9514.124452] pci_stop_bus_device+0x68/0xa8 [ 9514.128528] pci_stop_and_remove_bus_device+0x20/0x38 [ 9514.133557] pci_iov_remove_virtfn+0xb4/0x128 [ 9514.137893] sriov_disable+0x3c/0x108 [ 9514.141538] pci_disable_sriov+0x28/0x38 [ 9514.145445] hns3_pci_sriov_configure+0x48/0xb8 [hns3] [ 9514.150558] sriov_numvfs_store+0x110/0x198 [ 9514.154724] dev_attr_store+0x44/0x60 [ 9514.158373] sysfs_kf_write+0x5c/0x78 [ 9514.162018] kernfs_fop_write+0x104/0x210 [ 9514.166010] __vfs_write+0x48/0x90 [ 9514.169395] vfs_write+0xbc/0x1c0 [ 9514.172694] ksys_write+0x74/0x100 [ 9514.176079] __arm64_sys_write+0x24/0x30 [ 9514.179987] el0_svc_common.constprop.4+0x110/0x200 [ 9514.184842] do_el0_svc+0x34/0x98 [ 9514.188144] el0_svc+0x14/0x40 [ 9514.191185] el0_sync_handler+0xb0/0x2d0 [ 9514.195088] el0_sync+0x140/0x180 [ 9514.198389] Code: b9001020 d2800000 52800022 f9800271 (885ffe61) [ 9514.204455] ---[ end trace 648de00c8406465f ]--- [ 9514.212308] note: bash[1327] exited with preempt_count 1
Cc: Qian Cai cai@lca.pw Cc: Alex Williamson alex.williamson@redhat.com Fixes: 1518ac272e78 ("vfio/pci: fix memory leaks of eventfd ctx") Signed-off-by: Zeng Tao prime.zeng@hisilicon.com Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/pci/vfio_pci.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 94fad366312f..58e7336b2748 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -409,14 +409,19 @@ static void vfio_pci_release(void *device_data) if (!(--vdev->refcnt)) { vfio_spapr_pci_eeh_release(vdev->pdev); vfio_pci_disable(vdev); + mutex_lock(&vdev->igate); if (vdev->err_trigger) { eventfd_ctx_put(vdev->err_trigger); vdev->err_trigger = NULL; } + mutex_unlock(&vdev->igate); + + mutex_lock(&vdev->igate); if (vdev->req_trigger) { eventfd_ctx_put(vdev->req_trigger); vdev->req_trigger = NULL; } + mutex_unlock(&vdev->igate); }
mutex_unlock(&driver_lock);
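The same pattern in a userspace sketch with pthreads (all names are toy stand-ins: request() for vfio_pci_request(), free()/NULL for eventfd_ctx_put(), the global for vdev->err_trigger): the pointer must only be read or cleared under the lock that the signalling path also takes.

  #include <pthread.h>
  #include <stdlib.h>

  static pthread_mutex_t igate = PTHREAD_MUTEX_INITIALIZER;
  static int *trigger;    /* stands in for vdev->err_trigger */

  static void request(void)    /* stands in for vfio_pci_request() */
  {
      pthread_mutex_lock(&igate);
      if (trigger)
          (*trigger)++;        /* stands in for eventfd_signal() */
      pthread_mutex_unlock(&igate);
  }

  static void release(void)    /* stands in for vfio_pci_release() */
  {
      pthread_mutex_lock(&igate);
      free(trigger);           /* stands in for eventfd_ctx_put() */
      trigger = NULL;
      pthread_mutex_unlock(&igate);
  }

  int main(void)
  {
      trigger = calloc(1, sizeof(*trigger));
      request();
      release();
      request();    /* safe: sees NULL under the lock */
      return 0;
  }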
From: Jin Yao yao.jin@linux.intel.com
stable inclusion from linux-4.19.149 commit 31c5c44707d8eb6809100a512b0877da51f795c2
--------------------------------
[ Upstream commit 8510895bafdbf7c4dd24c22946d925691135c2b2 ]
A big uncore event group is split into multiple small groups which only include the uncore events from the same PMU. This has been supported in the commit 3cdc5c2cb924a ("perf parse-events: Handle uncore event aliases in small groups properly").
If the event's PMU name starts to repeat, it must be a new event. That can be used to distinguish the leader from other members. But currently it only compares the pmu_name pointers (leader->pmu_name == evsel->pmu_name).
If we use "perf stat -M LLC_MISSES.PCIE_WRITE -a" on cascadelakex, the event list is:
evsel->name                                  evsel->pmu_name
---------------------------------------------------------------
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_4 (as leader)
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_2
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_0
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_5
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_3
unc_iio_data_req_of_cpu.mem_write.part0      uncore_iio_1
unc_iio_data_req_of_cpu.mem_write.part1      uncore_iio_4
......
For the event "unc_iio_data_req_of_cpu.mem_write.part1" with "uncore_iio_4", it should be the event from PMU "uncore_iio_4". It's not a new leader for this PMU.
But if we use "(leader->pmu_name == evsel->pmu_name)", the check would be failed and the event is stored to leaders[] as a new PMU leader.
So this patch uses strcmp to compare the PMU name between events.
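As a minimal illustration (plain userspace C, not the perf code), pointer equality and string equality differ exactly when two identical names live in separate allocations:

  #include <stdio.h>
  #include <string.h>

  int main(void)
  {
      char a[] = "uncore_iio_4";
      char b[] = "uncore_iio_4";

      printf("pointer equal: %d\n", a == b);          /* 0: different objects */
      printf("strcmp equal:  %d\n", !strcmp(a, b));   /* 1: same contents */
      return 0;
  }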
Fixes: d4953f7ef1a2 ("perf parse-events: Fix 3 use after frees found with clang ASAN") Signed-off-by: Jin Yao yao.jin@linux.intel.com Acked-by: Jiri Olsa jolsa@redhat.com Cc: Alexander Shishkin alexander.shishkin@linux.intel.com Cc: Andi Kleen ak@linux.intel.com Cc: Jin Yao yao.jin@intel.com Cc: Kan Liang kan.liang@linux.intel.com Cc: Peter Zijlstra peterz@infradead.org Link: http://lore.kernel.org/lkml/20200430003618.17002-1-yao.jin@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo acme@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/util/parse-events.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 6d087d9acd5e..0eff0c3ba9ee 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1421,12 +1421,11 @@ parse_events__set_leader_for_uncore_aliase(char *name, struct list_head *list, * event. That can be used to distinguish the leader from * other members, even they have the same event name. */ - if ((leader != evsel) && (leader->pmu_name == evsel->pmu_name)) { + if ((leader != evsel) && + !strcmp(leader->pmu_name, evsel->pmu_name)) { is_leader = false; continue; } - /* The name is always alias name */ - WARN_ON(strcmp(leader->name, evsel->name));
/* Store the leader event for each PMU */ leaders[nr_pmu++] = (uintptr_t) evsel;
From: Anthony Iliopoulos ailiop@suse.com
stable inclusion from linux-4.19.149 commit 906c9129787bf890f3f1b562ddac45c3ec0965a8
--------------------------------
[ Upstream commit 05b29021fba5e725dd385151ef00b6340229b500 ]
Commit 3b4b19721ec652 ("nvme: fix possible deadlock when I/O is blocked") reverted multipath head disk revalidation due to deadlocks caused by holding the bd_mutex during revalidate.
Updating the multipath disk blockdev size is still required though for userspace to be able to observe any resizing while the device is mounted. Directly update the bdev inode size to avoid unnecessarily holding the bdev->bd_mutex.
Fixes: 3b4b19721ec652 ("nvme: fix possible deadlock when I/O is blocked")
Signed-off-by: Anthony Iliopoulos ailiop@suse.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/nvme.h | 13 +++++++++++++ 2 files changed, 14 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 801c5b082782..b12a658ad2d0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1591,6 +1591,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) if (ns->head->disk) { nvme_update_disk_info(ns->head->disk, ns, id); blk_queue_stack_limits(ns->head->disk->queue, ns->queue); + nvme_mpath_update_disk_size(ns->head->disk); } #endif } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 31c1496f938f..42397795bc98 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -503,6 +503,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); }
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk) +{ + struct block_device *bdev = bdget_disk(disk, 0); + + if (bdev) { + bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT); + bdput(bdev); + } +} + extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state;
@@ -569,6 +579,9 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys) static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys) { } +static inline void nvme_mpath_update_disk_size(struct gendisk *disk) +{ +} #endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
From: Sven Schnelle svens@linux.ibm.com
stable inclusion from linux-4.19.149 commit aafa75ff39d05ad8011c1b8fa118c36acec9661a
--------------------------------
[ Upstream commit 73ac74c7d489756d2313219a108809921dbfaea1 ]
Switch order so that locking state is consistent even if the IRQ tracer calls into lockdep again.
Acked-by: Peter Zijlstra peterz@infradead.org Signed-off-by: Sven Schnelle svens@linux.ibm.com Signed-off-by: Vasily Gorbik gor@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_preemptirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c index 71f553cceb3c..0e373cb0106b 100644 --- a/kernel/trace/trace_preemptirq.c +++ b/kernel/trace/trace_preemptirq.c @@ -59,14 +59,14 @@ EXPORT_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + lockdep_hardirqs_off(CALLER_ADDR0); + if (!this_cpu_read(tracing_irq_cpu)) { this_cpu_write(tracing_irq_cpu, 1); tracer_hardirqs_off(CALLER_ADDR0, caller_addr); if (!in_nmi()) trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); } - - lockdep_hardirqs_off(CALLER_ADDR0); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_TRACE_IRQFLAGS */
From: Yonghong Song yhs@fb.com
stable inclusion from linux-4.19.149 commit e1a75e94a3acf78e6afdd548a5d504fc29cbc953
--------------------------------
[ Upstream commit ce880cb825fcc22d4e39046a6c3a3a7f6603883d ]
Running selftest ./btf_btf -p, the kernel had the following warning:

[   51.528185] WARNING: CPU: 3 PID: 1756 at kernel/bpf/hashtab.c:717 htab_map_get_next_key+0x2eb/0x300
[   51.529217] Modules linked in:
[   51.529583] CPU: 3 PID: 1756 Comm: test_btf Not tainted 5.9.0-rc1+ #878
[   51.530346] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.9.3-1.el7.centos 04/01/2014
[   51.531410] RIP: 0010:htab_map_get_next_key+0x2eb/0x300
...
[   51.542826] Call Trace:
[   51.543119]  map_seq_next+0x53/0x80
[   51.543528]  seq_read+0x263/0x400
[   51.543932]  vfs_read+0xad/0x1c0
[   51.544311]  ksys_read+0x5f/0xe0
[   51.544689]  do_syscall_64+0x33/0x40
[   51.545116]  entry_SYSCALL_64_after_hwframe+0x44/0xa9
The related source code in kernel/bpf/hashtab.c:

 709 static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 710 {
 711         struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
 712         struct hlist_nulls_head *head;
 713         struct htab_elem *l, *next_l;
 714         u32 hash, key_size;
 715         int i = 0;
 716
 717         WARN_ON_ONCE(!rcu_read_lock_held());
In kernel/bpf/inode.c, the bpffs map pretty-printer calls map->ops->map_get_next_key() without holding rcu_read_lock(), hence causing the above warning. To fix the issue, just surround the map->ops->map_get_next_key() call with rcu_read_lock()/rcu_read_unlock().
Fixes: a26ca7c982cb ("bpf: btf: Add pretty print support to the basic arraymap") Reported-by: Alexei Starovoitov ast@kernel.org Signed-off-by: Yonghong Song yhs@fb.com Signed-off-by: Alexei Starovoitov ast@kernel.org Acked-by: Andrii Nakryiko andriin@fb.com Cc: Martin KaFai Lau kafai@fb.com Link: https://lore.kernel.org/bpf/20200916004401.146277-1-yhs@fb.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/bpf/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index c04815bb15cc..11fade89c1f3 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -207,10 +207,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos) else prev_key = key;
+ rcu_read_lock(); if (map->ops->map_get_next_key(map, prev_key, key)) { map_iter(m)->done = true; - return NULL; + key = NULL; } + rcu_read_unlock(); return key; }
From: Tom Rix trix@redhat.com
stable inclusion from linux-4.19.149 commit 240dd5118a9e0454f280ffeae63f22bd14735733
--------------------------------
commit 46bbe5c671e06f070428b9be142cc4ee5cedebac upstream.
clang static analyzer reports this problem
trace_events_hist.c:3824:3: warning: Attempt to free released memory
        kfree(hist_data->attrs->var_defs.name[i]);
In parse_var_defs(), if there is a problem allocating var_defs.expr, the earlier var_defs.name is freed. This free is duplicated by free_var_defs(), which frees the rest of the list.

Because free_var_defs() has to run anyway, remove the second free from parse_var_defs().
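The shape of the bug, as a toy sketch (all names hypothetical): once a shared cleanup routine owns the list, error paths must not free elements themselves.

  #include <stdlib.h>
  #include <string.h>

  static char *names[2];

  static void free_defs(void)    /* shared cleanup, runs anyway */
  {
      for (int i = 0; i < 2; i++) {
          free(names[i]);
          names[i] = NULL;
      }
  }

  static int parse_defs(void)
  {
      names[0] = strdup("var0");
      if (!names[0])
          return -1;

      names[1] = strdup("var1");
      if (!names[1]) {
          /* The buggy version also did free(names[0]) here, which is
           * a double free, since free_defs() frees it too. */
          return -1;    /* caller runs free_defs() anyway */
      }
      return 0;
  }

  int main(void)
  {
      int ret = parse_defs();

      free_defs();
      return ret ? 1 : 0;
  }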
Link: https://lkml.kernel.org/r/20200907135845.15804-1-trix@redhat.com
Cc: stable@vger.kernel.org Fixes: 30350d65ac56 ("tracing: Add variable support to hist triggers") Reviewed-by: Tom Zanussi tom.zanussi@linux.intel.com Signed-off-by: Tom Rix trix@redhat.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_events_hist.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0fb92d0c7b20..f9493e1d31d5 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -4124,7 +4124,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL); if (!s) { - kfree(hist_data->attrs->var_defs.name[n_vars]); ret = -ENOMEM; goto free; }
From: Masami Hiramatsu mhiramat@kernel.org
stable inclusion from linux-4.19.149 commit ce7ff920092130f249b75f9fe177edb3362fefe8
--------------------------------
commit 3031313eb3d549b7ad6f9fbcc52ba04412e3eb9e upstream.
Commit 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") fixed one bug but did not fix it completely. If we run the kprobe_module.tc of ftracetest, the kernel shows a warning as below.
# ./ftracetest test.d/kprobe/kprobe_module.tc
=== Ftrace unit tests === [1] Kprobe dynamic event - probing module ... [ 22.400215] ------------[ cut here ]------------ [ 22.400962] Failed to disarm kprobe-ftrace at trace_printk_irq_work+0x0/0x7e [trace_printk] (-2) [ 22.402139] WARNING: CPU: 7 PID: 200 at kernel/kprobes.c:1091 __disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.403358] Modules linked in: trace_printk(-) [ 22.404028] CPU: 7 PID: 200 Comm: rmmod Not tainted 5.9.0-rc2+ #66 [ 22.404870] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/2014 [ 22.406139] RIP: 0010:__disarm_kprobe_ftrace.isra.0+0x7e/0xa0 [ 22.406947] Code: 30 8b 03 eb c9 80 3d e5 09 1f 01 00 75 dc 49 8b 34 24 89 c2 48 c7 c7 a0 c2 05 82 89 45 e4 c6 05 cc 09 1f 01 01 e8 a9 c7 f0 ff <0f> 0b 8b 45 e4 eb b9 89 c6 48 c7 c7 70 c2 05 82 89 45 e4 e8 91 c7 [ 22.409544] RSP: 0018:ffffc90000237df0 EFLAGS: 00010286 [ 22.410385] RAX: 0000000000000000 RBX: ffffffff83066024 RCX: 0000000000000000 [ 22.411434] RDX: 0000000000000001 RSI: ffffffff810de8d3 RDI: ffffffff810de8d3 [ 22.412687] RBP: ffffc90000237e10 R08: 0000000000000001 R09: 0000000000000001 [ 22.413762] R10: 0000000000000000 R11: 0000000000000001 R12: ffff88807c478640 [ 22.414852] R13: ffffffff8235ebc0 R14: ffffffffa00060c0 R15: 0000000000000000 [ 22.415941] FS: 00000000019d48c0(0000) GS:ffff88807d7c0000(0000) knlGS:0000000000000000 [ 22.417264] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 22.418176] CR2: 00000000005bb7e3 CR3: 0000000078f7a000 CR4: 00000000000006a0 [ 22.419309] Call Trace: [ 22.419990] kill_kprobe+0x94/0x160 [ 22.420652] kprobes_module_callback+0x64/0x230 [ 22.421470] notifier_call_chain+0x4f/0x70 [ 22.422184] blocking_notifier_call_chain+0x49/0x70 [ 22.422979] __x64_sys_delete_module+0x1ac/0x240 [ 22.423733] do_syscall_64+0x38/0x50 [ 22.424366] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 22.425176] RIP: 0033:0x4bb81d [ 22.425741] Code: 00 c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 e0 ff ff ff f7 d8 64 89 01 48 [ 22.428726] RSP: 002b:00007ffc70fef008 EFLAGS: 00000246 ORIG_RAX: 00000000000000b0 [ 22.430169] RAX: ffffffffffffffda RBX: 00000000019d48a0 RCX: 00000000004bb81d [ 22.431375] RDX: 0000000000000000 RSI: 0000000000000880 RDI: 00007ffc70fef028 [ 22.432543] RBP: 0000000000000880 R08: 00000000ffffffff R09: 00007ffc70fef320 [ 22.433692] R10: 0000000000656300 R11: 0000000000000246 R12: 00007ffc70fef028 [ 22.434635] R13: 0000000000000000 R14: 0000000000000002 R15: 0000000000000000 [ 22.435682] irq event stamp: 1169 [ 22.436240] hardirqs last enabled at (1179): [<ffffffff810df542>] console_unlock+0x422/0x580 [ 22.437466] hardirqs last disabled at (1188): [<ffffffff810df19b>] console_unlock+0x7b/0x580 [ 22.438608] softirqs last enabled at (866): [<ffffffff81c0038e>] __do_softirq+0x38e/0x490 [ 22.439637] softirqs last disabled at (859): [<ffffffff81a00f42>] asm_call_on_stack+0x12/0x20 [ 22.440690] ---[ end trace 1e7ce7e1e4567276 ]--- [ 22.472832] trace_kprobe: This probe might be able to register after target module is loaded. Continue.
This is because kill_kprobe() calls disarm_kprobe_ftrace() even if the given probe is not enabled. In that case, ftrace_set_filter_ip() fails because the given probe point is not registered to ftrace.

Fix this by checking that the given (going-away) probe is enabled before invoking disarm_kprobe_ftrace().
Link: https://lkml.kernel.org/r/159888672694.1411785.5987998076694782591.stgit@dev...
Fixes: 0cb2f1372baa ("kprobes: Fix NULL pointer dereference at kprobe_ftrace_handler") Cc: Ingo Molnar mingo@kernel.org Cc: "Naveen N . Rao" naveen.n.rao@linux.ibm.com Cc: Anil S Keshavamurthy anil.s.keshavamurthy@intel.com Cc: David Miller davem@davemloft.net Cc: Muchun Song songmuchun@bytedance.com Cc: Chengming Zhou zhouchengming@bytedance.com Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu mhiramat@kernel.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f231d3f335a3..42dc9347da14 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -2072,9 +2072,10 @@ static void kill_kprobe(struct kprobe *p)
/* * The module is going away. We should disarm the kprobe which - * is using ftrace. + * is using ftrace, because ftrace framework is still available at + * MODULE_STATE_GOING notification. */ - if (kprobe_ftrace(p)) + if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed) disarm_kprobe_ftrace(p); }
From: Gao Xiang hsiangkao@redhat.com
stable inclusion from linux-4.19.149 commit f3e8ed3d33fa963f1b6827977696235852cdd8d9
--------------------------------
commit 41663430588c737dd735bad5a0d1ba325dcabd59 upstream.
SWP_FS is used to make swap_{read,write}page() go through the filesystem, and it's only used for swap files over NFS. So !SWP_FS means non-NFS for now; it could be either file-backed or device-backed. Something similar applies to the legacy SWP_FILE.
So in order to achieve the goal of the original patch, SWP_BLKDEV should be used instead.
FS corruption can be observed with SSD device + XFS + fragmented swapfile due to CONFIG_THP_SWAP=y.
I reproduced the issue with the following details:
Environment:
QEMU + upstream kernel + buildroot + NVMe (2 GB)
Kernel config:
CONFIG_BLK_DEV_NVME=y
CONFIG_THP_SWAP=y
Some reproducible steps:
mkfs.xfs -f /dev/nvme0n1
mkdir /tmp/mnt
mount /dev/nvme0n1 /tmp/mnt
bs="32k"
sz="1024m"    # doesn't matter too much, I also tried 16m
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -F -S 0 -b $bs 0 $sz" -c "fdatasync" /tmp/mnt/sw
xfs_io -f -c "pwrite -R -b $bs 0 $sz" -c "fsync" /tmp/mnt/sw

mkswap /tmp/mnt/sw
swapon /tmp/mnt/sw

stress --vm 2 --vm-bytes 600M    # doesn't matter too much as well
Symptoms:
 - FS corruption (e.g. checksum failure)
 - memory corruption at: 0xd2808010
 - segfault
Fixes: f0eea189e8e9 ("mm, THP, swap: Don't allocate huge cluster for file backed swap device") Fixes: 38d8b4e6bdc8 ("mm, THP, swap: delay splitting THP during swap out") Signed-off-by: Gao Xiang hsiangkao@redhat.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: "Huang, Ying" ying.huang@intel.com Reviewed-by: Yang Shi shy828301@gmail.com Acked-by: Rafael Aquini aquini@redhat.com Cc: Matthew Wilcox willy@infradead.org Cc: Carlos Maiolino cmaiolino@redhat.com Cc: Eric Sandeen esandeen@redhat.com Cc: Dave Chinner david@fromorbit.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200820045323.7809-1-hsiangkao@redhat.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: mm/swapfile.c [yyl: keep same as mainline] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c index 765fca3451d2..46e6b0c06f09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1000,7 +1000,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FS)) + if (si->flags & SWP_BLKDEV) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.150 commit 2fd5a462eb7b39694ae013450dc47d84cdf7204a
--------------------------------
commit b40341fad6cc2daa195f8090fd3348f18fff640a upstream.
The first thing that the ftrace function callback helper functions should do is to check for recursion. Peter Zijlstra found that when "rcu_is_watching()" had its notrace removed, it caused perf function tracing to crash. This is because the call to rcu_is_watching() is made before function recursion is checked, and if it is traced, it will cause an infinite recursion loop.

rcu_is_watching() should still stay notrace, but to prevent this it should never have crashed in the first place. The recursion prevention must be the first thing done in callback functions.
Link: https://lore.kernel.org/r/20200929112541.GM2628@hirez.programming.kicks-ass....
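A hedged sketch of the ordering rule (a toy thread-local flag, not the kernel's per-context recursion bits): set the guard before calling anything that might itself be traced.

  #include <stdbool.h>

  static _Thread_local bool in_callback;

  static bool maybe_traced_check(void)    /* stands in for rcu_is_watching() */
  {
      return true;
  }

  static void do_work(void) { }

  static void callback(void)
  {
      if (in_callback)    /* recursion guard comes first */
          return;
      in_callback = true;

      if (maybe_traced_check())    /* now safe even if this recurses */
          do_work();

      in_callback = false;
  }

  int main(void)
  {
      callback();
      return 0;
  }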
Cc: stable@vger.kernel.org Cc: Paul McKenney paulmck@kernel.org Fixes: c68c0fa293417 ("ftrace: Have ftrace_ops_get_func() handle RCU and PER_CPU flags too") Acked-by: Peter Zijlstra (Intel) peterz@infradead.org Reported-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 264b7262d6a9..fd42d3e685f4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6363,16 +6363,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit;
- if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) - return; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return;
preempt_disable_notrace();
- op->func(ip, parent_ip, op, regs); + if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) + op->func(ip, parent_ip, op, regs);
preempt_enable_notrace(); trace_clear_recursion(bit);
From: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com
stable inclusion from linux-4.19.150 commit c2df194a0d50bc1370c6761f5b80d3a32f42bcd4
--------------------------------
[ Upstream commit 52a3974feb1a3eec25d8836d37a508b67b0a9cd0 ]
Get and put the reference to the ctrl in nvme_dev_open() and nvme_dev_release(), before and after the module get/put for the ctrl, in the char device file operations.

Introduce a char_dev release function and get/put the controller and module, which allows us to fix a potential Oops that can be easily reproduced with a passthru ctrl (although the problem also exists with pure user access):
Entering kdb (current=0xffff8887f8290000, pid 3128) on processor 30 Oops: (null) due to oops @ 0xffffffffa01019ad CPU: 30 PID: 3128 Comm: bash Tainted: G W OE 5.8.0-rc4nvme-5.9+ #35 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.4 RIP: 0010:nvme_free_ctrl+0x234/0x285 [nvme_core] Code: 57 10 a0 e8 73 bf 02 e1 ba 3d 11 00 00 48 c7 c6 98 33 10 a0 48 c7 c7 1d 57 10 a0 e8 5b bf 02 e1 8 RSP: 0018:ffffc90001d63de0 EFLAGS: 00010246 RAX: ffffffffa05c0440 RBX: ffff8888119e45a0 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff8888177e9550 RDI: ffff8888119e43b0 RBP: ffff8887d4768000 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: ffffc90001d63c90 R12: ffff8888119e43b0 R13: ffff8888119e5108 R14: dead000000000100 R15: ffff8888119e5108 FS: 00007f1ef27b0740(0000) GS:ffff888817600000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffffa05c0470 CR3: 00000007f6bee000 CR4: 00000000003406e0 Call Trace: device_release+0x27/0x80 kobject_put+0x98/0x170 nvmet_passthru_ctrl_disable+0x4a/0x70 [nvmet] nvmet_passthru_enable_store+0x4c/0x90 [nvmet] configfs_write_file+0xe6/0x150 vfs_write+0xba/0x1e0 ksys_write+0x5f/0xe0 do_syscall_64+0x52/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f1ef1eb2840 Code: Bad RIP value. RSP: 002b:00007fffdbff0eb8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f1ef1eb2840 RDX: 0000000000000002 RSI: 00007f1ef27d2000 RDI: 0000000000000001 RBP: 00007f1ef27d2000 R08: 000000000000000a R09: 00007f1ef27b0740 R10: 0000000000000001 R11: 0000000000000246 R12: 00007f1ef2186400 R13: 0000000000000002 R14: 0000000000000001 R15: 0000000000000000
With this patch fix we take the module ref count in nvme_dev_open() and release that ref count in newly introduced nvme_dev_release().
Signed-off-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index b12a658ad2d0..8b478e402f62 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2584,10 +2584,24 @@ static int nvme_dev_open(struct inode *inode, struct file *file) return -EWOULDBLOCK; }
+ nvme_get_ctrl(ctrl); + if (!try_module_get(ctrl->ops->module)) + return -EINVAL; + file->private_data = ctrl; return 0; }
+static int nvme_dev_release(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl = + container_of(inode->i_cdev, struct nvme_ctrl, cdev); + + module_put(ctrl->ops->module); + nvme_put_ctrl(ctrl); + return 0; +} + static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) { struct nvme_ns *ns; @@ -2648,6 +2662,7 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd, static const struct file_operations nvme_dev_fops = { .owner = THIS_MODULE, .open = nvme_dev_open, + .release = nvme_dev_release, .unlocked_ioctl = nvme_dev_ioctl, .compat_ioctl = nvme_dev_ioctl, };
From: Jeffrey Mitchell jeffrey.mitchell@starlab.io
stable inclusion from linux-4.19.150 commit 345c6f260c89e417de6e7d81f3366bd5079f48a3
--------------------------------
[ Upstream commit d33030e2ee3508d65db5644551435310df86010e ]
nfs_readdir_page_filler() iterates over entries in a directory, reusing the same security label buffer, but does not reset the buffer's length. This causes decode_attr_security_label() to return -ERANGE if an entry's security label is longer than the previous one's. This error, in nfs4_decode_dirent(), only gets passed up as -EAGAIN, which causes another failed attempt to copy into the buffer. The second error is ignored and the remaining entries do not show up in ls, specifically the getdents64() syscall.
Reproduce by creating multiple files in NFS and giving one of the later files a longer security label. ls will not see that file nor any that are added afterwards, though they will exist on the backend.
In nfs_readdir_page_filler(), reset the security label buffer length before every reuse.
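The pattern in miniature (a toy struct, not the NFS one): a length field used both as input capacity and output size must be re-armed before each decode.

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>

  struct buf { char data[64]; unsigned int len; };

  static int decode(struct buf *b, const char *src)
  {
      size_t need = strlen(src);

      if (need > b->len)
          return -ERANGE;    /* capacity too small */
      memcpy(b->data, src, need);
      b->len = need;         /* len now holds the *output* size */
      return 0;
  }

  int main(void)
  {
      struct buf b = { .len = sizeof(b.data) };

      decode(&b, "short");    /* ok, b.len becomes 5 */
      /* without resetting b.len to the capacity, a longer label now
       * spuriously fails with -ERANGE: */
      printf("%d\n", decode(&b, "a much longer security label"));
      b.len = sizeof(b.data);    /* the fix: reset before reuse */
      printf("%d\n", decode(&b, "a much longer security label"));
      return 0;
  }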
Signed-off-by: Jeffrey Mitchell jeffrey.mitchell@starlab.io Fixes: b4487b935452 ("nfs: Fix getxattr kernel panic and memory overflow") Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/dir.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 4ae726e70d87..733fd9e4f0a1 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -553,6 +553,9 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
do { + if (entry->label) + entry->label->len = NFS4_MAXLABELLEN; + status = xdr_decode(desc, entry, &stream); if (status != 0) { if (status == -EAGAIN)
From: Thibaut Sautereau thibaut.sautereau@ssi.gouv.fr
stable inclusion from linux-4.19.150 commit a4ebc2d6aa3ac2aa92cac8f6f53662df2c4904c9
--------------------------------
[ Upstream commit 09a6b0bc3be793ca8cba580b7992d73e9f68f15d ]
Commit f227e3ec3b5c ("random32: update the net random state on interrupt and activity") broke compilation and was temporarily fixed by Linus in 83bdc7275e62 ("random32: remove net_rand_state from the latent entropy gcc plugin") by entirely moving net_rand_state out of the things handled by the latent_entropy GCC plugin.
From what I understand when reading the plugin code, using the __latent_entropy attribute on a declaration was the wrong part and simply keeping the __latent_entropy attribute on the variable definition was the correct fix.
Fixes: 83bdc7275e62 ("random32: remove net_rand_state from the latent entropy gcc plugin") Acked-by: Willy Tarreau w@1wt.eu Cc: Emese Revfy re.emese@gmail.com Signed-off-by: Thibaut Sautereau thibaut.sautereau@ssi.gouv.fr Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- lib/random32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/random32.c b/lib/random32.c index 036de0c93e22..b6f3325e38e4 100644 --- a/lib/random32.c +++ b/lib/random32.c @@ -48,7 +48,7 @@ static inline void prandom_state_selftest(void) } #endif
-DEFINE_PER_CPU(struct rnd_state, net_rand_state); +DEFINE_PER_CPU(struct rnd_state, net_rand_state) __latent_entropy;
/** * prandom_u32_state - seeded pseudo-random number generator.
From: Laurent Dufour ldufour@linux.ibm.com
stable inclusion from linux-4.19.150 commit 25eaea1b33f2569f69a82dfddb3fb05384143bd0
--------------------------------
commit c1d0da83358a2316d9be7f229f26126dbaa07468 upstream.
Patch series "mm: fix memory to node bad links in sysfs", v3.
Sometimes, firmware may expose interleaved memory layout like this:
Early memory node ranges
  node   1: [mem 0x0000000000000000-0x000000011fffffff]
  node   2: [mem 0x0000000120000000-0x000000014fffffff]
  node   1: [mem 0x0000000150000000-0x00000001ffffffff]
  node   0: [mem 0x0000000200000000-0x000000048fffffff]
  node   2: [mem 0x0000000490000000-0x00000007ffffffff]
In that case, we can see memory blocks assigned to multiple nodes in sysfs:
$ ls -l /sys/devices/system/memory/memory21
total 0
lrwxrwxrwx 1 root root     0 Aug 24 05:27 node1 -> ../../node/node1
lrwxrwxrwx 1 root root     0 Aug 24 05:27 node2 -> ../../node/node2
-rw-r--r-- 1 root root 65536 Aug 24 05:27 online
-r--r--r-- 1 root root 65536 Aug 24 05:27 phys_device
-r--r--r-- 1 root root 65536 Aug 24 05:27 phys_index
drwxr-xr-x 2 root root     0 Aug 24 05:27 power
-r--r--r-- 1 root root 65536 Aug 24 05:27 removable
-rw-r--r-- 1 root root 65536 Aug 24 05:27 state
lrwxrwxrwx 1 root root     0 Aug 24 05:25 subsystem -> ../../../../bus/memory
-rw-r--r-- 1 root root 65536 Aug 24 05:25 uevent
-r--r--r-- 1 root root 65536 Aug 24 05:27 valid_zones
The same applies in the node's directory with a memory21 link in both the node1 and node2's directory.
This is wrong, but it doesn't prevent the system from running. However, when one of these memory blocks is later hot-unplugged and then hot-plugged again, the system detects an inconsistency in the sysfs layout and a BUG_ON() is raised:
kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
Call Trace:
  add_memory_resource+0x23c/0x340 (unreliable)
  __add_memory+0x5c/0xf0
  dlpar_add_lmb+0x1b4/0x500
  dlpar_memory+0x1f8/0xb80
  handle_dlpar_errorlog+0xc0/0x190
  dlpar_store+0x198/0x4a0
  kobj_attr_store+0x30/0x50
  sysfs_kf_write+0x64/0x90
  kernfs_fop_write+0x1b0/0x290
  vfs_write+0xe8/0x290
  ksys_write+0xdc/0x130
  system_call_exception+0x160/0x270
  system_call_common+0xf0/0x27c
This has been seen on PowerPC LPAR.
The root cause of this issue is that when node's memory is registered, the range used can overlap another node's range, thus the memory block is registered to multiple nodes in sysfs.
There are two issues here:
(a) The sysfs memory and node's layouts are broken due to these multiple links
(b) The link errors in link_mem_sections() should not lead to a system panic.
To address (a), register_mem_sect_under_node() should not rely on the system state to detect whether the link operation is triggered by a hot-plug operation or not. This is addressed by patches 1 and 2 of this series.
Issue (b) will be addressed separately.
This patch (of 2):
The memmap_context enum is used to detect whether a memory operation is due to a hot-add operation or happening at boot time.
Make it general to the hotplug operation and rename it as meminit_context.
There is no functional change introduced by this patch.
Suggested-by: David Hildenbrand david@redhat.com Signed-off-by: Laurent Dufour ldufour@linux.ibm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: David Hildenbrand david@redhat.com Reviewed-by: Oscar Salvador osalvador@suse.de Acked-by: Michal Hocko mhocko@suse.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: "Rafael J . Wysocki" rafael@kernel.org Cc: Nathan Lynch nathanl@linux.ibm.com Cc: Scott Cheloha cheloha@linux.ibm.com Cc: Tony Luck tony.luck@intel.com Cc: Fenghua Yu fenghua.yu@intel.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200915094143.79181-1-ldufour@linux.ibm.com Link: https://lkml.kernel.org/r/20200915132624.9723-1-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/ia64/mm/init.c | 6 +++--- include/linux/mm.h | 2 +- include/linux/mmzone.h | 11 ++++++++--- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 11 ++++++----- 5 files changed, 19 insertions(+), 13 deletions(-)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 79e5cc70f1fd..561e2573bd34 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -499,7 +499,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); return 0; }
@@ -508,8 +508,8 @@ memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { if (!vmem_map) { - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, - NULL); + memmap_init_zone(size, nid, zone, start_pfn, + MEMINIT_EARLY, NULL); } else { struct page *start; struct memmap_init_callback_data args; diff --git a/include/linux/mm.h b/include/linux/mm.h index 369dc740963a..679860bb2238 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2223,7 +2223,7 @@ static inline void zero_resv_unavail(void) {}
extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, - enum memmap_context, struct vmem_altmap *); + enum meminit_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 51f74519b0e8..01349f51fe88 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -766,10 +766,15 @@ bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx); -enum memmap_context { - MEMMAP_EARLY, - MEMMAP_HOTPLUG, +/* + * Memory initialization context, use to differentiate memory added by + * the platform statically or via memory hotplug interface. + */ +enum meminit_context { + MEMINIT_EARLY, + MEMINIT_HOTPLUG, }; + extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ae6e46937d96..5dfd8dd328d3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -733,7 +733,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMMAP_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap);
set_zone_contiguous(zone); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e394ae5ba2a7..f394e0df9c87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5594,7 +5594,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context, + unsigned long start_pfn, enum meminit_context context, struct vmem_altmap *altmap) { unsigned long end_pfn = start_pfn + size; @@ -5621,7 +5621,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context != MEMMAP_EARLY) + if (context != MEMINIT_EARLY) goto not_early;
if (!early_pfn_valid(pfn)) { @@ -5659,7 +5659,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, not_early: page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) + if (context == MEMINIT_HOTPLUG) SetPageReserved(page);
/* @@ -5674,7 +5674,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { @@ -5695,7 +5695,8 @@ static void __meminit zone_init_free_lists(struct zone *zone)
#ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) + memmap_init_zone((size), (nid), (zone), (start_pfn), \ + MEMINIT_EARLY, NULL) #endif
static int zone_batchsize(struct zone *zone)
From: Laurent Dufour ldufour@linux.ibm.com
stable inclusion from linux-4.19.150 commit b6f69f72c15d7f973f5709c5351f378f235b3654
--------------------------------
commit f85086f95fa36194eb0db5cd5c12e56801b98523 upstream.
In register_mem_sect_under_node() the system_state value is checked to detect whether the call is made during boot time or during a hot-plug operation. Unfortunately, that check against SYSTEM_BOOTING is wrong because regular memory is registered at the SYSTEM_SCHEDULING state. In addition, memory hot-plug operations can be triggered at this system state by the ACPI [1]. So checking against the system state is not enough.
The consequence is that on system with interleaved node's ranges like this:
Early memory node ranges
  node   1: [mem 0x0000000000000000-0x000000011fffffff]
  node   2: [mem 0x0000000120000000-0x000000014fffffff]
  node   1: [mem 0x0000000150000000-0x00000001ffffffff]
  node   0: [mem 0x0000000200000000-0x000000048fffffff]
  node   2: [mem 0x0000000490000000-0x00000007ffffffff]
This can be seen on a PowerPC LPAR after multiple memory hot-plug and hot-unplug operations have been done. At the next reboot the node's memory ranges can be interleaved, and since the call to link_mem_sections() is made in topology_init() while the system is in the SYSTEM_SCHEDULING state, the node id is not checked and the sections get registered to multiple nodes:
$ ls -l /sys/devices/system/memory/memory21/node*
total 0
lrwxrwxrwx 1 root root 0 Aug 24 05:27 node1 -> ../../node/node1
lrwxrwxrwx 1 root root 0 Aug 24 05:27 node2 -> ../../node/node2
In that case, the system is able to boot, but if one of these memory blocks is later hot-unplugged and then hot-plugged again, the sysfs inconsistency is detected and triggers a BUG_ON():
kernel BUG at /Users/laurent/src/linux-ppc/mm/memory_hotplug.c:1084!
Oops: Exception in kernel mode, sig: 5 [#1]
LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries
Modules linked in: rpadlpar_io rpaphp pseries_rng rng_core vmx_crypto gf128mul binfmt_misc ip_tables x_tables xfs libcrc32c crc32c_vpmsum autofs4
CPU: 8 PID: 10256 Comm: drmgr Not tainted 5.9.0-rc1+ #25
Call Trace:
  add_memory_resource+0x23c/0x340 (unreliable)
  __add_memory+0x5c/0xf0
  dlpar_add_lmb+0x1b4/0x500
  dlpar_memory+0x1f8/0xb80
  handle_dlpar_errorlog+0xc0/0x190
  dlpar_store+0x198/0x4a0
  kobj_attr_store+0x30/0x50
  sysfs_kf_write+0x64/0x90
  kernfs_fop_write+0x1b0/0x290
  vfs_write+0xe8/0x290
  ksys_write+0xdc/0x130
  system_call_exception+0x160/0x270
  system_call_common+0xf0/0x27c
This patch addresses the root cause by not relying on the system_state value to detect whether the call is due to a hot-plug operation. An extra parameter is added to link_mem_sections() telling whether the registration is triggered by a hot-plug operation.
[1] According to Oscar Salvador, using this qemu command line, ACPI memory hotplug operations are raised at SYSTEM_SCHEDULING state:
$QEMU -enable-kvm -machine pc -smp 4,sockets=4,cores=1,threads=1 -cpu host -monitor pty \
 -m size=$MEM,slots=255,maxmem=4294967296k \
 -numa node,nodeid=0,cpus=0-3,mem=512 -numa node,nodeid=1,mem=512 \
 -object memory-backend-ram,id=memdimm0,size=134217728 -device pc-dimm,node=0,memdev=memdimm0,id=dimm0,slot=0 \
 -object memory-backend-ram,id=memdimm1,size=134217728 -device pc-dimm,node=0,memdev=memdimm1,id=dimm1,slot=1 \
 -object memory-backend-ram,id=memdimm2,size=134217728 -device pc-dimm,node=0,memdev=memdimm2,id=dimm2,slot=2 \
 -object memory-backend-ram,id=memdimm3,size=134217728 -device pc-dimm,node=0,memdev=memdimm3,id=dimm3,slot=3 \
 -object memory-backend-ram,id=memdimm4,size=134217728 -device pc-dimm,node=1,memdev=memdimm4,id=dimm4,slot=4 \
 -object memory-backend-ram,id=memdimm5,size=134217728 -device pc-dimm,node=1,memdev=memdimm5,id=dimm5,slot=5 \
 -object memory-backend-ram,id=memdimm6,size=134217728 -device pc-dimm,node=1,memdev=memdimm6,id=dimm6,slot=6 \
Fixes: 4fbce633910e ("mm/memory_hotplug.c: make register_mem_sect_under_node() a callback of walk_memory_range()") Signed-off-by: Laurent Dufour ldufour@linux.ibm.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: David Hildenbrand david@redhat.com Reviewed-by: Oscar Salvador osalvador@suse.de Acked-by: Michal Hocko mhocko@suse.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: "Rafael J. Wysocki" rafael@kernel.org Cc: Fenghua Yu fenghua.yu@intel.com Cc: Nathan Lynch nathanl@linux.ibm.com Cc: Scott Cheloha cheloha@linux.ibm.com Cc: Tony Luck tony.luck@intel.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20200915094143.79181-3-ldufour@linux.ibm.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/base/node.c | 84 ++++++++++++++++++++++++++++---------------- include/linux/node.h | 11 +++--- mm/memory_hotplug.c | 3 +- 3 files changed, 63 insertions(+), 35 deletions(-)
diff --git a/drivers/base/node.c b/drivers/base/node.c index 6fb55a58a1bb..0938337daa7f 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -403,10 +403,32 @@ static int __ref get_nid_for_pfn(unsigned long pfn) return pfn_to_nid(pfn); }
+static int do_register_memory_block_under_node(int nid, + struct memory_block *mem_blk) +{ + int ret; + + /* + * If this memory block spans multiple nodes, we only indicate + * the last processed node. + */ + mem_blk->nid = nid; + + ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, + &mem_blk->dev.kobj, + kobject_name(&mem_blk->dev.kobj)); + if (ret) + return ret; + + return sysfs_create_link_nowarn(&mem_blk->dev.kobj, + &node_devices[nid]->dev.kobj, + kobject_name(&node_devices[nid]->dev.kobj)); +} + /* register memory section under specified node if it spans that node */ -int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) +int register_mem_block_under_node_early(struct memory_block *mem_blk, void *arg) { - int ret, nid = *(int *)arg; + int nid = *(int *)arg; unsigned long pfn, sect_start_pfn, sect_end_pfn;
sect_start_pfn = section_nr_to_pfn(mem_blk->start_section_nr); @@ -426,38 +448,33 @@ int register_mem_sect_under_node(struct memory_block *mem_blk, void *arg) }
/* - * We need to check if page belongs to nid only for the boot - * case, during hotplug we know that all pages in the memory - * block belong to the same node. - */ - if (system_state == SYSTEM_BOOTING) { - page_nid = get_nid_for_pfn(pfn); - if (page_nid < 0) - continue; - if (page_nid != nid) - continue; - } - - /* - * If this memory block spans multiple nodes, we only indicate - * the last processed node. + * We need to check if page belongs to nid only at the boot + * case because node's ranges can be interleaved. */ - mem_blk->nid = nid; - - ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj, - &mem_blk->dev.kobj, - kobject_name(&mem_blk->dev.kobj)); - if (ret) - return ret; + page_nid = get_nid_for_pfn(pfn); + if (page_nid < 0) + continue; + if (page_nid != nid) + continue;
- return sysfs_create_link_nowarn(&mem_blk->dev.kobj, - &node_devices[nid]->dev.kobj, - kobject_name(&node_devices[nid]->dev.kobj)); + return do_register_memory_block_under_node(nid, mem_blk); } /* mem section does not span the specified node */ return 0; }
+/* + * During hotplug we know that all pages in the memory block belong to the same + * node. + */ +static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk, + void *arg) +{ + int nid = *(int *)arg; + + return do_register_memory_block_under_node(nid, mem_blk); +} + /* * Unregister a memory block device under the node it spans. Memory blocks * with multiple nodes cannot be offlined and therefore also never be removed. @@ -473,10 +490,17 @@ void unregister_memory_block_under_nodes(struct memory_block *mem_blk) kobject_name(&node_devices[mem_blk->nid]->dev.kobj)); }
-int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn) +int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn, + enum meminit_context context) { - return walk_memory_range(start_pfn, end_pfn, (void *)&nid, - register_mem_sect_under_node); + walk_memory_blocks_func_t func; + + if (context == MEMINIT_HOTPLUG) + func = register_mem_block_under_node_hotplug; + else + func = register_mem_block_under_node_early; + + return walk_memory_range(start_pfn, end_pfn, (void *)&nid, func); }
#ifdef CONFIG_HUGETLBFS diff --git a/include/linux/node.h b/include/linux/node.h index 708939bae9aa..a79ec4492650 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -32,11 +32,13 @@ extern struct node *node_devices[]; typedef void (*node_registration_func_t)(struct node *);
#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_NUMA) -extern int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn); +int link_mem_sections(int nid, unsigned long start_pfn, + unsigned long end_pfn, + enum meminit_context context); #else static inline int link_mem_sections(int nid, unsigned long start_pfn, - unsigned long end_pfn) + unsigned long end_pfn, + enum meminit_context context) { return 0; } @@ -61,7 +63,8 @@ static inline int register_one_node(int nid) if (error) return error; /* link memory sections under this node */ - error = link_mem_sections(nid, start_pfn, end_pfn); + error = link_mem_sections(nid, start_pfn, end_pfn, + MEMINIT_EARLY); }
return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5dfd8dd328d3..2de62385e336 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1097,7 +1097,8 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) }
/* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); BUG_ON(ret);
/* create new memmap entry */
From: Al Viro viro@zeniv.linux.org.uk
stable inclusion from linux-4.19.150 commit 3e3bbc4d23eeb90bf282e98c7dfeca7702df3169
--------------------------------
commit f8d4f44df056c5b504b0d49683fb7279218fd207 upstream.
Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/eventpoll.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 76f9079a6802..d7310a967a0a 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1450,6 +1450,22 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, RCU_INIT_POINTER(epi->ws, NULL); }
+ /* Add the current item to the list of active epoll hook for this file */ + spin_lock(&tfile->f_lock); + list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); + spin_unlock(&tfile->f_lock); + + /* + * Add the current item to the RB tree. All RB tree operations are + * protected by "mtx", and ep_insert() is called with "mtx" held. + */ + ep_rbtree_insert(ep, epi); + + /* now check if we've created too many backpaths */ + error = -EINVAL; + if (full_check && reverse_path_check()) + goto error_remove_epi; + /* Initialize the poll table using the queue callback */ epq.epi = epi; init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); @@ -1472,22 +1488,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, if (epi->nwait < 0) goto error_unregister;
- /* Add the current item to the list of active epoll hook for this file */ - spin_lock(&tfile->f_lock); - list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links); - spin_unlock(&tfile->f_lock); - - /* - * Add the current item to the RB tree. All RB tree operations are - * protected by "mtx", and ep_insert() is called with "mtx" held. - */ - ep_rbtree_insert(ep, epi); - - /* now check if we've created too many backpaths */ - error = -EINVAL; - if (full_check && reverse_path_check()) - goto error_remove_epi; - /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irq(&ep->wq.lock);
@@ -1516,6 +1516,8 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
return 0;
+error_unregister: + ep_unregister_pollwait(ep, epi); error_remove_epi: spin_lock(&tfile->f_lock); list_del_rcu(&epi->fllink); @@ -1523,9 +1525,6 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
rb_erase_cached(&epi->rbn, &ep->rbr);
-error_unregister: - ep_unregister_pollwait(ep, epi); - /* * We need to do this because an event could have been arrived on some * allocated wait queue. Note that we don't care about the ep->ovflist
From: Al Viro viro@zeniv.linux.org.uk
stable inclusion from linux-4.19.150 commit ff329915a5b1f6778344a6fc7b060c991376b095
--------------------------------
commit 18306c404abe18a0972587a6266830583c60c928 upstream.
Replacing the per-eventpoll ->visited flag and visited_list with a generation count removes the need to clear them after each loop check, along with the races that clearing entailed.
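A rough sketch of the scheme (names as in the diff below, surrounding locking elided): marking an eventpoll as visited becomes a generation-stamp assignment, and wiping all marks becomes a single increment.

        static u64 loop_check_gen = 0;

        /* during the walk, under ep->mtx: mark this ep as seen */
        ep->gen = loop_check_gen;

        /* instead of testing a per-ep "visited" flag */
        if (ep_tovisit->gen == loop_check_gen)
                continue;       /* already seen in this loop check */

        /* instead of walking visited_list to clear flags afterwards */
        loop_check_gen++;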
Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/eventpoll.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index d7310a967a0a..e4f2c1c54dfe 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -222,8 +222,7 @@ struct eventpoll { struct file *file;
/* used to optimize loop detection check */ - int visited; - struct list_head visited_list_link; + u64 gen;
#ifdef CONFIG_NET_RX_BUSY_POLL /* used to track busy poll napi_id */ @@ -273,6 +272,8 @@ static long max_user_watches __read_mostly; */ static DEFINE_MUTEX(epmutex);
+static u64 loop_check_gen = 0; + /* Used to check for epoll file descriptor inclusion loops */ static struct nested_calls poll_loop_ncalls;
@@ -282,9 +283,6 @@ static struct kmem_cache *epi_cache __read_mostly; /* Slab cache used to allocate "struct eppoll_entry" */ static struct kmem_cache *pwq_cache __read_mostly;
-/* Visited nodes during ep_loop_check(), so we can unset them when we finish */ -static LIST_HEAD(visited_list); - /* * List of files with newly added links, where we may need to limit the number * of emanating paths. Protected by the epmutex. @@ -1869,13 +1867,12 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) struct epitem *epi;
mutex_lock_nested(&ep->mtx, call_nests + 1); - ep->visited = 1; - list_add(&ep->visited_list_link, &visited_list); + ep->gen = loop_check_gen; for (rbp = rb_first_cached(&ep->rbr); rbp; rbp = rb_next(rbp)) { epi = rb_entry(rbp, struct epitem, rbn); if (unlikely(is_file_epoll(epi->ffd.file))) { ep_tovisit = epi->ffd.file->private_data; - if (ep_tovisit->visited) + if (ep_tovisit->gen == loop_check_gen) continue; error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, epi->ffd.file, @@ -1916,18 +1913,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) */ static int ep_loop_check(struct eventpoll *ep, struct file *file) { - int ret; - struct eventpoll *ep_cur, *ep_next; - - ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, + return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, ep_loop_check_proc, file, ep, current); - /* clear visited list */ - list_for_each_entry_safe(ep_cur, ep_next, &visited_list, - visited_list_link) { - ep_cur->visited = 0; - list_del(&ep_cur->visited_list_link); - } - return ret; }
static void clear_tfile_check_list(void) @@ -2149,6 +2136,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, error_tgt_fput: if (full_check) { clear_tfile_check_list(); + loop_check_gen++; mutex_unlock(&epmutex); }
From: Al Viro viro@zeniv.linux.org.uk
stable inclusion from linux-4.19.150 commit 90ef231ba534d43033884b8560df26e608ca0a21
--------------------------------
commit fe0a916c1eae8e17e86c3753d13919177d63ed7e upstream.
Checking for the lack of epitems referring to the epoll we want to insert into is not enough; we might have an insertion of that epoll into another one that has already collected the set of files to recheck for excessive reverse paths, but hasn't gotten to creating/inserting the epitem for it.
However, any such insertion in progress can be detected - it will update the generation count in our epoll when it's done looking through it for files to check. That gets done under ->mtx of our epoll and that allows us to detect that safely.
We are *not* holding epmutex here, so the generation count is not stable. However, since both the update of ep->gen by loop check and (later) insertion into ->f_ep_link are done with ep->mtx held, we are fine - the sequence is

        grab epmutex
        bump loop_check_gen
        ...
        grab tep->mtx           // 1
        tep->gen = loop_check_gen
        ...
        drop tep->mtx           // 2
        ...
        grab tep->mtx           // 3
        ...
        insert into ->f_ep_link
        ...
        drop tep->mtx           // 4
        bump loop_check_gen
        drop epmutex

and if the fastpath check in another thread happens for that eventpoll, it can come

  * before (1) - in that case fastpath is just fine
  * after (4) - we'll see non-empty ->f_ep_link, slow path taken
  * between (2) and (3) - loop_check_gen is stable, with ->mtx providing barriers and we end up taking slow path.
Note that the ->f_ep_link emptiness check is slightly racy - we are protected against insertions into that list, but removals can happen right under us. Not a problem - in the worst case we'll end up taking a slow path for no good reason.
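Putting it together, the fastpath decision in epoll_ctl() after this patch reads roughly as follows (condensed from the one-line diff below plus its surrounding context):

        if (!list_empty(&f.file->f_ep_links) || /* target already watched */
            ep->gen == loop_check_gen ||        /* scanned by an in-flight loop check */
            is_file_epoll(tf.file)) {           /* inserting another epoll */
                full_check = 1;                 /* take the slow path */
                ...
        }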
Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/eventpoll.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index e4f2c1c54dfe..59f9d28964ad 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -2076,6 +2076,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, mutex_lock_nested(&ep->mtx, 0); if (op == EPOLL_CTL_ADD) { if (!list_empty(&f.file->f_ep_links) || + ep->gen == loop_check_gen || is_file_epoll(tf.file)) { full_check = 1; mutex_unlock(&ep->mtx);
From: Al Viro viro@zeniv.linux.org.uk
stable inclusion from linux-4.19.150 commit ced8ce5d2157142c469eccc5eef5ea8ad579fa5e
--------------------------------
commit 3701cb59d892b88d569427586f01491552f377b1 upstream.
In ep_create_wakeup_source(), the dentry name can change under you - or get freed, for that matter, if it's a long (separately stored) name.
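The snapshot helpers adopted by the fix follow a simple take/use/release pattern (a minimal sketch; in this tree struct name_snapshot exposes the stabilized name as n.name):

        struct name_snapshot n;

        take_dentry_name_snapshot(&n, dentry);  /* private, stable copy */
        ws = wakeup_source_register(n.name);    /* immune to rename/free */
        release_dentry_name_snapshot(&n);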
Signed-off-by: Al Viro viro@zeniv.linux.org.uk Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/eventpoll.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 59f9d28964ad..cf332e8f6bdf 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1376,7 +1376,7 @@ static int reverse_path_check(void)
static int ep_create_wakeup_source(struct epitem *epi) { - const char *name; + struct name_snapshot n; struct wakeup_source *ws;
if (!epi->ep->ws) { @@ -1385,8 +1385,9 @@ static int ep_create_wakeup_source(struct epitem *epi) return -ENOMEM; }
- name = epi->ffd.file->f_path.dentry->d_name.name; - ws = wakeup_source_register(name); + take_dentry_name_snapshot(&n, epi->ffd.file->f_path.dentry); + ws = wakeup_source_register(n.name); + release_dentry_name_snapshot(&n);
if (!ws) return -ENOMEM;
From: Will McVicker willmcvicker@google.com
stable inclusion from linux-4.19.150 commit 289fe546ea16c2dcb57c5198c5a7b7387604530e
--------------------------------
commit 1cc5ef91d2ff94d2bf2de3b3585423e8a1051cb6 upstream.
The indexes to the nf_nat_l[34]protos arrays come from userspace. So check the tuple's family, i.e. l3num, when creating the conntrack, in order to prevent an OOB memory access during setup. Here is an example kernel panic on 4.14.180 when userspace passes in an index greater than NFPROTO_NUMPROTO.
Internal error: Oops - BUG: 0 [#1] PREEMPT SMP
Modules linked in:...
Process poc (pid: 5614, stack limit = 0x00000000a3933121)
CPU: 4 PID: 5614 Comm: poc Tainted: G S W O 4.14.180-g051355490483
Hardware name: Qualcomm Technologies, Inc. SM8150 V2 PM8150 Google Inc. MSM
task: 000000002a3dfffe task.stack: 00000000a3933121
pc : __cfi_check_fail+0x1c/0x24
lr : __cfi_check_fail+0x1c/0x24
...
Call trace:
  __cfi_check_fail+0x1c/0x24
  name_to_dev_t+0x0/0x468
  nfnetlink_parse_nat_setup+0x234/0x258
  ctnetlink_parse_nat_setup+0x4c/0x228
  ctnetlink_new_conntrack+0x590/0xc40
  nfnetlink_rcv_msg+0x31c/0x4d4
  netlink_rcv_skb+0x100/0x184
  nfnetlink_rcv+0xf4/0x180
  netlink_unicast+0x360/0x770
  netlink_sendmsg+0x5a0/0x6a4
  ___sys_sendmsg+0x314/0x46c
  SyS_sendmsg+0xb4/0x108
  el0_svc_naked+0x34/0x38
This crash no longer happens since 5.4; however, ctnetlink still allows creating entries with an unsupported layer 3 protocol number.
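For context, the pattern being guarded looks roughly like this (abridged; nf_nat_l3protos is a fixed-size array in the NAT core indexed by family):

        /* net/netfilter/nf_nat_core.c (abridged) */
        static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO];

        /* an unvalidated l3num from ctnetlink eventually becomes the index,
         * so any value >= NFPROTO_NUMPROTO reads out of bounds */
        l3proto = rcu_dereference(nf_nat_l3protos[tuple->src.l3num]);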
Fixes: c1d10adb4a521 ("[NETFILTER]: Add ctnetlink port for nf_conntrack") Signed-off-by: Will McVicker willmcvicker@google.com [pablo@netfilter.org: rebased original patch on top of nf.git] Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nf_conntrack_netlink.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 31fa94064a62..0b89609a6e9d 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1129,6 +1129,8 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], if (!tb[CTA_TUPLE_IP]) return -EINVAL;
+ if (l3num != NFPROTO_IPV4 && l3num != NFPROTO_IPV6) + return -EOPNOTSUPP; tuple->src.l3num = l3num;
err = ctnetlink_parse_tuple_ip(tb[CTA_TUPLE_IP], tuple);
From: Linus Torvalds torvalds@linux-foundation.org
stable inclusion from linux-4.19.151 commit 33acb78c859f1a0bd3c6b67801fada16f99614f6
--------------------------------
commit 4013c1496c49615d90d36b9d513eee8e369778e9 upstream.
Kernel threads intentionally do CLONE_FS in order to follow any changes that 'init' does to set up the root directory (or cwd).
It is admittedly a bit odd, but it avoids the situation where 'init' does some extensive setup to initialize the system environment, and then we execute a usermode helper program, and it uses the original FS setup from boot time that may be very limited and incomplete.
[ Both Al Viro and Eric Biederman point out that 'pivot_root()' will follow the root regardless, since it fixes up other users of root (see chroot_fs_refs() for details), but overmounting root and doing a chroot() would not. ]
However, Vegard Nossum noticed that CLONE_FS not only means that we follow the root and current working directories; it also means we share the umask with whatever init changed it to. That wasn't intentional.
Just reset umask to the original default (0022) before actually starting the usermode helper program.
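The reason CLONE_FS drags the umask along is simply that the umask lives in the shared fs_struct (abridged from include/linux/fs_struct.h):

        struct fs_struct {
                int users;
                spinlock_t lock;
                seqcount_t seq;
                int umask;      /* shared by every CLONE_FS task */
                int in_exec;
                struct path root, pwd;
        };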
Reported-by: Vegard Nossum vegard.nossum@oracle.com Cc: Al Viro viro@zeniv.linux.org.uk Acked-by: Eric W. Biederman ebiederm@xmission.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: kernel/umh.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Xie XiuQi xiexiuqi@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/umh.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/kernel/umh.c b/kernel/umh.c index 6ff36a7f038b..53611efb10cb 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -13,6 +13,7 @@ #include <linux/cred.h> #include <linux/file.h> #include <linux/fdtable.h> +#include <linux/fs_struct.h> #include <linux/workqueue.h> #include <linux/security.h> #include <linux/mount.h> @@ -76,6 +77,14 @@ static int call_usermodehelper_exec_async(void *data) flush_signal_handlers(current, 1); spin_unlock_irq(¤t->sighand->siglock);
+ /* + * Initial kernel threads share their FS with init, in order to + * get the init root directory. But we've now created a new + * thread that is going to execve a user process and has its own + * 'struct fs_struct'. Reset umask to the default. + */ + current->fs->umask = 0022; + /* * Kthreadd can be restricted to a set of processors if the user wants * to protect other processors from OS latencies. If that has happened
From: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com
stable inclusion from linux-4.19.151 commit b6df5afc3d81e34d32f0b092d59b7fe8915d824b
--------------------------------
commit 4bab69093044ca81f394bd0780be1b71c5a4d308 upstream.
When try_module_get() fails in nvme_dev_open(), it returns without releasing the ctrl reference that was taken earlier.
In the error return path, put the ctrl reference that was taken before the try_module_get() call.
Fixes: 52a3974feb1a ("nvme-core: get/put ctrl and transport module in nvme_dev_open/release()") Signed-off-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Reviewed-by: Logan Gunthorpe logang@deltatee.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/host/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 8b478e402f62..e97989e91866 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2585,8 +2585,10 @@ static int nvme_dev_open(struct inode *inode, struct file *file) }
nvme_get_ctrl(ctrl); - if (!try_module_get(ctrl->ops->module)) + if (!try_module_get(ctrl->ops->module)) { + nvme_put_ctrl(ctrl); return -EINVAL; + }
file->private_data = ctrl; return 0;
From: Hugh Dickins hughd@google.com
stable inclusion from linux-4.19.151 commit fbe96d5aab1ef3c992b1dd7a0a4a5aeb21093571
--------------------------------
commit 033b5d77551167f8c24ca862ce83d3e0745f9245 upstream.
There have been elusive reports of filemap_fault() hitting its VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page) on kernels built with CONFIG_READ_ONLY_THP_FOR_FS=y.
Suren has hit it on a kernel with CONFIG_READ_ONLY_THP_FOR_FS=y and CONFIG_NUMA not set, and he has analyzed it down to how khugepaged without NUMA reuses the same huge page after collapse_file() failed (whereas NUMA targets its allocation to the respective node each time). And most of us were usually testing with CONFIG_NUMA=y kernels.
collapse_file(old start)
  new_page = khugepaged_alloc_page(hpage)
  __SetPageLocked(new_page)
  new_page->index = start // hpage->index=old offset
  new_page->mapping = mapping
  xas_store(&xas, new_page)

                          filemap_fault
                            page = find_get_page(mapping, offset)
                            // if offset falls inside hpage then
                            // compound_head(page) == hpage
                            lock_page_maybe_drop_mmap()
                              __lock_page(page)

  // collapse fails
  xas_store(&xas, old page)
  new_page->mapping = NULL
  unlock_page(new_page)

collapse_file(new start)
  new_page = khugepaged_alloc_page(hpage)
  __SetPageLocked(new_page)
  new_page->index = start // hpage->index=new offset
  new_page->mapping = mapping // mapping becomes valid again

                            // since compound_head(page) == hpage
                            // page_to_pgoff(page) got changed
                            VM_BUG_ON_PAGE(page_to_pgoff(page) != offset)
An initial patch replaced __SetPageLocked() by lock_page(), which did fix the race which Suren illustrates above. But testing showed that it's not good enough: if the racing task's __lock_page() gets delayed long after its find_get_page(), then it may follow collapse_file(new start)'s successful final unlock_page(), and crash on the same VM_BUG_ON_PAGE.
It could be fixed by relaxing filemap_fault()'s VM_BUG_ON_PAGE to a check and retry (as is done for mapping), with similar relaxations in find_lock_entry() and pagecache_get_page(): but it's not obvious what else might get caught out; and khugepaged non-NUMA appears to be unique in exposing a page to page cache, then revoking, without going through a full cycle of freeing before reuse.
Instead, make the non-NUMA khugepaged_prealloc_page() release the old page if anyone else has a reference to it (1% of cases when I tested).
Although never reported on huge tmpfs, I believe its find_lock_entry() has been at similar risk; but huge tmpfs does not rely on khugepaged for its normal working nearly so much as READ_ONLY_THP_FOR_FS does.
Reported-by: Denis Lisov dennis.lissov@gmail.com Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=206569 Link: https://lore.kernel.org/linux-mm/?q=20200219144635.3b7417145de19b65f258c943%... Reported-by: Qian Cai cai@lca.pw Link: https://lore.kernel.org/linux-xfs/?q=20200616013309.GB815%40lca.pw Reported-and-analyzed-by: Suren Baghdasaryan surenb@google.com Fixes: 87c460a0bded ("mm/khugepaged: collapse_shmem() without freezing new_page") Signed-off-by: Hugh Dickins hughd@google.com Cc: stable@vger.kernel.org # v4.9+ Reviewed-by: Matthew Wilcox (Oracle) willy@infradead.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/khugepaged.c | 12 ++++++++++++ 1 file changed, 12 insertions(+)
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f37be43f8cae..30553d7df402 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -820,6 +820,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { + /* + * If the hpage allocated earlier was briefly exposed in page cache + * before collapse_file() failed, it is possible that racing lookups + * have not yet completed, and would then be unpleasantly surprised by + * finding the hpage reused for the same mapping at a different offset. + * Just release the previous allocation if there is any danger of that. + */ + if (*hpage && page_count(*hpage) > 1) { + put_page(*hpage); + *hpage = NULL; + } + if (!*hpage) *hpage = khugepaged_alloc_hugepage(wait);
From: Vladimir Zapolskiy vladimir@tuxera.com
stable inclusion from linux-4.19.151 commit eb13209e0cbea7ed59a16da14c6987659219470f
--------------------------------
commit 64b7f674c292207624b3d788eda2dde3dc1415df upstream.
On the setxattr() syscall path, due to an apparent typo, the size of a dynamically allocated memory chunk for storing a struct smb2_file_full_ea_info object is computed incorrectly; to be more precise, the first addend is the size of a pointer instead of the intended object size. Coincidentally, it makes no difference on 64-bit platforms; however, on 32-bit targets the following memcpy() writes 4 bytes of data outside of the dynamically allocated memory.
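The pitfall in isolation (a generic sketch, not the exact cifs code):

        struct smb2_file_full_ea_info *ea;

        len = sizeof(ea) + ea_name_len + ea_value_len + 1;  /* wrong: pointer size, 4 or 8 bytes */
        len = sizeof(*ea) + ea_name_len + ea_value_len + 1; /* intended: object size */

The struct happens to be 8 bytes, the same as a 64-bit pointer, which is why only 32-bit targets see the 4-byte overflow.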
=============================================================================
BUG kmalloc-16 (Not tainted): Redzone overwritten
-----------------------------------------------------------------------------

Disabling lock debugging due to kernel taint
INFO: 0x79e69a6f-0x9e5cdecf @offset=368. First byte 0x73 instead of 0xcc
INFO: Slab 0xd36d2454 objects=85 used=51 fp=0xf7d0fc7a flags=0x35000201
INFO: Object 0x6f171df3 @offset=352 fp=0x00000000

Redzone 5d4ff02d: cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc cc  ................
Object  6f171df3: 00 00 00 00 00 05 06 00 73 6e 72 75 62 00 66 69  ........snrub.fi
Redzone 79e69a6f: 73 68 32 0a                                      sh2.
Padding 56254d82: 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
CPU: 0 PID: 8196 Comm: attr Tainted: G B 5.9.0-rc8+ #3
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1 04/01/2014
Call Trace:
  dump_stack+0x54/0x6e
  print_trailer+0x12c/0x134
  check_bytes_and_report.cold+0x3e/0x69
  check_object+0x18c/0x250
  free_debug_processing+0xfe/0x230
  __slab_free+0x1c0/0x300
  kfree+0x1d3/0x220
  smb2_set_ea+0x27d/0x540
  cifs_xattr_set+0x57f/0x620
  __vfs_setxattr+0x4e/0x60
  __vfs_setxattr_noperm+0x4e/0x100
  __vfs_setxattr_locked+0xae/0xd0
  vfs_setxattr+0x4e/0xe0
  setxattr+0x12c/0x1a0
  path_setxattr+0xa4/0xc0
  __ia32_sys_lsetxattr+0x1d/0x20
  __do_fast_syscall_32+0x40/0x70
  do_fast_syscall_32+0x29/0x60
  do_SYSENTER_32+0x15/0x20
  entry_SYSENTER_32+0x9f/0xf2
Fixes: 5517554e4313 ("cifs: Add support for writing attributes on SMB2+") Signed-off-by: Vladimir Zapolskiy vladimir@tuxera.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 6fc16329ceb4..d85ed2ace9ac 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -950,7 +950,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon, return rc; }
- len = sizeof(ea) + ea_name_len + ea_value_len + 1; + len = sizeof(*ea) + ea_name_len + ea_value_len + 1; ea = kzalloc(len, GFP_KERNEL); if (ea == NULL) { SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.151 commit a813aaee68809b5fc3935ec5ccf7cdba75a9c792
--------------------------------
commit d42ee76ecb6c49d499fc5eb32ca34468d95dbc3e upstream.
After freeing ep->auth_hmacs, we have to clear the pointer, or we risk a use-after-free, as reported by syzbot:
BUG: KASAN: use-after-free in sctp_auth_destroy_hmacs net/sctp/auth.c:509 [inline]
BUG: KASAN: use-after-free in sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline]
BUG: KASAN: use-after-free in sctp_auth_free+0x17e/0x1d0 net/sctp/auth.c:1070
Read of size 8 at addr ffff8880a8ff52c0 by task syz-executor941/6874

CPU: 0 PID: 6874 Comm: syz-executor941 Not tainted 5.9.0-rc8-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
  __dump_stack lib/dump_stack.c:77 [inline]
  dump_stack+0x198/0x1fd lib/dump_stack.c:118
  print_address_description.constprop.0.cold+0xae/0x497 mm/kasan/report.c:383
  __kasan_report mm/kasan/report.c:513 [inline]
  kasan_report.cold+0x1f/0x37 mm/kasan/report.c:530
  sctp_auth_destroy_hmacs net/sctp/auth.c:509 [inline]
  sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline]
  sctp_auth_free+0x17e/0x1d0 net/sctp/auth.c:1070
  sctp_endpoint_destroy+0x95/0x240 net/sctp/endpointola.c:203
  sctp_endpoint_put net/sctp/endpointola.c:236 [inline]
  sctp_endpoint_free+0xd6/0x110 net/sctp/endpointola.c:183
  sctp_destroy_sock+0x9c/0x3c0 net/sctp/socket.c:4981
  sctp_v6_destroy_sock+0x11/0x20 net/sctp/socket.c:9415
  sk_common_release+0x64/0x390 net/core/sock.c:3254
  sctp_close+0x4ce/0x8b0 net/sctp/socket.c:1533
  inet_release+0x12e/0x280 net/ipv4/af_inet.c:431
  inet6_release+0x4c/0x70 net/ipv6/af_inet6.c:475
  __sock_release+0xcd/0x280 net/socket.c:596
  sock_close+0x18/0x20 net/socket.c:1277
  __fput+0x285/0x920 fs/file_table.c:281
  task_work_run+0xdd/0x190 kernel/task_work.c:141
  exit_task_work include/linux/task_work.h:25 [inline]
  do_exit+0xb7d/0x29f0 kernel/exit.c:806
  do_group_exit+0x125/0x310 kernel/exit.c:903
  __do_sys_exit_group kernel/exit.c:914 [inline]
  __se_sys_exit_group kernel/exit.c:912 [inline]
  __x64_sys_exit_group+0x3a/0x50 kernel/exit.c:912
  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x43f278
Code: Bad RIP value.
RSP: 002b:00007fffe0995c38 EFLAGS: 00000246 ORIG_RAX: 00000000000000e7
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 000000000043f278
RDX: 0000000000000000 RSI: 000000000000003c RDI: 0000000000000000
RBP: 00000000004bf068 R08: 00000000000000e7 R09: ffffffffffffffd0
R10: 0000000020000000 R11: 0000000000000246 R12: 0000000000000001
R13: 00000000006d1180 R14: 0000000000000000 R15: 0000000000000000

Allocated by task 6874:
  kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
  kasan_set_track mm/kasan/common.c:56 [inline]
  __kasan_kmalloc.constprop.0+0xbf/0xd0 mm/kasan/common.c:461
  kmem_cache_alloc_trace+0x174/0x300 mm/slab.c:3554
  kmalloc include/linux/slab.h:554 [inline]
  kmalloc_array include/linux/slab.h:593 [inline]
  kcalloc include/linux/slab.h:605 [inline]
  sctp_auth_init_hmacs+0xdb/0x3b0 net/sctp/auth.c:464
  sctp_auth_init+0x8a/0x4a0 net/sctp/auth.c:1049
  sctp_setsockopt_auth_supported net/sctp/socket.c:4354 [inline]
  sctp_setsockopt+0x477e/0x97f0 net/sctp/socket.c:4631
  __sys_setsockopt+0x2db/0x610 net/socket.c:2132
  __do_sys_setsockopt net/socket.c:2143 [inline]
  __se_sys_setsockopt net/socket.c:2140 [inline]
  __x64_sys_setsockopt+0xba/0x150 net/socket.c:2140
  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
  entry_SYSCALL_64_after_hwframe+0x44/0xa9

Freed by task 6874:
  kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48
  kasan_set_track+0x1c/0x30 mm/kasan/common.c:56
  kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355
  __kasan_slab_free+0xd8/0x120 mm/kasan/common.c:422
  __cache_free mm/slab.c:3422 [inline]
  kfree+0x10e/0x2b0 mm/slab.c:3760
  sctp_auth_destroy_hmacs net/sctp/auth.c:511 [inline]
  sctp_auth_destroy_hmacs net/sctp/auth.c:501 [inline]
  sctp_auth_init_hmacs net/sctp/auth.c:496 [inline]
  sctp_auth_init_hmacs+0x2b7/0x3b0 net/sctp/auth.c:454
  sctp_auth_init+0x8a/0x4a0 net/sctp/auth.c:1049
  sctp_setsockopt_auth_supported net/sctp/socket.c:4354 [inline]
  sctp_setsockopt+0x477e/0x97f0 net/sctp/socket.c:4631
  __sys_setsockopt+0x2db/0x610 net/socket.c:2132
  __do_sys_setsockopt net/socket.c:2143 [inline]
  __se_sys_setsockopt net/socket.c:2140 [inline]
  __x64_sys_setsockopt+0xba/0x150 net/socket.c:2140
  do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
  entry_SYSCALL_64_after_hwframe+0x44/0xa9
Fixes: 1f485649f529 ("[SCTP]: Implement SCTP-AUTH internals") Signed-off-by: Eric Dumazet edumazet@google.com Cc: Vlad Yasevich vyasevich@gmail.com Cc: Neil Horman nhorman@tuxdriver.com Cc: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Acked-by: Marcelo Ricardo Leitner marcelo.leitner@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sctp/auth.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/sctp/auth.c b/net/sctp/auth.c index 5b537613946f..2bd8c80bd85f 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -515,6 +515,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) out_err: /* Clean up any successful allocations */ sctp_auth_destroy_hmacs(ep->auth_hmacs); + ep->auth_hmacs = NULL; return -ENOMEM; }
From: Antony Antony antony.antony@secunet.com
stable inclusion from linux-4.19.151 commit 50e117921b322323b7272f108d9c080ad883ee0a
--------------------------------
[ Upstream commit 545e5c571662b1cd79d9588f9d3b6e36985b8007 ]
XFRMA_SET_MARK and XFRMA_SET_MARK_MASK were not cloned from the old state to the new one. Migrate these two attributes during XFRMA_MSG_MIGRATE.
Fixes: 9b42c1f179a6 ("xfrm: Extend the output_mark to support input direction and masking.") Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/xfrm/xfrm_state.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 47a8ff972a2b..d76b019673aa 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1410,6 +1410,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, }
memcpy(&x->mark, &orig->mark, sizeof(x->mark)); + memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark));
if (xfrm_init_state(x) < 0) goto error;
From: Antony Antony antony.antony@secunet.com
stable inclusion from linux-4.19.151 commit fb48241729bd18dfa252f0e60ea88f331f10e968
--------------------------------
[ Upstream commit 91a46c6d1b4fcbfa4773df9421b8ad3e58088101 ]
XFRMA_REPLAY_ESN_VAL was not cloned completely from the old state to the new one. Migrate this attribute during XFRMA_MSG_MIGRATE.
v1->v2:
 - move curlft cloning to a separate patch
Fixes: af2f464e326e ("xfrm: Assign esn pointers when cloning a state") Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/xfrm.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-)
diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 2ea1dc1e06b9..b26fb2eb5346 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1881,21 +1881,17 @@ static inline unsigned int xfrm_replay_state_esn_len(struct xfrm_replay_state_es static inline int xfrm_replay_clone(struct xfrm_state *x, struct xfrm_state *orig) { - x->replay_esn = kzalloc(xfrm_replay_state_esn_len(orig->replay_esn), + + x->replay_esn = kmemdup(orig->replay_esn, + xfrm_replay_state_esn_len(orig->replay_esn), GFP_KERNEL); if (!x->replay_esn) return -ENOMEM; - - x->replay_esn->bmp_len = orig->replay_esn->bmp_len; - x->replay_esn->replay_window = orig->replay_esn->replay_window; - - x->preplay_esn = kmemdup(x->replay_esn, - xfrm_replay_state_esn_len(x->replay_esn), + x->preplay_esn = kmemdup(orig->preplay_esn, + xfrm_replay_state_esn_len(orig->preplay_esn), GFP_KERNEL); - if (!x->preplay_esn) { - kfree(x->replay_esn); + if (!x->preplay_esn) return -ENOMEM; - }
return 0; }
From: Antony Antony antony.antony@secunet.com
stable inclusion from linux-4.19.151 commit e4e0a05e1086b1a0045f1876786c340c212717c0
--------------------------------
[ Upstream commit 7aa05d304785204703a67a6aa7f1db402889a172 ]
XFRMA_SEC_CTX was not cloned from the old state to the new one. Migrate this attribute during XFRMA_MSG_MIGRATE.
v1->v2:
 - return -ENOMEM on error
v2->v3:
 - fix return type to int
Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/xfrm/xfrm_state.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index d76b019673aa..c2640875ec75 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1341,6 +1341,30 @@ int xfrm_state_add(struct xfrm_state *x) EXPORT_SYMBOL(xfrm_state_add);
#ifdef CONFIG_XFRM_MIGRATE +static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security) +{ + struct xfrm_user_sec_ctx *uctx; + int size = sizeof(*uctx) + security->ctx_len; + int err; + + uctx = kmalloc(size, GFP_KERNEL); + if (!uctx) + return -ENOMEM; + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = size; + uctx->ctx_doi = security->ctx_doi; + uctx->ctx_alg = security->ctx_alg; + uctx->ctx_len = security->ctx_len; + memcpy(uctx + 1, security->ctx_str, security->ctx_len); + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + if (err) + return err; + + return 0; +} + static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, struct xfrm_encap_tmpl *encap) { @@ -1397,6 +1421,10 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, goto error; }
+ if (orig->security) + if (clone_security(x, orig->security)) + goto error; + if (orig->coaddr) { x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), GFP_KERNEL);
From: Antony Antony antony.antony@secunet.com
stable inclusion from linux-4.19.151 commit ce868836d8557b9832b17ad6eeed18a8d7f866f6
--------------------------------
[ Upstream commit 8366685b2883e523f91e9816d7be371eb1144749 ]
When we clone a state, only add_time was cloned. It missed values like bytes and packets. Now clone all the members of the structure.
v1->v3:
 - use memcpy to copy the entire structure
Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Signed-off-by: Antony Antony antony.antony@secunet.com Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/xfrm/xfrm_state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c2640875ec75..c68eb587c0ef 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1450,7 +1450,7 @@ static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; - x->curlft.add_time = orig->curlft.add_time; + memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay;
From: Herbert Xu herbert@gondor.apana.org.au
stable inclusion from linux-4.19.151 commit 66a675d390060faaabaf58eefd7ebbb9b51f723b
--------------------------------
[ Upstream commit e94ee171349db84c7cfdc5fefbebe414054d0924 ]
The struct flowi must never be interpreted by itself as its size depends on the address family. Therefore it must always be grouped with its original family value.
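Concretely, flowi is a union whose active member is selected by the address family (abridged from include/net/flow.h):

        struct flowi {
                union {
                        struct flowi_common  __fl_common;
                        struct flowi4        ip4;   /* valid only for AF_INET */
                        struct flowi6        ip6;   /* valid only for AF_INET6 */
                        struct flowidn       dn;
                } u;
        };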
In this particular instance, the original family value is lost in the function xfrm_state_find. Therefore we get a bogus read when it's coupled with the wrong family, which would occur with inter-family xfrm states.
This patch fixes it by keeping the original family value.
Note that the same bug could potentially occur in LSM through the xfrm_state_pol_flow_match hook. I checked the current code there and it seems to be safe for now, as only secid is used, which is part of struct flowi_common. But that API should be changed so that we don't get new bugs in the future. We could do that by replacing fl with just secid, or by adding a family field.
Reported-by: syzbot+577fbac3145a6eb2e7a5@syzkaller.appspotmail.com Fixes: 48b8d78315bf ("[XFRM]: State selection update to use inner...") Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Steffen Klassert steffen.klassert@secunet.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/xfrm/xfrm_state.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index c68eb587c0ef..a649d7c2f48c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -923,7 +923,8 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && - !xfrm_selector_match(&x->sel, fl, x->sel.family)) || + (x->sel.family != family || + !xfrm_selector_match(&x->sel, fl, family))) || !security_xfrm_state_pol_flow_match(x, pol, fl)) return;
@@ -936,7 +937,9 @@ static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, x->sel.family) && + if ((!x->sel.family || + (x->sel.family == family && + xfrm_selector_match(&x->sel, fl, family))) && security_xfrm_state_pol_flow_match(x, pol, fl)) *error = -ESRCH; } @@ -976,7 +979,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } if (best || acquire_in_progress) @@ -993,7 +996,7 @@ xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) - xfrm_state_look_at(pol, x, fl, encap_family, + xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); }
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.151 commit 3f396a6be34ab71d1d6bd935ad1b787800781849
--------------------------------
[ Upstream commit f32f19339596b214c208c0dba716f4b6cc4f6958 ]
syzbot managed to crash a host by creating a bond with a GRE device.
For a non-Ethernet device, bonding calls bond_setup_by_slave() instead of ether_setup(), and unfortunately dev->needed_headroom was not copied from the newly added member.
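Roughly what goes wrong without the copy (an illustrative sketch of the transmit-side arithmetic, not actual driver code):

        /* senders size the headroom from the egress device; with
         * needed_headroom left at 0 on the bond, none is reserved */
        skb = alloc_skb(LL_RESERVED_SPACE(bond_dev) + len, GFP_ATOMIC);
        skb_reserve(skb, LL_RESERVED_SPACE(bond_dev));
        ...
        skb_push(skb, gre_hlen);  /* underruns skb->head -> skb_under_panic() */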
[ 171.243095] skbuff: skb_under_panic: text:ffffffffa184b9ea len:116 put:20 head:ffff883f84012dc0 data:ffff883f84012dbc tail:0x70 end:0xd00 dev:bond0
[ 171.243111] ------------[ cut here ]------------
[ 171.243112] kernel BUG at net/core/skbuff.c:112!
[ 171.243117] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 171.243469] gsmi: Log Shutdown Reason 0x03
[ 171.243505] Call Trace:
[ 171.243506]  <IRQ>
[ 171.243512]  [<ffffffffa171be59>] skb_push+0x49/0x50
[ 171.243516]  [<ffffffffa184b9ea>] ipgre_header+0x2a/0xf0
[ 171.243520]  [<ffffffffa17452d7>] neigh_connected_output+0xb7/0x100
[ 171.243524]  [<ffffffffa186f1d3>] ip6_finish_output2+0x383/0x490
[ 171.243528]  [<ffffffffa186ede2>] __ip6_finish_output+0xa2/0x110
[ 171.243531]  [<ffffffffa186acbc>] ip6_finish_output+0x2c/0xa0
[ 171.243534]  [<ffffffffa186abe9>] ip6_output+0x69/0x110
[ 171.243537]  [<ffffffffa186ac90>] ? ip6_output+0x110/0x110
[ 171.243541]  [<ffffffffa189d952>] mld_sendpack+0x1b2/0x2d0
[ 171.243544]  [<ffffffffa189d290>] ? mld_send_report+0xf0/0xf0
[ 171.243548]  [<ffffffffa189c797>] mld_ifc_timer_expire+0x2d7/0x3b0
[ 171.243551]  [<ffffffffa189c4c0>] ? mld_gq_timer_expire+0x50/0x50
[ 171.243556]  [<ffffffffa0fea270>] call_timer_fn+0x30/0x130
[ 171.243559]  [<ffffffffa0fea17c>] expire_timers+0x4c/0x110
[ 171.243563]  [<ffffffffa0fea0e3>] __run_timers+0x213/0x260
[ 171.243566]  [<ffffffffa0fecb7d>] ? ktime_get+0x3d/0xa0
[ 171.243570]  [<ffffffffa0ff9c4e>] ? clockevents_program_event+0x7e/0xe0
[ 171.243574]  [<ffffffffa0f7e5d5>] ? sched_clock_cpu+0x15/0x190
[ 171.243577]  [<ffffffffa0fe973d>] run_timer_softirq+0x1d/0x40
[ 171.243581]  [<ffffffffa1c00152>] __do_softirq+0x152/0x2f0
[ 171.243585]  [<ffffffffa0f44e1f>] irq_exit+0x9f/0xb0
[ 171.243588]  [<ffffffffa1a02e1d>] smp_apic_timer_interrupt+0xfd/0x1a0
[ 171.243591]  [<ffffffffa1a01ea6>] apic_timer_interrupt+0x86/0x90
Fixes: f5184d267c1a ("net: Allow netdevices to specify needed head/tailroom") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/bonding/bond_main.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index d32e32e79174..a59333b87eaf 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1123,6 +1123,7 @@ static void bond_setup_by_slave(struct net_device *bond_dev,
bond_dev->type = slave_dev->type; bond_dev->hard_header_len = slave_dev->hard_header_len; + bond_dev->needed_headroom = slave_dev->needed_headroom; bond_dev->addr_len = slave_dev->addr_len;
memcpy(bond_dev->broadcast, slave_dev->broadcast,
From: Kajol Jain kjain@linux.ibm.com
stable inclusion from linux-4.19.151 commit 80e745b6729ed41248442a687943cc7a48e5e66a
--------------------------------
[ Upstream commit 6d6b8b9f4fceab7266ca03d194f60ec72bd4b654 ]
The error handling introduced by commit:
2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()")
loses any return value from smp_call_function_single() that is not {0, -EINVAL}. This is a problem because it will return -ENXIO when the target CPU is offline. Worse, in that case it'll turn into an infinite loop.
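As an illustration only (not part of the patch), here is a minimal userspace sketch of the failure mode; fake_smp_call_function_single() is a hypothetical stub for the real helper returning -ENXIO for an offline CPU:

#include <errno.h>
#include <stdio.h>

/* stand-in for the kernel helper: the target CPU is offline */
static int fake_smp_call_function_single(void)
{
	return -ENXIO;
}

int main(void)
{
	int ret, tries;

	/* old logic: every failure is rewritten to -EAGAIN, so we would
	 * spin forever; the loop is capped here so the demo terminates */
	for (tries = 0; tries < 5; tries++) {
		ret = fake_smp_call_function_single();
		ret = !ret ? 0 : -EAGAIN;	/* -ENXIO is lost here */
		if (ret != -EAGAIN)
			break;
	}
	printf("old logic: still ret=%d after %d tries\n", ret, tries);

	/* fixed logic: only a successful call picks up data.ret; any other
	 * error code reaches the caller and stops the loop */
	for (tries = 0; tries < 5; tries++) {
		ret = fake_smp_call_function_single();
		if (!ret)
			ret = 0;	/* data.ret in the real code */
		if (ret != -EAGAIN)
			break;
	}
	printf("fixed logic: ret=%d reaches the caller immediately\n", ret);
	return 0;
}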
Fixes: 2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()") Reported-by: Srikar Dronamraju srikar@linux.vnet.ibm.com Signed-off-by: Kajol Jain kjain@linux.ibm.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Reviewed-by: Barret Rhoden brho@google.com Tested-by: Srikar Dronamraju srikar@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/20200827064732.20860-1-kjain@linux.ibm.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/events/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 4c209f28de58..4f17dd88cfde 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -98,7 +98,7 @@ static void remote_function(void *data) * retry due to any failures in smp_call_function_single(), such as if the * task_cpu() goes offline concurrently. * - * returns @func return value or -ESRCH when the process isn't running + * returns @func return value or -ESRCH or -ENXIO when the process isn't running */ static int task_function_call(struct task_struct *p, remote_function_f func, void *info) @@ -114,7 +114,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) for (;;) { ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); - ret = !ret ? data.ret : -EAGAIN; + if (!ret) + ret = data.ret;
if (ret != -EAGAIN) break;
From: Vijay Balakrishna vijayb@linux.microsoft.com
stable inclusion from linux-4.19.151 commit 94c51675811267a1ccaa7f6dc336714a02e20246
--------------------------------
commit 4aab2be0983031a05cb4a19696c9da5749523426 upstream.
When memory is hotplug added or removed, min_free_kbytes should be recalculated based on what is expected by khugepaged. Currently, after hotplug, min_free_kbytes is reset to the lower default, and the higher default that was set when THP was enabled is lost.
This change restores min_free_kbytes as expected for THP consumers.
[vijayb@linux.microsoft.com: v5] Link: https://lkml.kernel.org/r/1601398153-5517-1-git-send-email-vijayb@linux.micr...
Fixes: f000565adb77 ("thp: set recommended min free kbytes") Signed-off-by: Vijay Balakrishna vijayb@linux.microsoft.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Reviewed-by: Pavel Tatashin pasha.tatashin@soleen.com Acked-by: Michal Hocko mhocko@suse.com Cc: Allen Pais apais@microsoft.com Cc: Andrea Arcangeli aarcange@redhat.com Cc: "Kirill A. Shutemov" kirill.shutemov@linux.intel.com Cc: Oleg Nesterov oleg@redhat.com Cc: Song Liu songliubraving@fb.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1600305709-2319-2-git-send-email-vijayb@linux.micr... Link: https://lkml.kernel.org/r/1600204258-13683-1-git-send-email-vijayb@linux.mic... Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: mm/page_alloc.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/khugepaged.h | 5 +++++ mm/khugepaged.c | 13 +++++++++++-- mm/page_alloc.c | 3 +++ 3 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 082d1d2a5216..dc9a2eecc8b8 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -15,6 +15,7 @@ extern int __khugepaged_enter(struct mm_struct *mm); extern void __khugepaged_exit(struct mm_struct *mm); extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma, unsigned long vm_flags); +extern void khugepaged_min_free_kbytes_update(void);
#define khugepaged_enabled() \ (transparent_hugepage_flags & \ @@ -73,6 +74,10 @@ static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma, { return 0; } + +static inline void khugepaged_min_free_kbytes_update(void) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 30553d7df402..9c7dc2276156 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -53,6 +53,9 @@ enum scan_result { #define CREATE_TRACE_POINTS #include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly; +static DEFINE_MUTEX(khugepaged_mutex); + /* default scan 8*512 pte (or vmas) every 30 second */ static unsigned int khugepaged_pages_to_scan __read_mostly; static unsigned int khugepaged_pages_collapsed; @@ -1952,8 +1955,6 @@ static void set_recommended_min_free_kbytes(void)
int start_stop_khugepaged(void) { - static struct task_struct *khugepaged_thread __read_mostly; - static DEFINE_MUTEX(khugepaged_mutex); int err = 0;
mutex_lock(&khugepaged_mutex); @@ -1980,3 +1981,11 @@ int start_stop_khugepaged(void) mutex_unlock(&khugepaged_mutex); return err; } + +void khugepaged_min_free_kbytes_update(void) +{ + mutex_lock(&khugepaged_mutex); + if (khugepaged_enabled() && khugepaged_thread) + set_recommended_min_free_kbytes(); + mutex_unlock(&khugepaged_mutex); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f394e0df9c87..869074294cce 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -66,6 +66,7 @@ #include <linux/ftrace.h> #include <linux/lockdep.h> #include <linux/nmi.h> +#include <linux/khugepaged.h> #include <linux/ktask.h>
#include <asm/sections.h> @@ -7520,6 +7521,8 @@ int __meminit init_per_zone_wmark_min(void) setup_min_slab_ratio(); #endif
+ khugepaged_min_free_kbytes_update(); + return 0; } postcore_initcall(init_per_zone_wmark_min)
From: David Ahern dsahern@kernel.org
stable inclusion from linux-4.19.153 commit 1fecbf3ffd468ba0c398171ef7f6b2ed848c42f2
--------------------------------
[ Upstream commit 874fb9e2ca949b443cc419a4f2227cafd4381d39 ]
Tobias reported regressions in IPsec tests following the patch referenced by the Fixes tag below. The root cause is dropping the reset of the flowi4_oif after the fib_lookup. Apparently it is needed for xfrm cases, so restore the oif update to ip_route_output_flow right before the call to xfrm_lookup_route.
Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device") Reported-by: Tobias Brunner tobias@strongswan.org Signed-off-by: David Ahern dsahern@kernel.org Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 758acec80389..f74b7513d607 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2629,10 +2629,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt;
- if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + }
return rt; }
From: Yonghong Song yhs@fb.com
stable inclusion from linux-4.19.153 commit 6b00f0321c69e28228cc3024724c679463b4e863
--------------------------------
[ Upstream commit 6617dfd440149e42ce4d2be615eb31a4755f4d30 ]
Commit 4fc427e05158 ("ipv6_route_seq_next should increase position index") tried to fix the issue where seq_file pos is not increased if a NULL element is returned with seq_ops->next(). See bug https://bugzilla.kernel.org/show_bug.cgi?id=206283
The commit effectively does:
 - increase pos for all seq_ops->start()
 - increase pos for all seq_ops->next()
For ipv6_route, increasing pos for all seq_ops->next() is correct. But increasing pos for seq_ops->start() is not correct since pos is used to determine how many items to skip during seq_ops->start():

	iter->skip = *pos;

seq_ops->start() just fetches the *current* pos item. The item can be skipped only after seq_ops->show() which essentially is the beginning of seq_ops->next().
For example, I have 7 ipv6 route entries,

root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=4096
00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001 eth0
fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001 eth0
00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo
00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001 lo
fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0
ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000004 00000000 00000001 eth0
00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo
0+1 records in
0+1 records out
1050 bytes (1.0 kB, 1.0 KiB) copied, 0.00707908 s, 148 kB/s
root@arch-fb-vm1:~/net-next
In the above, I specify buffer size 4096, so all records can be returned to user space with a single trip to the kernel.
If I use a buffer size of 128, since each record is 149 bytes, the kernel's seq_read() will read 149 bytes into its internal buffer and return the data to user space in two read() syscalls. The next user read() syscall will then trigger the next seq_ops->start(). Since the current implementation increases pos even for seq_ops->start(), it will skip records #2, #4 and #6, assuming the first record is #1.
root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=128
00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001 eth0
00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo
fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0
00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo
4+1 records in
4+1 records out
600 bytes copied, 0.00127758 s, 470 kB/s
To fix the problem, create a fake pos pointer so seq_ops->start() won't actually increase seq_file pos. With this fix, the above `dd` command with `bs=128` will show the correct result.
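To make the skipping concrete, here is a hedged userspace model (not part of the patch); the record table, do_start() and do_next() are hypothetical stand-ins for the real seq_ops, assuming one record is shown per read() as in the bs=128 case:

#include <stdio.h>

#define NRECS 7
static const char *records[NRECS] = { "#1", "#2", "#3", "#4", "#5", "#6", "#7" };

/* next(): advance pos and fetch the element now at that position */
static const char *do_next(long *ppos)
{
	++*ppos;
	return *ppos < NRECS ? records[*ppos] : NULL;
}

/* start(): fetch the element at *ppos; the buggy variant also bumps the
 * real pos because it delegates to next() with it, while the fixed one
 * hands next() a throwaway local pos instead */
static const char *do_start(long *ppos, int fixed)
{
	long skip = *ppos;

	if (skip >= NRECS)
		return NULL;
	if (!fixed)
		++*ppos;	/* models next(seq, NULL, pos) inside start() */
	return records[skip];
}

static void simulate(int fixed)
{
	const char *p;
	long pos = 0;

	printf("%s:", fixed ? "fixed" : "buggy");
	/* each pass models one read() whose buffer fits a single record */
	while ((p = do_start(&pos, fixed)) != NULL) {
		printf(" %s", p);	/* show() */
		do_next(&pos);		/* seq_read() prefetch before stop() */
	}
	printf("\n");	/* buggy: #1 #3 #5 #7    fixed: #1 ... #7 */
}

int main(void)
{
	simulate(0);
	simulate(1);
	return 0;
}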
Fixes: 4fc427e05158 ("ipv6_route_seq_next should increase position index") Cc: Alexei Starovoitov ast@kernel.org Suggested-by: Vasily Averin vvs@virtuozzo.com Reviewed-by: Vasily Averin vvs@virtuozzo.com Signed-off-by: Yonghong Song yhs@fb.com Acked-by: Martin KaFai Lau kafai@fb.com Acked-by: Andrii Nakryiko andrii@kernel.org Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/ip6_fib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b924941b96a3..8b5459b34bc4 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2417,8 +2417,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos;
if (iter->tbl) { + loff_t p = 0; + ipv6_route_seq_setup_walk(iter, net); - return ipv6_route_seq_next(seq, NULL, pos); + return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; }
From: Maciej Żenczykowski maze@google.com
stable inclusion from linux-4.19.153 commit 9db62b759161b9e75626e419d85d6944a23a2ab1
--------------------------------
[ Upstream commit 02a1b175b0e92d9e0fa5df3957ade8d733ceb6a0 ]
Documentation/networking/ip-sysctl.txt:46 says:

ip_forward_use_pmtu - BOOLEAN
	By default we don't trust protocol path MTUs while forwarding
	because they could be easily forged and can lead to unwanted
	fragmentation by the router.
	You only need to enable this if you have user-space software
	which tries to discover path mtus by itself and depends on the
	kernel honoring this information. This is normally not the case.
	Default: 0 (disabled)
	Possible values:
	0 - disabled
	1 - enabled
Which makes it pretty clear that setting it to 1 is a potential security/safety/DoS issue, and yet it is entirely reasonable to want forwarded traffic to honour explicitly administrator-configured route mtus (instead of defaulting to the device mtu).
Indeed, I can't think of a single reason why you wouldn't want to. Since you configured a route mtu you probably know better...
It is pretty common to have a higher device mtu to allow receiving large (jumbo) frames, while having some routes via that interface (potentially including the default route to the internet) specify a lower mtu.
Note that ipv6 forwarding uses device mtu unless the route is locked (in which case it will use the route mtu).
This approach is not usable for IPv4 where an 'mtu lock' on a route also has the side effect of disabling TCP path mtu discovery via disabling the IPv4 DF (don't frag) bit on all outgoing frames.
I'm not aware of a way to lock a route from an IPv6 RA, so that also potentially seems wrong.
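For illustration, the resulting selection order for forwarded traffic can be sketched as a standalone C function (not the kernel implementation; the scalar parameters are simplified stand-ins for the dst/net state, and IP_MAX_MTU mirrors the kernel value):

#include <stdio.h>

#define IP_MAX_MTU 0xFFFFU

static unsigned int mtu_maybe_forward(int fwd_use_pmtu, int mtu_locked,
				      int forwarding, unsigned int path_mtu,
				      unsigned int route_mtu,
				      unsigned int dev_mtu)
{
	/* trust the path mtu when the sysctl allows it, when the route is
	 * locked, or when the packet is locally generated */
	if (fwd_use_pmtu || mtu_locked || !forwarding)
		return path_mtu;

	/* forwarding: honour an explicitly configured route mtu first ... */
	if (route_mtu)
		return route_mtu;

	/* ... and only then fall back to the device mtu */
	return dev_mtu < IP_MAX_MTU ? dev_mtu : IP_MAX_MTU;
}

int main(void)
{
	/* jumbo-frame device (9000) with a route mtu of 1400: forwarded
	 * traffic now honours 1400 instead of using the device mtu */
	printf("route mtu set: %u\n", mtu_maybe_forward(0, 0, 1, 1280, 1400, 9000));
	printf("no route mtu:  %u\n", mtu_maybe_forward(0, 0, 1, 1280, 0, 9000));
	return 0;
}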
Signed-off-by: Maciej Żenczykowski maze@google.com Cc: Eric Dumazet edumazet@google.com Cc: Willem de Bruijn willemb@google.com Cc: Lorenzo Colitti lorenzo@google.com Cc: Sunmeet Gill (Sunny) sgill@quicinc.com Cc: Vinay Paradkar vparadka@qti.qualcomm.com Cc: Tyler Wear twear@quicinc.com Cc: David Ahern dsahern@kernel.org Reviewed-by: Eric Dumazet edumazet@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/ip.h | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/include/net/ip.h b/include/net/ip.h index 7d9e63bd723f..17766586dfa0 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -399,12 +399,18 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, bool forwarding) { struct net *net = dev_net(dst->dev); + unsigned int mtu;
if (net->ipv4.sysctl_ip_fwd_use_pmtu || ip_mtu_locked(dst) || !forwarding) return dst_mtu(dst);
+ /* 'forwarding = true' case should always honour route mtu */ + mtu = dst_metric_raw(dst, RTAX_MTU); + if (mtu) + return mtu; + return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); }
From: Davide Caratti dcaratti@redhat.com
stable inclusion from linux-4.19.153 commit a298ba5e7d1715d4c3a5844b6e7a5c7bd7feff87
--------------------------------
[ Upstream commit a7a12b5a0f950bc6b9f7153390634ea798738db9 ]
the following command
# tc action add action tunnel_key \
set src_ip 2001:db8::1 dst_ip 2001:db8::2 id 10 erspan_opts 1:6789:0:0
generates the following splat:
BUG: KASAN: slab-out-of-bounds in tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] Write of size 4 at addr ffff88813f5f1cc8 by task tc/873
CPU: 2 PID: 873 Comm: tc Not tainted 5.9.0+ #282 Hardware name: Red Hat KVM, BIOS 1.11.1-4.module+el8.1.0+4066+0f1aadab 04/01/2014 Call Trace: dump_stack+0x99/0xcb print_address_description.constprop.7+0x1e/0x230 kasan_report.cold.13+0x37/0x7c tunnel_key_copy_opts+0xcc9/0x1010 [act_tunnel_key] tunnel_key_init+0x160c/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f872a96b338 Code: 89 02 48 c7 c0 ff ff ff ff eb b5 0f 1f 80 00 00 00 00 f3 0f 1e fa 48 8d 05 25 43 2c 00 8b 00 85 c0 75 17 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 58 c3 0f 1f 80 00 00 00 00 41 54 41 89 d4 55 RSP: 002b:00007ffffe367518 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 000000005f8f5aed RCX: 00007f872a96b338 RDX: 0000000000000000 RSI: 00007ffffe367580 RDI: 0000000000000003 RBP: 0000000000000000 R08: 0000000000000001 R09: 000000000000001c R10: 000000000000000b R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000686760 R14: 0000000000000601 R15: 0000000000000000
Allocated by task 873: kasan_save_stack+0x19/0x40 __kasan_kmalloc.constprop.7+0xc1/0xd0 __kmalloc+0x151/0x310 metadata_dst_alloc+0x20/0x40 tunnel_key_init+0xfff/0x1f40 [act_tunnel_key] tcf_action_init_1+0x5b5/0x850 tcf_action_init+0x15d/0x370 tcf_action_add+0xd9/0x2f0 tc_ctl_action+0x29b/0x3a0 rtnetlink_rcv_msg+0x341/0x8d0 netlink_rcv_skb+0x120/0x380 netlink_unicast+0x439/0x630 netlink_sendmsg+0x719/0xbf0 sock_sendmsg+0xe2/0x110 ____sys_sendmsg+0x5ba/0x890 ___sys_sendmsg+0xe9/0x160 __sys_sendmsg+0xd3/0x170 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9
The buggy address belongs to the object at ffff88813f5f1c00 which belongs to the cache kmalloc-256 of size 256 The buggy address is located 200 bytes inside of 256-byte region [ffff88813f5f1c00, ffff88813f5f1d00) The buggy address belongs to the page: page:0000000011b48a19 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x13f5f0 head:0000000011b48a19 order:1 compound_mapcount:0 flags: 0x17ffffc0010200(slab|head) raw: 0017ffffc0010200 0000000000000000 0000000d00000001 ffff888107c43400 raw: 0000000000000000 0000000080100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff88813f5f1b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
ffff88813f5f1c80: 00 00 00 00 00 00 00 00 00 fc fc fc fc fc fc fc
^ ffff88813f5f1d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff88813f5f1d80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
When using IPv6 tunnels, act_tunnel_key allocates a fixed amount of memory for the tunnel metadata, but then expects additional bytes to store tunnel-specific metadata with tunnel_key_copy_opts().
Fix the arguments of __ipv6_tun_set_dst(), so that 'md_size' contains the size previously computed by tunnel_key_get_opts_len(), like it's done for IPv4 tunnels.
Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key") Reported-by: Shuang Li shuali@redhat.com Signed-off-by: Davide Caratti dcaratti@redhat.com Acked-by: Cong Wang xiyou.wangcong@gmail.com Link: https://lore.kernel.org/r/36ebe969f6d13ff59912d6464a4356fe6f103766.160323110... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sched/act_tunnel_key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index e4fc6b2bc29d..f43234be5695 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -314,7 +314,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
metadata = __ipv6_tun_set_dst(&saddr, &daddr, tos, ttl, dst_port, 0, flags, - key_id, 0); + key_id, opts_len); } else { NL_SET_ERR_MSG(extack, "Missing either ipv4 or ipv6 src and dst"); ret = -EINVAL;
From: Neal Cardwell ncardwell@google.com
stable inclusion from linux-4.19.153 commit 7e1c74befe158d86c5b4e2ef3f174db5dd537000
--------------------------------
[ Upstream commit 18ded910b589839e38a51623a179837ab4cc3789 ]
In the header prediction fast path for a bulk data receiver, if no data is newly acknowledged then we do not call tcp_ack() and do not call tcp_ack_update_window(). This means that a bulk receiver that receives large amounts of data can have the incoming sequence numbers wrap, so that the check in tcp_may_update_window fails: after(ack_seq, tp->snd_wl1)
If the incoming receive windows are zero in this state, and then the connection that was a bulk data receiver later wants to send data, that connection can find itself persistently rejecting the window updates in incoming ACKs. This means the connection can persistently fail to discover that the receive window has opened, which in turn means that the connection is unable to send anything, and the connection's sending process can get permanently "stuck".
The fix is to update snd_wl1 in the header prediction fast path for a bulk data receiver, so that it keeps up and does not see wrapping problems.
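A hedged standalone sketch of the serial-number comparison involved (the constants are illustrative, not from a real trace):

#include <stdint.h>
#include <stdio.h>

/* serial-number arithmetic: true if a is strictly later than b mod 2^32 */
static int seq_after(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	uint32_t snd_wl1 = 0x10000000;			/* last window update */
	uint32_t ack_seq = snd_wl1 + 0x80000000u + 1;	/* > 2 GB received since */

	/* once more than 2^31 bytes pass without snd_wl1 moving, fresh
	 * ACKs compare as "old" and tcp_may_update_window() keeps failing */
	printf("after(ack_seq, snd_wl1) = %d\n", seq_after(ack_seq, snd_wl1));
	return 0;
}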
This fix is based on a very nice and thorough analysis and diagnosis by Apollon Oikonomopoulos (see link below).
This is a stable candidate but there is no Fixes tag here since the bug predates current git history. Just for fun: looks like the bug dates back to when header prediction was added in Linux v2.1.8 in Nov 1996. In that version tcp_rcv_established() was added, and the code only updates snd_wl1 in tcp_ack(), and in the new "Bulk data transfer: receiver" code path it does not call tcp_ack(). This fix seems to apply cleanly at least as far back as v3.2.
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Neal Cardwell ncardwell@google.com Reported-by: Apollon Oikonomopoulos apoikos@dmesg.gr Tested-by: Apollon Oikonomopoulos apoikos@dmesg.gr Link: https://www.spinics.net/lists/netdev/msg692430.html Acked-by: Soheil Hassas Yeganeh soheil@google.com Acked-by: Yuchung Cheng ycheng@google.com Signed-off-by: Eric Dumazet edumazet@google.com Link: https://lore.kernel.org/r/20201022143331.1887495-1-ncardwell.kernel@gmail.co... Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp_input.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 493e42755535..8f04d2eb2799 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5639,6 +5639,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; + } else { + tcp_update_wl(tp, TCP_SKB_CB(skb)->seq); }
__tcp_ack_snd_check(sk, 0);
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.153 commit d6c552505c0d1719dda42b4af2def0618bd7bf54
--------------------------------
[ Upstream commit b38e7819cae946e2edf869e604af1e65a5d241c5 ]
Keyu Man reported that the ICMP rate limiter could be used by attackers to get useful signal. Details will be provided in an upcoming academic publication.
Our solution is to add some noise, so that the attackers no longer can get help from the predictable token bucket limiter.
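A rough userspace model of the randomized consumption (rand() stands in for prandom_u32_max(), and the burst size is the default of 50):

#include <stdio.h>
#include <stdlib.h>

static unsigned int credit = 50;	/* bucket fill, as after a refill */

/* mimics the fixed icmp_global_allow(): consume 0..2 credits per message
 * so the average cost stays ~1 but the exact budget is unpredictable */
static int icmp_allow(void)
{
	int c;

	if (!credit)
		return 0;
	c = (int)credit - rand() % 3;	/* prandom_u32_max(3) in the kernel */
	credit = c > 0 ? c : 0;
	return 1;
}

int main(void)
{
	int sent = 0;

	srand(1);	/* deterministic seed, for the demo only */
	while (icmp_allow())
		sent++;
	printf("a burst of 50 credits allowed %d messages\n", sent);
	return 0;
}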
Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: Keyu Man kman001@ucr.edu Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- Documentation/networking/ip-sysctl.txt | 4 +++- net/ipv4/icmp.c | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index a8ae7ca9acb4..c99a4423ffba 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -946,12 +946,14 @@ icmp_ratelimit - INTEGER icmp_msgs_per_sec - INTEGER Limit maximal number of ICMP packets sent per second from this host. Only messages whose type matches icmp_ratemask (see below) are - controlled by this limit. + controlled by this limit. For security reasons, the precise count + of messages per second is randomized. Default: 1000
icmp_msgs_burst - INTEGER icmp_msgs_per_sec controls number of ICMP packets sent per second, while icmp_msgs_burst controls the burst size of these packets. + For security reasons, the precise burst size is randomized. Default: 50
icmp_ratemask - INTEGER diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4020ab318da4..71a25e6db892 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -246,7 +246,7 @@ static struct { /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * - * Uses a token bucket to limit our ICMP messages to sysctl_icmp_msgs_per_sec. + * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. * Note: called with BH disabled */ @@ -274,7 +274,10 @@ bool icmp_global_allow(void) } credit = min_t(u32, icmp_global.credit + incr, sysctl_icmp_msgs_burst); if (credit) { - credit--; + /* We want to use a credit of one in average, but need to randomize + * it for security reasons. + */ + credit = max_t(int, credit - prandom_u32_max(3), 0); rc = true; } WRITE_ONCE(icmp_global.credit, credit);
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.153 commit ab57a8aa9d6ccbee6c8b8bcc4e32a5772272cbcf
--------------------------------
commit d367cb960ce88914898cbfa43645c2e43ede9465 upstream.
The "end" pointer is either NULL or it points to the next byte to parse. If there isn't a next byte then dereferencing "end" is an off-by-one out of bounds error. And, of course, if it's NULL that leads to an Oops. Printing "*end" doesn't seem very useful so let's delete this code.
Also for the last debug statement, I noticed that it should be printing "sequence_end" instead of "end" so fix that as well.
Reported-by: Dominik Maier dmaier@sect.tu-berlin.de Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Steve French stfrench@microsoft.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/cifs/asn1.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 3d19595eb352..4a9b53229fba 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -541,8 +541,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; }
@@ -552,8 +552,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 1\n", + cls, con, tag, end); return 0; }
@@ -563,8 +563,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 0\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p exit 0\n", + cls, con, tag, end); return 0; }
@@ -575,8 +575,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { - cifs_dbg(FYI, "cls = %d con = %d tag = %d end = %p (%d) exit 1\n", - cls, con, tag, end, *end); + cifs_dbg(FYI, "cls = %d con = %d tag = %d sequence_end = %p exit 1\n", + cls, con, tag, sequence_end); return 0; }
From: Shyam Prasad N sprasad@microsoft.com
stable inclusion from linux-4.19.153 commit a4f751c41a8b67770667451167bdc2ac61e077ac
--------------------------------
commit 0bd294b55a5de442370c29fa53bab17aef3ff318 upstream.
In crypt_message, when smb2_get_enc_key returns an error, we need to return the error to the caller. If not, we end up processing the message further, causing a kernel oops due to an invalid memory access.
Call Trace: smb3_receive_transform+0x120/0x870 [cifs] cifs_demultiplex_thread+0xb53/0xc20 [cifs] ? cifs_handle_standard+0x190/0x190 [cifs] kthread+0x116/0x130 ? kthread_park+0x80/0x80 ret_from_fork+0x1f/0x30
Signed-off-by: Shyam Prasad N sprasad@microsoft.com Reviewed-by: Pavel Shilovsky pshilov@microsoft.com Reviewed-by: Ronnie Sahlberg lsahlber@redhat.com CC: Stable stable@vger.kernel.org Signed-off-by: Steve French stfrench@microsoft.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d85ed2ace9ac..9e3ee00d1a49 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -2702,7 +2702,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (rc) { cifs_dbg(VFS, "%s: Could not get %scryption key\n", __func__, enc ? "en" : "de"); - return 0; + return rc; }
rc = smb3_crypto_aead_allocate(server);
From: Julian Anastasov ja@ssi.bg
stable inclusion from linux-4.19.153 commit 69a117d795ac1c79105c2284452095eed5becc6d
--------------------------------
[ Upstream commit 7980d2eabde82be86c5be18aa3d07e88ec13c6a1 ]
The fq qdisc requires skb->tstamp to be cleared in the forwarding path.
Reported-by: Evgeny B abt-admin@mail.ru Link: https://bugzilla.kernel.org/show_bug.cgi?id=209427 Suggested-by: Eric Dumazet eric.dumazet@gmail.com Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Signed-off-by: Julian Anastasov ja@ssi.bg Reviewed-by: Simon Horman horms@verge.net.au Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/ipvs/ip_vs_xmit.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 3f75cd947045..11f7c546e57b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -586,6 +586,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; } return ret; } @@ -626,6 +628,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) { skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -646,6 +650,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.153 commit d74d61d90babd63b2fc953fbf0ce5f4fe2d47cf0
--------------------------------
[ Upstream commit 0d9826bc18ce356e8909919ad681ad65d0a6061e ]
Dump the vlan tag and protocol for the usual vlan offload case if the NF_LOG_MACDECODE flag is set. Without this information the logging is misleading, as there is no reference to the VLAN header.
[12716.993704] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0800 SRC=192.168.10.2 DST=172.217.168.163 LEN=52 TOS=0x00 PREC=0x00 TTL=64 ID=2548 DF PROTO=TCP SPT=55848 DPT=80 WINDOW=501 RES=0x00 ACK FIN URGP=0
[12721.157643] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0806 ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=86:6c:92:ea:d6:73 IPSRC=192.168.10.2 MACDST=0e:3b:eb:86:73:76 IPDST=192.168.10.1
Fixes: 83e96d443b37 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/netfilter/nf_log.h | 1 + net/ipv4/netfilter/nf_log_arp.c | 19 +++++++++++++++++-- net/ipv4/netfilter/nf_log_ipv4.c | 6 ++++-- net/ipv6/netfilter/nf_log_ipv6.c | 8 +++++--- net/netfilter/nf_log_common.c | 12 ++++++++++++ 5 files changed, 39 insertions(+), 7 deletions(-)
diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 0d3920896d50..716db4a0fed8 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -108,6 +108,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, unsigned int logflags); void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index df5c2a2061a4..19fff2c589fa 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -46,16 +46,31 @@ static void dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { - const struct arphdr *ah; - struct arphdr _arph; const struct arppayload *ap; struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph;
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { nf_log_buf_add(m, "TRUNCATED"); return; } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op));
diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 1e6f28c97d3a..cde1918607e9 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -287,8 +287,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m,
switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index c6bf580d0f33..c456e2f902b9 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -300,9 +300,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m,
switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); return; default: break; diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index a8c5c846aec1..b164a0e1e053 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -176,6 +176,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common);
+void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +EXPORT_SYMBOL_GPL(nf_log_dump_vlan); + /* bridge and netdev logging families share this code. */ void nf_log_l2packet(struct net *net, u_int8_t pf, __be16 protocol,
From: Ralph Campbell rcampbell@nvidia.com
stable inclusion from linux-4.19.153 commit 2761fff65fbf59cd77346fc63b76f651cf0f97d2
--------------------------------
[ Upstream commit 9a137153fc8798a89d8fce895cd0a06ea5b8e37c ]
The code in mc_handle_swap_pte() checks for non_swap_entry() and returns NULL before checking is_device_private_entry(), so device private pages are never handled. Fix this by checking for non_swap_entry() after handling device private swap PTEs.
I assume the memory cgroup accounting would be off somehow when moving a process to another memory cgroup. Currently, the device private page is charged like a normal anonymous page when allocated and is uncharged when the page is freed so I think that path is OK.
Signed-off-by: Ralph Campbell rcampbell@nvidia.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Johannes Weiner hannes@cmpxchg.org Cc: Michal Hocko mhocko@kernel.org Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Jerome Glisse jglisse@redhat.com Cc: Balbir Singh bsingharora@gmail.com Cc: Ira Weiny ira.weiny@intel.com Link: https://lkml.kernel.org/r/20201009215952.2726-1-rcampbell@nvidia.com xFixes: c733a82874a7 ("mm/memcontrol: support MEMORY_DEVICE_PRIVATE") Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/memcontrol.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1342b9540476..d926de1686ba 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4799,7 +4799,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, struct page *page = NULL; swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent)) + if (!(mc.flags & MOVE_ANON)) return NULL;
/* @@ -4818,6 +4818,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return page; }
+ if (non_swap_entry(ent)) + return NULL; + /* * Because lookup_swap_cache() updates some statistics counter, * we call find_get_page() with swapper_space directly.
From: Suren Baghdasaryan surenb@google.com
stable inclusion from linux-4.19.153 commit a3d0ceee716047c5e8a2bcdd3192f885f404386d
--------------------------------
[ Upstream commit 67197a4f28d28d0b073ab0427b03cb2ee5382578 ]
Currently __set_oom_adj loops through all processes in the system to keep oom_score_adj and oom_score_adj_min in sync between processes sharing their mm. This is done for any task whose mm_users count is greater than one, which includes processes with multiple threads (sharing mm and signals). However, for such processes the loop is unnecessary because their signal structure is shared as well.
Android updates oom_score_adj whenever a task changes its role (background/foreground/...) or binds to/unbinds from a service, making it more/less important. Such operations can happen frequently. We noticed that updates to oom_score_adj became more expensive, and after further investigation found out that the patch mentioned in "Fixes" introduced a regression. Using a Pixel 4 with a typical Android workload, write time to oom_score_adj increased from ~3.57us to ~362us. Moreover, this regression depends linearly on the number of multi-threaded processes running on the system.
Mark the mm with a new MMF_MULTIPROCESS flag bit when a task is created with (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK). Change __set_oom_adj to use MMF_MULTIPROCESS instead of mm_users to decide whether the oom_score_adj update should be synchronized between multiple processes. To prevent races between clone() and __set_oom_adj(), when oom_score_adj of the process being cloned might be modified from userspace, we use oom_adj_mutex. Its scope is changed to global.
The combination of (CLONE_VM && !CLONE_THREAD) is rarely used except for the case of vfork(). To prevent performance regressions of vfork(), we skip taking oom_adj_mutex and setting MMF_MULTIPROCESS when CLONE_VFORK is specified. Clearing the MMF_MULTIPROCESS flag (when the last process sharing the mm exits) is left out of this patch to keep it simple and because it is believed that this threading model is rare. Should there ever be a need for optimizing that case as well, it can be done by hooking into the exit path, likely following the mm_update_next_owner pattern.
With the combination of (CLONE_VM && !CLONE_THREAD && !CLONE_VFORK) being quite rare, the regression is gone after the change is applied.
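The predicate is compact enough to check in a standalone sketch; the flag values below mirror the uapi definitions, and is_multiprocess_mm() is a hypothetical model of the test copy_oom_score_adj() performs:

#include <stdio.h>

#define CLONE_VM	0x00000100	/* values mirror uapi/linux/sched.h */
#define CLONE_VFORK	0x00004000
#define CLONE_THREAD	0x00010000

/* true only for a separate process that shares the mm: CLONE_VM set,
 * CLONE_THREAD and CLONE_VFORK both clear */
static int is_multiprocess_mm(unsigned long clone_flags)
{
	return (clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) ==
	       CLONE_VM;
}

int main(void)
{
	printf("fork():          %d\n", is_multiprocess_mm(0));
	printf("thread:          %d\n", is_multiprocess_mm(CLONE_VM | CLONE_THREAD));
	printf("vfork():         %d\n", is_multiprocess_mm(CLONE_VM | CLONE_VFORK));
	printf("clone(CLONE_VM): %d\n", is_multiprocess_mm(CLONE_VM));
	return 0;
}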
[surenb@google.com: v3] Link: https://lkml.kernel.org/r/20200902012558.2335613-1-surenb@google.com
Fixes: 44a70adec910 ("mm, oom_adj: make sure processes sharing mm have same view of oom_score_adj") Reported-by: Tim Murray timmurray@google.com Suggested-by: Michal Hocko mhocko@kernel.org Signed-off-by: Suren Baghdasaryan surenb@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Acked-by: Christian Brauner christian.brauner@ubuntu.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Oleg Nesterov oleg@redhat.com Cc: Ingo Molnar mingo@kernel.org Cc: Peter Zijlstra peterz@infradead.org Cc: Thomas Gleixner tglx@linutronix.de Cc: Eugene Syromiatnikov esyr@redhat.com Cc: Christian Kellner christian@kellner.me Cc: Adrian Reber areber@redhat.com Cc: Shakeel Butt shakeelb@google.com Cc: Aleksa Sarai cyphar@cyphar.com Cc: Alexey Dobriyan adobriyan@gmail.com Cc: "Eric W. Biederman" ebiederm@xmission.com Cc: Alexey Gladkov gladkov.alexey@gmail.com Cc: Michel Lespinasse walken@google.com Cc: Daniel Jordan daniel.m.jordan@oracle.com Cc: Andrei Vagin avagin@gmail.com Cc: Bernd Edlinger bernd.edlinger@hotmail.de Cc: John Johansen john.johansen@canonical.com Cc: Yafang Shao laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20200824153036.3201505-1-surenb@google.com Debugged-by: Minchan Kim minchan@kernel.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/base.c | 3 +-- include/linux/oom.h | 1 + include/linux/sched/coredump.h | 1 + kernel/fork.c | 21 +++++++++++++++++++++ mm/oom_kill.c | 2 ++ 5 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 3b9b726b1a6c..5e705fa9a913 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1035,7 +1035,6 @@ static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) { - static DEFINE_MUTEX(oom_adj_mutex); struct mm_struct *mm = NULL; struct task_struct *task; int err = 0; @@ -1075,7 +1074,7 @@ static int __set_oom_adj(struct file *file, int oom_adj, bool legacy) struct task_struct *p = find_lock_task_mm(task);
if (p) { - if (atomic_read(&p->mm->mm_users) > 1) { + if (test_bit(MMF_MULTIPROCESS, &p->mm->flags)) { mm = p->mm; mmgrab(mm); } diff --git a/include/linux/oom.h b/include/linux/oom.h index 689d32ab694b..2b2a40cc19bf 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -45,6 +45,7 @@ struct oom_control { };
extern struct mutex oom_lock; +extern struct mutex oom_adj_mutex;
static inline void set_current_oom_origin(void) { diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index ecdc6542070f..dfd82eab2902 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -72,6 +72,7 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/kernel/fork.c b/kernel/fork.c index e333d203a8ce..68b0fd51302e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1708,6 +1708,25 @@ static __always_inline void delayed_free_task(struct task_struct *tsk) #endif /* CONFIG_MEMCG */ }
+static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) +{ + /* Skip if kernel thread */ + if (!tsk->mm) + return; + + /* Skip if spawning a thread or using vfork */ + if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) + return; + + /* We need to synchronize with __set_oom_adj */ + mutex_lock(&oom_adj_mutex); + set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); + /* Update the values in case they were changed after copy_signal */ + tsk->signal->oom_score_adj = current->signal->oom_score_adj; + tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; + mutex_unlock(&oom_adj_mutex); +} + /* * This creates a new process as a copy of the old one, * but does not actually start it yet. @@ -2147,6 +2166,8 @@ static __latent_entropy struct task_struct *copy_process( trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags);
+ copy_oom_score_adj(clone_flags, p); + return p;
bad_fork_cancel_cgroup: diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 46cd317e6743..1c8236cbb902 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -65,6 +65,8 @@ int sysctl_enable_oom_killer = 1; * and mark_oom_victim */ DEFINE_MUTEX(oom_lock); +/* Serializes oom_score_adj and oom_score_adj_min updates */ +DEFINE_MUTEX(oom_adj_mutex);
#ifdef CONFIG_NUMA /**
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.153 commit 56c37c5dba2ce51e67167a6c4545f230a23607b9
--------------------------------
[ Upstream commit acd1ac3aa22fd58803a12d26b1ab7f70232f8d8d ]
If userspace asked fsmap to count the number of entries, we cannot return more than UINT_MAX entries because fmh_entries is u32. Therefore, stop counting if we hit this limit, or else we will waste time only to return truncated results.
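The underlying arithmetic is plain u32 wraparound, e.g.:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t fmh_entries = UINT32_MAX;	/* the u32 counter is full */

	fmh_entries++;		/* one more increment silently wraps to 0 */
	printf("fmh_entries = %u\n", fmh_entries);
	return 0;
}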
Fixes: e89c041338ed ("xfs: implement the GETFSMAP ioctl") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Christoph Hellwig hch@lst.de Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/xfs_fsmap.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 910f0e9b5c68..9e623496ab68 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -259,6 +259,9 @@ xfs_getfsmap_helper(
/* Are we just counting mappings? */ if (info->head->fmh_count == 0) { + if (info->head->fmh_entries == UINT_MAX) + return -ECANCELED; + if (rec_daddr > info->next_daddr) info->head->fmh_entries++;
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.153 commit dffa7657483b1e640196d54061dbcbfe8769444e
--------------------------------
[ Upstream commit d88850bd5516a77c6f727e8b6cefb64e0cc929c7 ]
Fix some off-by-one errors in xfs_rtalloc_query_range. The highest key in the realtime bitmap is always one less than the number of rt extents, which means that the key clamp at the start of the function is wrong. The 4th argument to xfs_rtfind_forw is the highest rt extent that we want to probe, which means that passing 1 less than the high key is wrong. Finally, drop the rem variable that controls the loop because we can compare the iteration point (rtstart) against the high key directly.
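A toy model of the corrected iteration shape (the bitmap and find_forw() are hypothetical stand-ins for the realtime bitmap and xfs_rtfind_forw(); 1 means free):

#include <stdio.h>

#define NEXTENTS 10
static const int bitmap[NEXTENTS] = { 0, 1, 1, 1, 0, 0, 1, 1, 0, 1 };

/* last extent <= limit that is still in the same state as 'start' */
static int find_forw(int start, int limit)
{
	int i = start;

	while (i + 1 <= limit && bitmap[i + 1] == bitmap[start])
		i++;
	return i;
}

int main(void)
{
	int low = 0, high = NEXTENTS + 5;	/* caller passes an oversized key */
	int rtstart, rtend;

	/* clamp to the highest valid key: NEXTENTS - 1, not NEXTENTS */
	if (high > NEXTENTS - 1)
		high = NEXTENTS - 1;

	/* iterate inclusively; no separate 'rem' counter is needed, and
	 * find_forw() is probed up to 'high' itself, not 'high - 1' */
	for (rtstart = low; rtstart <= high; rtstart = rtend + 1) {
		rtend = find_forw(rtstart, high);
		if (bitmap[rtstart])
			printf("free extent: [%d, %d]\n", rtstart, rtend);
	}
	return 0;
}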
The sordid history of this function is that the original commit (fb3c3) incorrectly passed (high_rec->ar_startblock - 1) as the 'limit' parameter to xfs_rtfind_forw. This was wrong because the "high key" is supposed to be the largest key for which the caller wants result rows, not the key for the first row that could possibly be outside the range that the caller wants to see.
A subsequent attempt (8ad56) to strengthen the parameter checking added incorrect clamping of the parameters to the number of rt blocks in the system (despite the bitmap functions all taking units of rt extents) to avoid querying ranges past the end of rt bitmap file but failed to fix the incorrect _rtfind_forw parameter. The original _rtfind_forw parameter error then survived the conversion of the startblock and blockcount fields to rt extents (a0e5c), and the most recent off-by-one fix (a3a37) thought it was patching a problem when the end of the rt volume is not in use, but none of these fixes actually solved the original problem that the author was confused about the "limit" argument to xfs_rtfind_forw.
Sadly, all four of these patches were written by this author and even his own usage of this function and rt testing were inadequate to get this fixed quickly.
Original-problem: fb3c3de2f65c ("xfs: add a couple of queries to iterate free extents in the rtbitmap") Not-fixed-by: 8ad560d2565e ("xfs: strengthen rtalloc query range checks") Not-fixed-by: a0e5c435babd ("xfs: fix xfs_rtalloc_rec units") Fixes: a3a374bf1889 ("xfs: fix off-by-one error in xfs_rtalloc_query_range") Signed-off-by: Darrick J. Wong darrick.wong@oracle.com Reviewed-by: Chandan Babu R chandanrlinux@gmail.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/xfs/libxfs/xfs_rtbitmap.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-)
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index b228c821bae6..fe7323032e78 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1020,7 +1020,6 @@ xfs_rtalloc_query_range( struct xfs_mount *mp = tp->t_mountp; xfs_rtblock_t rtstart; xfs_rtblock_t rtend; - xfs_rtblock_t rem; int is_free; int error = 0;
@@ -1029,13 +1028,12 @@ xfs_rtalloc_query_range( if (low_rec->ar_startext >= mp->m_sb.sb_rextents || low_rec->ar_startext == high_rec->ar_startext) return 0; - if (high_rec->ar_startext > mp->m_sb.sb_rextents) - high_rec->ar_startext = mp->m_sb.sb_rextents; + high_rec->ar_startext = min(high_rec->ar_startext, + mp->m_sb.sb_rextents - 1);
/* Iterate the bitmap, looking for discrepancies. */ rtstart = low_rec->ar_startext; - rem = high_rec->ar_startext - rtstart; - while (rem) { + while (rtstart <= high_rec->ar_startext) { /* Is the first block free? */ error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, &is_free); @@ -1044,7 +1042,7 @@ xfs_rtalloc_query_range(
/* How long does the extent go for? */ error = xfs_rtfind_forw(mp, tp, rtstart, - high_rec->ar_startext - 1, &rtend); + high_rec->ar_startext, &rtend); if (error) break;
@@ -1057,7 +1055,6 @@ xfs_rtalloc_query_range( break; }
- rem -= rtend - rtstart + 1; rtstart = rtend + 1; }
From: Tobias Jordan kernel@cdqe.de
stable inclusion from linux-4.19.154 commit 67f4d89b4efa2726eb0f04e6987dbcde3e7c24a0
--------------------------------
[ Upstream commit 904542dc56524f921a6bab0639ff6249c01e775f ]
Whether crc32_be needs a lookup table is chosen based on CRC_LE_BITS. Obviously, the _be function should be governed by the _BE_ define.
This probably never pops up, as it's hard to come up with a configuration where CRC_BE_BITS isn't the same as CRC_LE_BITS, and as nobody is using bitwise CRC anyway.
Fixes: 46c5801eaf86 ("crc32: bolt on crc32c") Signed-off-by: Tobias Jordan kernel@cdqe.de Signed-off-by: Andrew Morton akpm@linux-foundation.org Cc: Krzysztof Kozlowski krzk@kernel.org Cc: Jonathan Corbet corbet@lwn.net Cc: Mauro Carvalho Chehab mchehab+huawei@kernel.org Link: https://lkml.kernel.org/r/20200923182122.GA3338@agrajag.zerfleddert.de Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- lib/crc32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/crc32.c b/lib/crc32.c index 4a20455d1f61..bf60ef26a45c 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -331,7 +331,7 @@ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, return crc; }
-#if CRC_LE_BITS == 1 +#if CRC_BE_BITS == 1 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE);
From: "Matthew Wilcox (Oracle)" willy@infradead.org
stable inclusion from linux-4.19.154 commit 438191cfde4257238e3dcd765ae77639e1824635
--------------------------------
[ Upstream commit 50b7d85680086126d7bd91dae81d57d4cb1ab6b7 ]
ramfs needs to check that pages are both physically contiguous and contiguous in the file. If the page cache happens to have, eg, page A for index 0 of the file, no page for index 1, and page A+1 for index 2, then an mmap of the first two pages of the file will succeed when it should fail.
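A hedged sketch of the invariant being enforced, using hypothetical (file index, page frame) pairs instead of real struct page; find_get_pages_contig() covers the file-index half, and physical contiguity is checked separately by the existing code:

#include <stdio.h>

struct fake_page {
	unsigned long index;	/* position in the file */
	unsigned long pfn;	/* physical page frame number */
};

/* an mmap of n pages is only valid if the pages are consecutive both in
 * the file and in physical memory */
static int mappable(const struct fake_page *p, int n)
{
	int i;

	for (i = 1; i < n; i++)
		if (p[i].index != p[0].index + i || p[i].pfn != p[0].pfn + i)
			return 0;
	return 1;
}

int main(void)
{
	/* page A at index 0 and page A+1 at index 2: physically contiguous,
	 * but the file has a hole at index 1, so the mmap must fail */
	struct fake_page bad[2]  = { { 0, 100 }, { 2, 101 } };
	struct fake_page good[2] = { { 0, 100 }, { 1, 101 } };

	printf("hole in file: %d\n", mappable(bad, 2));
	printf("contiguous:   %d\n", mappable(good, 2));
	return 0;
}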
Fixes: 642fb4d1f1dd ("[PATCH] NOMMU: Provide shared-writable mmap support on ramfs") Signed-off-by: Matthew Wilcox (Oracle) willy@infradead.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Cc: David Howells dhowells@redhat.com Link: https://lkml.kernel.org/r/20200914122239.GO6583@casper.infradead.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ramfs/file-nommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 3ac1f2387083..5e1ebbe639eb 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -228,7 +228,7 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file, if (!pages) goto out_free;
- nr = find_get_pages(inode->i_mapping, &pgoff, lpages, pages); + nr = find_get_pages_contig(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) goto out_free_pages; /* leave if some pages were missing */
From: Krzysztof Sobota krzysztof.sobota@nokia.com
stable inclusion from linux-4.19.141 commit ddf2b7891323fe30bfac761d2e12d3378a0d7eb2
--------------------------------
[ Upstream commit cb36e29bb0e4b0c33c3d5866a0a4aebace4c99b7 ]
When a watchdog device is registered, it calls misc_register, which makes the watchdog available for systemd to open. This is a data race scenario: when the device is opened, its device struct may still be uninitialized, and this in turn causes a crash. This patch moves device initialization before the misc_register call and solves the problem printed below.
------------[ cut here ]------------
WARNING: CPU: 3 PID: 1 at lib/kobject.c:612 kobject_get+0x50/0x54
kobject: '(null)' ((ptrval)): is not initialized, yet kobject_get() is being called.
Modules linked in: k2_reset_status(O) davinci_wdt(+) sfn_platform_hwbcn(O) fsmddg_sfn(O) clk_misc_mmap(O) clk_sw_bcn(O) fsp_reset(O) cma_mod(O) slave_sup_notif(O) fpga_master(O) latency(O+) evnotify(O) enable_arm_pmu(O) xge(O) rio_mport_cdev br_netfilter bridge stp llc nvrd_checksum(O) ipv6
CPU: 3 PID: 1 Comm: systemd Tainted: G O 4.19.113-g2579778-fsm4_k2 #1
Hardware name: Keystone
[<c02126c4>] (unwind_backtrace) from [<c020da94>] (show_stack+0x18/0x1c)
[<c020da94>] (show_stack) from [<c07f87d8>] (dump_stack+0xb4/0xe8)
[<c07f87d8>] (dump_stack) from [<c0221f70>] (__warn+0xfc/0x114)
[<c0221f70>] (__warn) from [<c0221fd8>] (warn_slowpath_fmt+0x50/0x74)
[<c0221fd8>] (warn_slowpath_fmt) from [<c07fd394>] (kobject_get+0x50/0x54)
[<c07fd394>] (kobject_get) from [<c0602ce8>] (get_device+0x1c/0x24)
[<c0602ce8>] (get_device) from [<c06961e0>] (watchdog_open+0x90/0xf0)
[<c06961e0>] (watchdog_open) from [<c06001dc>] (misc_open+0x130/0x17c)
[<c06001dc>] (misc_open) from [<c0388228>] (chrdev_open+0xec/0x1a8)
[<c0388228>] (chrdev_open) from [<c037fa98>] (do_dentry_open+0x204/0x3cc)
[<c037fa98>] (do_dentry_open) from [<c0391e2c>] (path_openat+0x330/0x1148)
[<c0391e2c>] (path_openat) from [<c0394518>] (do_filp_open+0x78/0xec)
[<c0394518>] (do_filp_open) from [<c0381100>] (do_sys_open+0x130/0x1f4)
[<c0381100>] (do_sys_open) from [<c0201000>] (ret_fast_syscall+0x0/0x28)
Exception stack(0xd2ceffa8 to 0xd2cefff0)
ffa0: b6f69968 00000000 ffffff9c b6ebd210 000a0001 00000000
ffc0: b6f69968 00000000 00000000 00000142 fffffffd ffffffff 00b65530 bed7bb78
ffe0: 00000142 bed7ba70 b6cc2503 b6cc41d6
---[ end trace 7b16eb105513974f ]---

------------[ cut here ]------------
WARNING: CPU: 3 PID: 1 at lib/refcount.c:153 kobject_get+0x24/0x54
refcount_t: increment on 0; use-after-free.
Modules linked in: k2_reset_status(O) davinci_wdt(+) sfn_platform_hwbcn(O) fsmddg_sfn(O) clk_misc_mmap(O) clk_sw_bcn(O) fsp_reset(O) cma_mod(O) slave_sup_notif(O) fpga_master(O) latency(O+) evnotify(O) enable_arm_pmu(O) xge(O) rio_mport_cdev br_netfilter bridge stp llc nvrd_checksum(O) ipv6
CPU: 3 PID: 1 Comm: systemd Tainted: G W O 4.19.113-g2579778-fsm4_k2 #1
Hardware name: Keystone
[<c02126c4>] (unwind_backtrace) from [<c020da94>] (show_stack+0x18/0x1c)
[<c020da94>] (show_stack) from [<c07f87d8>] (dump_stack+0xb4/0xe8)
[<c07f87d8>] (dump_stack) from [<c0221f70>] (__warn+0xfc/0x114)
[<c0221f70>] (__warn) from [<c0221fd8>] (warn_slowpath_fmt+0x50/0x74)
[<c0221fd8>] (warn_slowpath_fmt) from [<c07fd368>] (kobject_get+0x24/0x54)
[<c07fd368>] (kobject_get) from [<c0602ce8>] (get_device+0x1c/0x24)
[<c0602ce8>] (get_device) from [<c06961e0>] (watchdog_open+0x90/0xf0)
[<c06961e0>] (watchdog_open) from [<c06001dc>] (misc_open+0x130/0x17c)
[<c06001dc>] (misc_open) from [<c0388228>] (chrdev_open+0xec/0x1a8)
[<c0388228>] (chrdev_open) from [<c037fa98>] (do_dentry_open+0x204/0x3cc)
[<c037fa98>] (do_dentry_open) from [<c0391e2c>] (path_openat+0x330/0x1148)
[<c0391e2c>] (path_openat) from [<c0394518>] (do_filp_open+0x78/0xec)
[<c0394518>] (do_filp_open) from [<c0381100>] (do_sys_open+0x130/0x1f4)
[<c0381100>] (do_sys_open) from [<c0201000>] (ret_fast_syscall+0x0/0x28)
Exception stack(0xd2ceffa8 to 0xd2cefff0)
ffa0: b6f69968 00000000 ffffff9c b6ebd210 000a0001 00000000
ffc0: b6f69968 00000000 00000000 00000142 fffffffd ffffffff 00b65530 bed7bb78
ffe0: 00000142 bed7ba70 b6cc2503 b6cc41d6
---[ end trace 7b16eb1055139750 ]---
Fixes: 72139dfa2464 ("watchdog: Fix the race between the release of watchdog_core_data and cdev")
Reviewed-by: Guenter Roeck linux@roeck-us.net
Reviewed-by: Alexander Sverdlin alexander.sverdlin@nokia.com
Signed-off-by: Krzysztof Sobota krzysztof.sobota@nokia.com
Link: https://lore.kernel.org/r/20200717103109.14660-1-krzysztof.sobota@nokia.com
Signed-off-by: Guenter Roeck linux@roeck-us.net
Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/watchdog/watchdog_dev.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 6c7c5f4abeae..17d91eaceafb 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1010,6 +1010,15 @@ static int watchdog_cdev_register(struct watchdog_device *wdd)
 	if (IS_ERR_OR_NULL(watchdog_kworker))
 		return -ENODEV;
 
+	device_initialize(&wd_data->dev);
+	wd_data->dev.devt = MKDEV(MAJOR(watchdog_devt), wdd->id);
+	wd_data->dev.class = &watchdog_class;
+	wd_data->dev.parent = wdd->parent;
+	wd_data->dev.groups = wdd->groups;
+	wd_data->dev.release = watchdog_core_data_release;
+	dev_set_drvdata(&wd_data->dev, wdd);
+	dev_set_name(&wd_data->dev, "watchdog%d", wdd->id);
+
 	kthread_init_work(&wd_data->work, watchdog_ping_work);
 	hrtimer_init(&wd_data->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	wd_data->timer.function = watchdog_timer_expired;
@@ -1030,15 +1039,6 @@ static int watchdog_cdev_register(struct watchdog_device *wdd)
 		}
 	}
 
-	device_initialize(&wd_data->dev);
-	wd_data->dev.devt = MKDEV(MAJOR(watchdog_devt), wdd->id);
-	wd_data->dev.class = &watchdog_class;
-	wd_data->dev.parent = wdd->parent;
-	wd_data->dev.groups = wdd->groups;
-	wd_data->dev.release = watchdog_core_data_release;
-	dev_set_drvdata(&wd_data->dev, wdd);
-	dev_set_name(&wd_data->dev, "watchdog%d", wdd->id);
-
 	/* Fill in the data structures */
 	cdev_init(&wd_data->cdev, &watchdog_fops);
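The bug class here is publish-before-initialize. Below is a self-contained user-space sketch of the corrected ordering; every name in it is invented for illustration, and it is not the driver-core API:

/* Build and run: cc -pthread -o pub pub.c && ./pub */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct fake_dev { int initialized; };

static struct fake_dev dev;
static _Atomic(struct fake_dev *) published;

/* Models misc_open(): waits until the device is visible, then uses it. */
static void *opener(void *arg)
{
	struct fake_dev *d;

	while (!(d = atomic_load(&published)))
		;
	printf("open saw initialized=%d\n", d->initialized);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, opener, NULL);

	/* Buggy order would be: publish first, initialize later, letting
	 * the opener see a half-constructed object. Correct order, as in
	 * the patch: fully initialize... */
	dev.initialized = 1;
	/* ...then publish, mirroring device_initialize() running before
	 * misc_register()/cdev_add() makes the node reachable. */
	atomic_store(&published, &dev);

	pthread_join(t, NULL);
	return 0;
}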
From: Dinghao Liu dinghao.liu@zju.edu.cn
stable inclusion from linux-4.19.154 commit a19725131e162e85ef2aa08b457148073f5349f5
--------------------------------
[ Upstream commit 5afb6d203d0293512aa2c6ae098274a2a4f6ed02 ]
When watchdog_kworker is NULL, we should free wd_data before the function returns, to prevent a memory leak.
Fixes: 664a39236e718 ("watchdog: Introduce hardware maximum heartbeat in watchdog core")
Signed-off-by: Dinghao Liu dinghao.liu@zju.edu.cn
Reviewed-by: Guenter Roeck linux@roeck-us.net
Link: https://lore.kernel.org/r/20200824024001.25474-1-dinghao.liu@zju.edu.cn
Signed-off-by: Guenter Roeck linux@roeck-us.net
Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/watchdog/watchdog_dev.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 17d91eaceafb..ca8486a5850e 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1007,8 +1007,10 @@ static int watchdog_cdev_register(struct watchdog_device *wdd)
 	wd_data->wdd = wdd;
 	wdd->wd_data = wd_data;
 
-	if (IS_ERR_OR_NULL(watchdog_kworker))
+	if (IS_ERR_OR_NULL(watchdog_kworker)) {
+		kfree(wd_data);
 		return -ENODEV;
+	}
 
 	device_initialize(&wd_data->dev);
 	wd_data->dev.devt = MKDEV(MAJOR(watchdog_devt), wdd->id);
From: Dinghao Liu dinghao.liu@zju.edu.cn
stable inclusion from linux-4.19.154 commit 322661e7616d261e3e6dae9a088b900127d6b7c1
--------------------------------
[ Upstream commit 937425d4cd3ae4e2882b41e332bbbab616bcf0ad ]
We should use put_device() instead of freeing the device directly once device_initialize() has been called.
Fixes: cb36e29bb0e4b ("watchdog: initialize device before misc_register")
Signed-off-by: Dinghao Liu dinghao.liu@zju.edu.cn
Reviewed-by: Guenter Roeck linux@roeck-us.net
Link: https://lore.kernel.org/r/20200824031230.31050-1-dinghao.liu@zju.edu.cn
Signed-off-by: Guenter Roeck linux@roeck-us.net
Signed-off-by: Wim Van Sebroeck wim@linux-watchdog.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/watchdog/watchdog_dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index ca8486a5850e..2b652df44593 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -1036,7 +1036,7 @@ static int watchdog_cdev_register(struct watchdog_device *wdd)
 			pr_err("%s: a legacy watchdog module is probably present.\n",
 				wdd->info->identity);
 			old_wd_data = NULL;
-			kfree(wd_data);
+			put_device(&wd_data->dev);
 			return err;
 		}
 	}
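The reason kfree() is wrong on this path: once device_initialize() has run, the embedded kobject's refcount owns the allocation and only its release callback may free it. A toy refcount model of that ownership transfer (illustrative types and names, not the kernel's struct device):

/* Build and run: cc -o putdev putdev.c && ./putdev */
#include <stdio.h>
#include <stdlib.h>

struct fake_device {
	int refcount;
	void (*release)(struct fake_device *);
};

static void fake_device_initialize(struct fake_device *d,
				   void (*release)(struct fake_device *))
{
	d->refcount = 1;	/* from here on, the refcount owns it */
	d->release = release;
}

static void fake_put_device(struct fake_device *d)
{
	if (--d->refcount == 0)
		d->release(d);	/* the only safe free path */
}

struct wd_data {
	struct fake_device dev;	/* first member, so the cast below is valid */
	/* ... driver state that release() must free exactly once ... */
};

static void wd_release(struct fake_device *d)
{
	struct wd_data *wd = (struct wd_data *)d; /* container_of() analogue */

	printf("release: freeing wd_data\n");
	free(wd);
}

int main(void)
{
	struct wd_data *wd = malloc(sizeof(*wd));

	fake_device_initialize(&wd->dev, wd_release);
	/* On an error path after initialization, free(wd) here would bypass
	 * the refcount and double-free if anyone else held a reference;
	 * dropping our reference is the correct cleanup. */
	fake_put_device(&wd->dev);
	return 0;
}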
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.154 commit 3abb2ac9594b3a7a54086e05452dba25a011a78b
--------------------------------
[ Upstream commit af8c53c8bc087459b1aadd4c94805d8272358d79 ]
If userspace asked fsmap to count the number of entries, we cannot return more than UINT_MAX entries because fmh_entries is a u32. Therefore, stop counting once we hit this limit; otherwise we waste time only to return truncated results.
Fixes: 0c9ec4beecac ("ext4: support GETFSMAP ioctls")
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com
Link: https://lore.kernel.org/r/20201001222148.GA49520@magnolia
Signed-off-by: Theodore Ts'o tytso@mit.edu
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 fs/ext4/fsmap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 4b99e2db95b8..6f3f245f3a80 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -108,6 +108,9 @@ static int ext4_getfsmap_helper(struct super_block *sb,
 
 	/* Are we just counting mappings? */
 	if (info->gfi_head->fmh_count == 0) {
+		if (info->gfi_head->fmh_entries == UINT_MAX)
+			return EXT4_QUERY_RANGE_ABORT;
+
 		if (rec_fsblk > info->gfi_next_fsblk)
 			info->gfi_head->fmh_entries++;
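A tiny sketch of the saturating-counter guard, assuming the same u32 semantics as fmh_entries; the function name and return codes are illustrative only:

/* Build and run: cc -o sat sat.c && ./sat */
#include <limits.h>
#include <stdio.h>

static int count_mapping(unsigned int *entries)
{
	if (*entries == UINT_MAX)
		return -1;	/* models EXT4_QUERY_RANGE_ABORT */
	(*entries)++;		/* safe: cannot wrap past UINT_MAX */
	return 0;
}

int main(void)
{
	unsigned int entries = UINT_MAX - 1;

	printf("%d\n", count_mapping(&entries));	/* 0: counted */
	printf("%d\n", count_mapping(&entries));	/* -1: saturated */
	return 0;
}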
From: Alex Williamson alex.williamson@redhat.com
stable inclusion from linux-4.19.154 commit f393f4a5fcbc38248a9a261f4cbee8ea5b1296fd
--------------------------------
[ Upstream commit 852b1beecb6ff9326f7ca4bc0fe69ae860ebdb9e ]
The eventfd context is used as our irqbypass token, so if an eventfd is re-used, our token is the same. The irqbypass code will return -EBUSY in this case, but we'll still attempt to unregister the producer; if that duplicate token still exists, unregistering removes the wrong object. Clear the token of failed producers so that they harmlessly fall out when unregistered.
Fixes: 6d7425f109d2 ("vfio: Register/unregister irq_bypass_producer")
Reported-by: guomin chen guomin_chen@sina.com
Tested-by: guomin chen guomin_chen@sina.com
Signed-off-by: Alex Williamson alex.williamson@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/vfio/pci/vfio_pci_intrs.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index bdfdd506bc58..c989f777bf77 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -355,11 +355,13 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
 	vdev->ctx[vector].producer.token = trigger;
 	vdev->ctx[vector].producer.irq = irq;
 	ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
-	if (unlikely(ret))
+	if (unlikely(ret)) {
 		dev_info(&pdev->dev,
 		"irq bypass producer (token %p) registration fails: %d\n",
 		vdev->ctx[vector].producer.token, ret);
 
+		vdev->ctx[vector].producer.token = NULL;
+	}
 	vdev->ctx[vector].trigger = trigger;
 
 	return 0;
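A compact model of the duplicate-token hazard and the fix; the registry and all names below are invented for illustration and are not the irqbypass API:

/* Build and run: cc -o token token.c && ./token */
#include <stdio.h>

#define MAX 4
static void *registry[MAX];

static int reg(void *token)
{
	for (int i = 0; i < MAX; i++)
		if (registry[i] == token)
			return -16;	/* models -EBUSY on duplicate token */
	for (int i = 0; i < MAX; i++)
		if (!registry[i]) { registry[i] = token; return 0; }
	return -12;
}

static void unreg(void *token)
{
	if (!token)
		return;		/* cleared token: harmless no-op, the fix */
	for (int i = 0; i < MAX; i++)
		if (registry[i] == token) { registry[i] = NULL; return; }
}

int main(void)
{
	int eventfd = 42;			/* stands in for the eventfd ctx */
	void *a = &eventfd, *b = &eventfd;	/* re-used eventfd: same token */

	reg(a);
	if (reg(b) < 0)
		b = NULL;	/* clear the failed producer's token */
	/* Without the clearing, this would remove a's entry instead. */
	unreg(b);
	printf("a still registered: %d\n", registry[0] == a);
	return 0;
}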
From: Xiaoyang Xu xuxiaoyang2@huawei.com
stable inclusion from linux-4.19.154 commit 707f8d0d380d7893cca2c6f196e0281b09db6d36
--------------------------------
[ Upstream commit 2e6cfd496f5b57034cf2aec738799571b5a52124 ]
If vfio_add_to_pfn_list() fails, the pfn was never added to the pfn_list, so the error path's call to vfio_unpin_page_external() exits immediately without calling vfio_iova_put_vfio_pfn(), leading to a memory leak.
Fixes: a54eb55045ae ("vfio iommu type1: Add support for mediated devices")
Signed-off-by: Xiaoyang Xu xuxiaoyang2@huawei.com
[aw: simplified logic, add Fixes]
Signed-off-by: Alex Williamson alex.williamson@redhat.com
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/vfio/vfio_iommu_type1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 2bde1a098a29..5a106963dd08 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -640,7 +640,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
 		ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
 		if (ret) {
-			vfio_unpin_page_external(dma, iova, do_accounting);
+			if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
+				vfio_lock_acct(dma, -1, true);
 			goto pin_unwind;
 		}
 	}
From: Francesco Ruggeri fruggeri@arista.com
stable inclusion from linux-4.19.154 commit b221c4a1f8778923f1984092b132fe3e176677c4
--------------------------------
[ Upstream commit 4f25434bccc28cf8a07876ef5142a2869a674353 ]
If the first packet conntrack sees after a re-register is an outgoing keepalive packet with no data (SEG.SEQ = SND.NXT-1), td_end is set to SND.NXT-1. When the peer then correctly acknowledges SND.NXT, tcp_in_window() fails check III (upper bound for a valid (s)ack: sack <= receiver.td_end) and returns false, which cascades into nf_conntrack_in() setting skb->_nfct = 0, so later conntrack iptables rules no longer match. Where iptables drops packets that do not match conntrack rules, this can cause idle TCP connections to time out.
v2: adjust td_end when getting the reply rather than when sending out the keepalive packet.
Fixes: f94e63801ab2 ("netfilter: conntrack: reset tcp maxwin on re-register")
Signed-off-by: Francesco Ruggeri fruggeri@arista.com
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/netfilter/nf_conntrack_proto_tcp.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index fe4b198bbbf0..ba14ef107f3c 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -544,13 +544,20 @@ static bool tcp_in_window(const struct nf_conn *ct,
 			swin = win << sender->td_scale;
 			sender->td_maxwin = (swin == 0 ? 1 : swin);
 			sender->td_maxend = end + sender->td_maxwin;
-			/*
-			 * We haven't seen traffic in the other direction yet
-			 * but we have to tweak window tracking to pass III
-			 * and IV until that happens.
-			 */
-			if (receiver->td_maxwin == 0)
+			if (receiver->td_maxwin == 0) {
+				/* We haven't seen traffic in the other
+				 * direction yet but we have to tweak window
+				 * tracking to pass III and IV until that
+				 * happens.
+				 */
 				receiver->td_end = receiver->td_maxend = sack;
+			} else if (sack == receiver->td_end + 1) {
+				/* Likely a reply to a keepalive.
+				 * Needed for III.
+				 */
+				receiver->td_end++;
+			}
+		}
 	} else if (((state->state == TCP_CONNTRACK_SYN_SENT
 		  && dir == IP_CT_DIR_ORIGINAL)
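The arithmetic of the failing check can be reproduced in a few lines; this is only a model of check III with plain integers, not conntrack's real state machine:

/* Build and run: cc -o kk kk.c && ./kk */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t snd_nxt = 1000;
	uint32_t td_end  = snd_nxt - 1;	/* seeded from the empty keepalive */
	uint32_t sack    = snd_nxt;	/* peer correctly acks everything */

	printf("check III without fix: %s\n",
	       sack <= td_end ? "pass" : "FAIL");

	/* The patch: a sack exactly one past td_end is likely a reply to
	 * a keepalive, so nudge td_end forward. */
	if (sack == td_end + 1)
		td_end++;

	printf("check III with fix:    %s\n",
	       sack <= td_end ? "pass" : "FAIL");
	return 0;
}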
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.154 commit be6992b68bd05829b4fb35a2c4546d0273527a9c
--------------------------------
[ Upstream commit c77761c8a59405cb7aa44188b30fffe13fbdd02d ]
Similar to 7980d2eabde8 ("ipvs: clear skb->tstamp in forwarding path"): the fq qdisc requires skb->tstamp to be cleared in the forwarding path.
Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths")
Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC")
Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.")
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/netfilter/nf_dup_netdev.c  | 1 +
 net/netfilter/nft_fwd_netdev.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
index f4a566e67213..98d117f3340c 100644
--- a/net/netfilter/nf_dup_netdev.c
+++ b/net/netfilter/nf_dup_netdev.c
@@ -21,6 +21,7 @@ static void nf_do_netdev_egress(struct sk_buff *skb, struct net_device *dev)
 	skb_push(skb, skb->mac_len);
 
 	skb->dev = dev;
+	skb->tstamp = 0;
 	dev_queue_xmit(skb);
 }
 
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
index 649edbe77a20..10a12e094929 100644
--- a/net/netfilter/nft_fwd_netdev.c
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -129,6 +129,7 @@ static void nft_fwd_neigh_eval(const struct nft_expr *expr,
 		return;
 
 	skb->dev = dev;
+	skb->tstamp = 0;
 	neigh_xmit(neigh_table, dev, addr, skb);
 out:
 	regs->verdict.code = verdict;
From: Pavel Machek pavel@denx.de
stable inclusion from linux-4.19.154 commit 2c37decd6c5a67489c09bfd9fa46d64c1370992a
--------------------------------
[ Upstream commit e356c49c6cf0db3f00e1558749170bd56e47652d ]
Fix resource leak in error handling.
Signed-off-by: Pavel Machek (CIP) pavel@denx.de
Acked-by: John Allen john.allen@amd.com
Signed-off-by: Herbert Xu herbert@gondor.apana.org.au
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 drivers/crypto/ccp/ccp-ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/ccp/ccp-ops.c b/drivers/crypto/ccp/ccp-ops.c
index 0024bf1b75ba..4dca1c7b9d47 100644
--- a/drivers/crypto/ccp/ccp-ops.c
+++ b/drivers/crypto/ccp/ccp-ops.c
@@ -1747,7 +1747,7 @@ static int ccp_run_sha_cmd(struct ccp_cmd_queue *cmd_q, struct ccp_cmd *cmd)
 			break;
 		default:
 			ret = -EINVAL;
-			goto e_ctx;
+			goto e_data;
 		}
 	} else {
 		/* Stash the context */
From: Cong Wang xiyou.wangcong@gmail.com
stable inclusion from linux-4.19.154 commit cd3ecf114cbe4b12112cd2c175dbd1e41c70758f
--------------------------------
[ Upstream commit fdafed459998e2be0e877e6189b24cb7a0183224 ]
GRE tunnel has its own header_ops, ipgre_header_ops, and sets it conditionally. When it is set, it assumes the outer IP header is already created before ipgre_xmit().
This is not true when we send packets through a raw packet socket, where L2 headers are supposed to be constructed by the user. The packet socket calls dev_validate_header() to validate the header, but the GRE tunnel does not set dev->hard_header_len, so that check can simply be bypassed and uninitialized memory can be passed down to ipgre_xmit(). The same applies to dev->needed_headroom.
dev->hard_header_len is supposed to be the length of the header created by dev->header_ops->create(), so it should be used whenever header_ops is set, and dev->needed_headroom should be used when it is not set.
Reported-and-tested-by: syzbot+4a2c52677a8a1aa283cb@syzkaller.appspotmail.com
Cc: William Tu u9012063@gmail.com
Acked-by: Willem de Bruijn willemb@google.com
Signed-off-by: Cong Wang xiyou.wangcong@gmail.com
Acked-by: Xie He xie.he.0141@gmail.com
Signed-off-by: Jakub Kicinski kuba@kernel.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/ipv4/ip_gre.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index ffcb5983107d..de6f89511a21 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -680,9 +680,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
 	}
 
 	if (dev->header_ops) {
-		/* Need space for new headers */
-		if (skb_cow_head(skb, dev->needed_headroom -
-				      (tunnel->hlen + sizeof(struct iphdr))))
+		if (skb_cow_head(skb, 0))
 			goto free_skb;
 
 		tnl_params = (const struct iphdr *)skb->data;
@@ -800,7 +798,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu)
 	len = tunnel->tun_hlen - len;
 	tunnel->hlen = tunnel->hlen + len;
 
-	dev->needed_headroom = dev->needed_headroom + len;
+	if (dev->header_ops)
+		dev->hard_header_len += len;
+	else
+		dev->needed_headroom += len;
+
 	if (set_mtu)
 		dev->mtu = max_t(int, dev->mtu - len, 68);
 
@@ -1003,6 +1005,7 @@ static void __gre_tunnel_init(struct net_device *dev)
 	tunnel->parms.iph.protocol = IPPROTO_GRE;
 
 	tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen;
+	dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph);
 
 	dev->features |= GRE_FEATURES;
 	dev->hw_features |= GRE_FEATURES;
@@ -1046,10 +1049,14 @@ static int ipgre_tunnel_init(struct net_device *dev)
 				return -EINVAL;
 			dev->flags = IFF_BROADCAST;
 			dev->header_ops = &ipgre_header_ops;
+			dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+			dev->needed_headroom = 0;
 		}
 #endif
 	} else if (!tunnel->collect_md) {
 		dev->header_ops = &ipgre_header_ops;
+		dev->hard_header_len = tunnel->hlen + sizeof(*iph);
+		dev->needed_headroom = 0;
 	}
 
 	return ip_tunnel_init(dev);
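A loose model of why hard_header_len matters to the packet-socket path; validate_header() below is a simplification of what dev_validate_header() checks, and the 24-byte length is only an example (a minimal GRE header plus an IPv4 header):

/* Build and run: cc -o hdr hdr.c && ./hdr */
#include <stdbool.h>
#include <stdio.h>

struct fake_dev {
	unsigned short hard_header_len;
};

/* Simplified: for a device with header_ops, the user-built L2 header
 * must cover the declared hard_header_len. With the field left at 0,
 * any length passes and uninitialized bytes reach the xmit path. */
static bool validate_header(const struct fake_dev *dev, unsigned int len)
{
	return len >= dev->hard_header_len;
}

int main(void)
{
	struct fake_dev gre_bug = { .hard_header_len = 0 };
	struct fake_dev gre_fix = { .hard_header_len = 24 };

	printf("empty header, buggy dev: %d\n", validate_header(&gre_bug, 0));
	printf("empty header, fixed dev: %d\n", validate_header(&gre_fix, 0));
	return 0;
}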
From: "Darrick J. Wong" darrick.wong@oracle.com
stable inclusion from linux-4.19.154 commit e004f8f381e383a32b4df0eec60cc9813864cd92
--------------------------------
[ Upstream commit 2a6ca4baed620303d414934aa1b7b0a8e7bab05f ]
There's an overflow bug in the realtime allocator. If the rt volume is large enough to handle a single allocation request that is larger than the maximum bmap extent length and the rt bitmap ends exactly on a bitmap block boundary, it's possible that the near allocator will try to check the freeness of a range that extends past the end of the bitmap. This fails with a corruption error and shuts down the fs.
Therefore, constrain maxlen so that the range scan cannot run off the end of the rt bitmap.
Signed-off-by: Darrick J. Wong darrick.wong@oracle.com
Reviewed-by: Christoph Hellwig hch@lst.de
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 fs/xfs/xfs_rtalloc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 484eb0adcefb..08da48b66235 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -245,6 +245,9 @@ xfs_rtallocate_extent_block(
 	     end = XFS_BLOCKTOBIT(mp, bbno + 1) - 1;
 	     i <= end;
 	     i++) {
+		/* Make sure we don't scan off the end of the rt volume. */
+		maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i;
+
 		/*
 		 * See if there's a free extent of maxlen starting at i.
 		 * If it's not so then next will contain the first non-free.
@@ -440,6 +443,14 @@ xfs_rtallocate_extent_near(
 	 */
 	if (bno >= mp->m_sb.sb_rextents)
 		bno = mp->m_sb.sb_rextents - 1;
+
+	/* Make sure we don't run off the end of the rt volume. */
+	maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno;
+	if (maxlen < minlen) {
+		*rtblock = NULLRTBLOCK;
+		return 0;
+	}
+
 	/*
 	 * Try the exact allocation first.
 	 */
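A worked example of the clamp being added, with made-up numbers:

/* Build and run: cc -o clamp clamp.c && ./clamp */
#include <stdio.h>
#include <stdint.h>

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t sb_rextents = 1024;	/* rt extents in the volume */
	uint64_t bno = 1000;		/* candidate start extent */
	uint64_t maxlen = 64;		/* caller wanted up to 64 extents */

	/* Unclamped, the scan would cover [1000, 1064), running 40
	 * extents past the end of the bitmap. The clamp shrinks the
	 * request to what actually fits. */
	maxlen = min_u64(sb_rextents, bno + maxlen) - bno;
	printf("clamped maxlen = %llu\n",	/* prints 24 */
	       (unsigned long long)maxlen);
	return 0;
}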
From: Peilin Ye yepeilin.cs@gmail.com
stable inclusion from linux-4.19.154 commit 796f0d39dcd58e91994f4f86525d09c9e03de293
--------------------------------
[ Upstream commit c5a8a8498eed1c164afc94f50a939c1a10abf8ad ]
do_ip_vs_set_ctl() references an uninitialized stack value when `len` is zero. Fix it.
Reported-by: syzbot+23b5f9e7caf61d9a3898@syzkaller.appspotmail.com
Link: https://syzkaller.appspot.com/bug?id=46ebfb92a8a812621a001ef04d90dfa459520fe...
Suggested-by: Julian Anastasov ja@ssi.bg
Signed-off-by: Peilin Ye yepeilin.cs@gmail.com
Acked-by: Julian Anastasov ja@ssi.bg
Reviewed-by: Simon Horman horms@verge.net.au
Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/netfilter/ipvs/ip_vs_ctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index c339b5e386b7..3ad1de081e3c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2393,6 +2393,10 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		/* Set timeout values for (tcp tcpfin udp) */
 		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
 		goto out_unlock;
+	} else if (!len) {
+		/* No more commands with len == 0 below */
+		ret = -EINVAL;
+		goto out_unlock;
 	}
 
 	usvc_compat = (struct ip_vs_service_user *)arg;
@@ -2469,9 +2473,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 		break;
 	case IP_VS_SO_SET_DELDEST:
 		ret = ip_vs_del_dest(svc, &udest);
-		break;
-	default:
-		ret = -EINVAL;
 	}
 
 out_unlock:
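A user-space sketch of the bug class, assuming the handler copies `len` bytes into an on-stack buffer before dispatching; all names here are invented for illustration:

/* Build and run: cc -o zlen zlen.c && ./zlen */
#include <stdio.h>
#include <string.h>

static int handle_cmd(const void *user, unsigned int len)
{
	unsigned char arg[128];		/* uninitialized on entry */

	if (len > sizeof(arg))
		return -22;
	memcpy(arg, user, len);		/* len == 0 copies nothing */
	if (len == 0)
		return -22;		/* the fix: -EINVAL before any use */
	return arg[0];			/* only reached once arg is filled */
}

int main(void)
{
	unsigned char buf[1] = { 7 };

	printf("%d\n", handle_cmd(buf, sizeof(buf)));	/* 7 */
	printf("%d\n", handle_cmd(buf, 0));		/* -22, not garbage */
	return 0;
}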
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.149 commit 1baf236084a366789eaf9515ee6027b515fb059d
--------------------------------
[ Upstream commit 8a37963c7ac9ecb7f86f8ebda020e3f8d6d7b8a0 ]
If an element is freed via RCU then recursion into BPF instrumentation functions is not a concern. The element is already detached from the map and the RCU callback does not hold any locks on which a kprobe, perf event or tracepoint attached BPF program could deadlock.
Signed-off-by: Thomas Gleixner tglx@linutronix.de
Signed-off-by: Alexei Starovoitov ast@kernel.org
Link: https://lore.kernel.org/bpf/20200224145643.259118710@linutronix.de
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 kernel/bpf/hashtab.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 1b28fb006763..3f3ed33bd2fd 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -667,15 +667,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
 	struct bpf_htab *htab = l->htab;
 
-	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
-	 * we're calling kfree, otherwise deadlock is possible if kprobes
-	 * are placed somewhere inside of slub
-	 */
-	preempt_disable();
-	__this_cpu_inc(bpf_prog_active);
 	htab_elem_free(htab, l);
-	__this_cpu_dec(bpf_prog_active);
-	preempt_enable();
 }
 
 static void htab_put_fd_value(struct bpf_htab *htab, struct htab_elem *l)
From: Daniel Borkmann daniel@iogearbox.net
stable inclusion from linux-4.19.149 commit 87f947e2bb5a11dad396a64505f30c647d5ed0ed
--------------------------------
[ Upstream commit e6a18d36118bea3bf497c9df4d9988b6df120689 ]
Bryce reported that he saw the following with:
  0: r6 = r1
  1: r1 = 12
  2: r0 = *(u16 *)skb[r1]
The xlated sequence was incorrectly clobbering r2 with the pointer value of r6 ...
  0: (bf) r6 = r1
  1: (b7) r1 = 12
  2: (bf) r1 = r6
  3: (bf) r2 = r1
  4: (85) call bpf_skb_load_helper_16_no_cache#7692160
... and hence the call to the load helper never succeeded, given the offset was too high. Fix it by emitting the move of r6 into r1 only after r2 has been set up.
Other than that, the insn has a calling convention similar to BPF helpers, that is, r0 - r5 are scratch regs, so nothing else is affected after the insn.
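For contrast, with the move of CTX into r1 emitted after r2 is set up (as the diff below does), the xlated sequence for the same program would plausibly become:

  0: (bf) r6 = r1
  1: (b7) r1 = 12
  2: (bf) r2 = r1      /* r2 = 12: the offset, read while r1 is intact */
  3: (bf) r1 = r6      /* only now clobber r1 with the ctx pointer */
  4: (85) call bpf_skb_load_helper_16_no_cache#7692160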
Fixes: e0cea7ce988c ("bpf: implement ld_abs/ld_ind in native bpf")
Reported-by: Bryce Kahle bryce.kahle@datadoghq.com
Signed-off-by: Daniel Borkmann daniel@iogearbox.net
Signed-off-by: Alexei Starovoitov ast@kernel.org
Link: https://lore.kernel.org/bpf/cace836e4d07bb63b1a53e49c5dfb238a040c298.1599512...
Signed-off-by: Sasha Levin sashal@kernel.org
Signed-off-by: Yang Yingliang yangyingliang@huawei.com
---
 net/core/filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index a6a23cf6ea61..6a9c4bf09d37 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5420,8 +5420,6 @@ static int bpf_gen_ld_abs(const struct bpf_insn *orig,
 	bool indirect = BPF_MODE(orig->code) == BPF_IND;
 	struct bpf_insn *insn = insn_buf;
 
-	/* We're guaranteed here that CTX is in R6. */
-	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
 	if (!indirect) {
 		*insn++ = BPF_MOV64_IMM(BPF_REG_2, orig->imm);
 	} else {
 		*insn++ = BPF_MOV64_REG(BPF_REG_2, orig->src_reg);
 		if (orig->imm)
 			*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, orig->imm);
 	}
+	/* We're guaranteed here that CTX is in R6. */
+	*insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_CTX);
 
 	switch (BPF_SIZE(orig->code)) {
 	case BPF_B: