From: Heiner Kallweit hkallweit1@gmail.com
stable inclusion from linux-4.19.194 commit 5fdb418b14e41b33880b949db64b18f1f448916b
--------------------------------
[ Upstream commit 45add3cc99feaaf57d4b6f01d52d532c16a1caee ]
UEFI spec 2.9, p.108, table 4-1 lists the scenario that both attributes are cleared with the description "No memory access protection is possible for Entry". So we can have valid entries where both attributes are cleared, so remove the check.
Signed-off-by: Heiner Kallweit hkallweit1@gmail.com Fixes: 10f0d2f577053 ("efi: Implement generic support for the Memory Attributes table") Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/firmware/efi/memattr.c | 5 ----- 1 file changed, 5 deletions(-)
diff --git a/drivers/firmware/efi/memattr.c b/drivers/firmware/efi/memattr.c index aac972b056d91..e0889922cc6d7 100644 --- a/drivers/firmware/efi/memattr.c +++ b/drivers/firmware/efi/memattr.c @@ -69,11 +69,6 @@ static bool entry_is_valid(const efi_memory_desc_t *in, efi_memory_desc_t *out) return false; }
- if (!(in->attribute & (EFI_MEMORY_RO | EFI_MEMORY_XP))) { - pr_warn("Entry attributes invalid: RO and XP bits both cleared\n"); - return false; - } - if (PAGE_SIZE > EFI_PAGE_SIZE && (!PAGE_ALIGNED(in->phys_addr) || !PAGE_ALIGNED(in->num_pages << EFI_PAGE_SHIFT))) {
From: Rasmus Villemoes linux@rasmusvillemoes.dk
stable inclusion from linux-4.19.194 commit dd47a33e11fdd6c9d31570500e6ecc369ba1f08e
--------------------------------
[ Upstream commit 942859d969de7f6f7f2659a79237a758b42782da ]
snprintf() should be given the full buffer size, not one less. And it guarantees nul-termination, so doing it manually afterwards is pointless.
It's even potentially harmful (though probably not in practice because CPER_REC_LEN is 256), due to the "return how much would have been written had the buffer been big enough" semantics. I.e., if the bank and/or device strings are long enough that the "DIMM location ..." output gets truncated, writing to msg[n] is a buffer overflow.
Signed-off-by: Rasmus Villemoes linux@rasmusvillemoes.dk Fixes: 3760cd20402d4 ("CPER: Adjust code flow of some functions") Signed-off-by: Ard Biesheuvel ardb@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/firmware/efi/cper.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/firmware/efi/cper.c b/drivers/firmware/efi/cper.c index 116989cf3d457..97da083afd324 100644 --- a/drivers/firmware/efi/cper.c +++ b/drivers/firmware/efi/cper.c @@ -275,8 +275,7 @@ static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) if (!msg || !(mem->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)) return 0;
- n = 0; - len = CPER_REC_LEN - 1; + len = CPER_REC_LEN; dmi_memdev_name(mem->mem_dev_handle, &bank, &device); if (bank && device) n = snprintf(msg, len, "DIMM location: %s %s ", bank, device); @@ -285,7 +284,6 @@ static int cper_dimm_err_location(struct cper_mem_err_compact *mem, char *msg) "DIMM location: not present. DMI handle: 0x%.4x ", mem->mem_dev_handle);
- msg[n] = '\0'; return n; }
From: Zhen Lei thunder.leizhen@huawei.com
stable inclusion from linux-4.19.194 commit c953aee0d8a162d249dedaa47111e95dde0461cd
--------------------------------
[ Upstream commit d1ce2c79156d3baf0830990ab06d296477b93c26 ]
The error code returned from vfio_ext_cap_len() is stored in 'len', not in 'ret'.
Fixes: 89e1f7d4c66d ("vfio: Add PCI device driver") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Zhen Lei thunder.leizhen@huawei.com Reviewed-by: Max Gurtovoy mgurtovoy@nvidia.com Message-Id: 20210515020458.6771-1-thunder.leizhen@huawei.com Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/pci/vfio_pci_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 747c2e4835255..91085ffbc58b2 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1574,7 +1574,7 @@ static int vfio_ecap_init(struct vfio_pci_device *vdev) if (len == 0xFF) { len = vfio_ext_cap_len(vdev, ecap, epos); if (len < 0) - return ret; + return len; } }
From: Randy Dunlap rdunlap@infradead.org
stable inclusion from linux-4.19.194 commit 2b2ca3ee36e4e40d98c01b41a5819243197d4609
--------------------------------
[ Upstream commit 2a55ca37350171d9b43d561528f23d4130097255 ]
zap_vma_ptes() is only available when CONFIG_MMU is set/enabled. Without CONFIG_MMU, vfio_pci.o has build errors, so make VFIO_PCI depend on MMU.
riscv64-linux-ld: drivers/vfio/pci/vfio_pci.o: in function `vfio_pci_mmap_open': vfio_pci.c:(.text+0x1ec): undefined reference to `zap_vma_ptes' riscv64-linux-ld: drivers/vfio/pci/vfio_pci.o: in function `.L0 ': vfio_pci.c:(.text+0x165c): undefined reference to `zap_vma_ptes'
Fixes: 11c4cd07ba11 ("vfio-pci: Fault mmaps to enable vma tracking") Signed-off-by: Randy Dunlap rdunlap@infradead.org Reported-by: kernel test robot lkp@intel.com Cc: Alex Williamson alex.williamson@redhat.com Cc: Cornelia Huck cohuck@redhat.com Cc: kvm@vger.kernel.org Cc: Jason Gunthorpe jgg@nvidia.com Cc: Eric Auger eric.auger@redhat.com Message-Id: 20210515190856.2130-1-rdunlap@infradead.org Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/pci/Kconfig | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 42dc1d3d71cf0..fcbfd0aacebcd 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -1,6 +1,7 @@ config VFIO_PCI tristate "VFIO support for PCI devices" depends on VFIO && PCI && EVENTFD + depends on MMU select VFIO_VIRQFD select IRQ_BYPASS_MANAGER help
From: Max Gurtovoy mgurtovoy@nvidia.com
stable inclusion from linux-4.19.194 commit 2bd07ebcb949b17a431aacd0e99fa4558c4a5600
--------------------------------
[ Upstream commit dc51ff91cf2d1e9a2d941da483602f71d4a51472 ]
The ->parent_module is the one that use in try_module_get. It should also be the one the we use in module_put during vfio_platform_open().
Fixes: 32a2d71c4e80 ("vfio: platform: introduce vfio-platform-base module") Signed-off-by: Max Gurtovoy mgurtovoy@nvidia.com Message-Id: 20210518192133.59195-1-mgurtovoy@nvidia.com Signed-off-by: Alex Williamson alex.williamson@redhat.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/platform/vfio_platform_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index c0cd824be2b76..bf06825d977ee 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -296,7 +296,7 @@ static int vfio_platform_open(void *device_data) vfio_platform_regions_cleanup(vdev); err_reg: mutex_unlock(&driver_lock); - module_put(THIS_MODULE); + module_put(vdev->parent_module); return ret; }
From: Julian Anastasov ja@ssi.bg
stable inclusion from linux-4.19.194 commit 6895ac910b73f7f886bb292c0a87d6e4d6eb6801
--------------------------------
[ Upstream commit 56e4ee82e850026d71223262c07df7d6af3bd872 ]
syzbot reported memory leak [1] when adding service with HASHED flag. We should ignore this flag both from sockopt and netlink provided data, otherwise the service is not hashed and not visible while releasing resources.
[1] BUG: memory leak unreferenced object 0xffff888115227800 (size 512): comm "syz-executor263", pid 8658, jiffies 4294951882 (age 12.560s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<ffffffff83977188>] kmalloc include/linux/slab.h:556 [inline] [<ffffffff83977188>] kzalloc include/linux/slab.h:686 [inline] [<ffffffff83977188>] ip_vs_add_service+0x598/0x7c0 net/netfilter/ipvs/ip_vs_ctl.c:1343 [<ffffffff8397d770>] do_ip_vs_set_ctl+0x810/0xa40 net/netfilter/ipvs/ip_vs_ctl.c:2570 [<ffffffff838449a8>] nf_setsockopt+0x68/0xa0 net/netfilter/nf_sockopt.c:101 [<ffffffff839ae4e9>] ip_setsockopt+0x259/0x1ff0 net/ipv4/ip_sockglue.c:1435 [<ffffffff839fa03c>] raw_setsockopt+0x18c/0x1b0 net/ipv4/raw.c:857 [<ffffffff83691f20>] __sys_setsockopt+0x1b0/0x360 net/socket.c:2117 [<ffffffff836920f2>] __do_sys_setsockopt net/socket.c:2128 [inline] [<ffffffff836920f2>] __se_sys_setsockopt net/socket.c:2125 [inline] [<ffffffff836920f2>] __x64_sys_setsockopt+0x22/0x30 net/socket.c:2125 [<ffffffff84350efa>] do_syscall_64+0x3a/0xb0 arch/x86/entry/common.c:47 [<ffffffff84400068>] entry_SYSCALL_64_after_hwframe+0x44/0xae
Reported-and-tested-by: syzbot+e562383183e4b1766930@syzkaller.appspotmail.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Julian Anastasov ja@ssi.bg Reviewed-by: Simon Horman horms@verge.net.au Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/ipvs/ip_vs_ctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 3ad1de081e3c7..6208fa09fe713 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1269,7 +1269,7 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_addr_copy(svc->af, &svc->addr, &u->addr); svc->port = u->port; svc->fwmark = u->fwmark; - svc->flags = u->flags; + svc->flags = u->flags & ~IP_VS_SVC_F_HASHED; svc->timeout = u->timeout * HZ; svc->netmask = u->netmask; svc->ipvs = ipvs;
From: Pablo Neira Ayuso pablo@netfilter.org
stable inclusion from linux-4.19.194 commit 8aed10cd9497933ebe3fc0afe8d8ddc1af8b0d48
--------------------------------
[ Upstream commit 8971ee8b087750a23f3cd4dc55bff2d0303fd267 ]
The private helper data size cannot be updated. However, updates that contain NFCTH_PRIV_DATA_LEN might bogusly hit EBUSY even if the size is the same.
Fixes: 12f7a505331e ("netfilter: add user-space connection tracking helper infrastructure") Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nfnetlink_cthelper.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index ddcb1b6074745..c8b0f1122c44d 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -381,10 +381,14 @@ static int nfnl_cthelper_update(const struct nlattr * const tb[], struct nf_conntrack_helper *helper) { + u32 size; int ret;
- if (tb[NFCTH_PRIV_DATA_LEN]) - return -EBUSY; + if (tb[NFCTH_PRIV_DATA_LEN]) { + size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + if (size != helper->data_len) + return -EBUSY; + }
if (tb[NFCTH_POLICY]) { ret = nfnl_cthelper_update_policy(helper, tb[NFCTH_POLICY]);
From: Mark Rutland mark.rutland@arm.com
stable inclusion from linux-4.19.194 commit d106f05432e60f9f62d456ef017687f5c73cb414
--------------------------------
commit 0711f0d7050b9e07c44bc159bbc64ac0a1022c7f upstream.
During boot, kernel_init_freeable() initializes `cad_pid` to the init task's struct pid. Later on, we may change `cad_pid` via a sysctl, and when this happens proc_do_cad_pid() will increment the refcount on the new pid via get_pid(), and will decrement the refcount on the old pid via put_pid(). As we never called get_pid() when we initialized `cad_pid`, we decrement a reference we never incremented, can therefore free the init task's struct pid early. As there can be dangling references to the struct pid, we can later encounter a use-after-free (e.g. when delivering signals).
This was spotted when fuzzing v5.13-rc3 with Syzkaller, but seems to have been around since the conversion of `cad_pid` to struct pid in commit 9ec52099e4b8 ("[PATCH] replace cad_pid by a struct pid") from the pre-KASAN stone age of v2.6.19.
Fix this by getting a reference to the init task's struct pid when we assign it to `cad_pid`.
Full KASAN splat below.
================================================================== BUG: KASAN: use-after-free in ns_of_pid include/linux/pid.h:153 [inline] BUG: KASAN: use-after-free in task_active_pid_ns+0xc0/0xc8 kernel/pid.c:509 Read of size 4 at addr ffff23794dda0004 by task syz-executor.0/273
CPU: 1 PID: 273 Comm: syz-executor.0 Not tainted 5.12.0-00001-g9aef892b2d15 #1 Hardware name: linux,dummy-virt (DT) Call trace: ns_of_pid include/linux/pid.h:153 [inline] task_active_pid_ns+0xc0/0xc8 kernel/pid.c:509 do_notify_parent+0x308/0xe60 kernel/signal.c:1950 exit_notify kernel/exit.c:682 [inline] do_exit+0x2334/0x2bd0 kernel/exit.c:845 do_group_exit+0x108/0x2c8 kernel/exit.c:922 get_signal+0x4e4/0x2a88 kernel/signal.c:2781 do_signal arch/arm64/kernel/signal.c:882 [inline] do_notify_resume+0x300/0x970 arch/arm64/kernel/signal.c:936 work_pending+0xc/0x2dc
Allocated by task 0: slab_post_alloc_hook+0x50/0x5c0 mm/slab.h:516 slab_alloc_node mm/slub.c:2907 [inline] slab_alloc mm/slub.c:2915 [inline] kmem_cache_alloc+0x1f4/0x4c0 mm/slub.c:2920 alloc_pid+0xdc/0xc00 kernel/pid.c:180 copy_process+0x2794/0x5e18 kernel/fork.c:2129 kernel_clone+0x194/0x13c8 kernel/fork.c:2500 kernel_thread+0xd4/0x110 kernel/fork.c:2552 rest_init+0x44/0x4a0 init/main.c:687 arch_call_rest_init+0x1c/0x28 start_kernel+0x520/0x554 init/main.c:1064 0x0
Freed by task 270: slab_free_hook mm/slub.c:1562 [inline] slab_free_freelist_hook+0x98/0x260 mm/slub.c:1600 slab_free mm/slub.c:3161 [inline] kmem_cache_free+0x224/0x8e0 mm/slub.c:3177 put_pid.part.4+0xe0/0x1a8 kernel/pid.c:114 put_pid+0x30/0x48 kernel/pid.c:109 proc_do_cad_pid+0x190/0x1b0 kernel/sysctl.c:1401 proc_sys_call_handler+0x338/0x4b0 fs/proc/proc_sysctl.c:591 proc_sys_write+0x34/0x48 fs/proc/proc_sysctl.c:617 call_write_iter include/linux/fs.h:1977 [inline] new_sync_write+0x3ac/0x510 fs/read_write.c:518 vfs_write fs/read_write.c:605 [inline] vfs_write+0x9c4/0x1018 fs/read_write.c:585 ksys_write+0x124/0x240 fs/read_write.c:658 __do_sys_write fs/read_write.c:670 [inline] __se_sys_write fs/read_write.c:667 [inline] __arm64_sys_write+0x78/0xb0 fs/read_write.c:667 __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] invoke_syscall arch/arm64/kernel/syscall.c:49 [inline] el0_svc_common.constprop.1+0x16c/0x388 arch/arm64/kernel/syscall.c:129 do_el0_svc+0xf8/0x150 arch/arm64/kernel/syscall.c:168 el0_svc+0x28/0x38 arch/arm64/kernel/entry-common.c:416 el0_sync_handler+0x134/0x180 arch/arm64/kernel/entry-common.c:432 el0_sync+0x154/0x180 arch/arm64/kernel/entry.S:701
The buggy address belongs to the object at ffff23794dda0000 which belongs to the cache pid of size 224 The buggy address is located 4 bytes inside of 224-byte region [ffff23794dda0000, ffff23794dda00e0) The buggy address belongs to the page: page:(____ptrval____) refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x4dda0 head:(____ptrval____) order:1 compound_mapcount:0 flags: 0x3fffc0000010200(slab|head) raw: 03fffc0000010200 dead000000000100 dead000000000122 ffff23794d40d080 raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected
Memory state around the buggy address: ffff23794dd9ff00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff23794dd9ff80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff23794dda0000: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
^ ffff23794dda0080: fb fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc ffff23794dda0100: fc fc fc fc fc fc fc fc 00 00 00 00 00 00 00 00 ==================================================================
Link: https://lkml.kernel.org/r/20210524172230.38715-1-mark.rutland@arm.com Fixes: 9ec52099e4b8678a ("[PATCH] replace cad_pid by a struct pid") Signed-off-by: Mark Rutland mark.rutland@arm.com Acked-by: Christian Brauner christian.brauner@ubuntu.com Cc: Cedric Le Goater clg@fr.ibm.com Cc: Christian Brauner christian@brauner.io Cc: Eric W. Biederman ebiederm@xmission.com Cc: Kees Cook <keescook@chromium.org Cc: Martin Schwidefsky schwidefsky@de.ibm.com Cc: Paul Mackerras paulus@samba.org Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/init/main.c b/init/main.c index bea0541feba92..664f8a3d6c073 100644 --- a/init/main.c +++ b/init/main.c @@ -1123,7 +1123,7 @@ static noinline void __init kernel_init_freeable(void) */ set_mems_allowed(node_states[N_MEMORY]);
- cad_pid = task_pid(current); + cad_pid = get_pid(task_pid(current));
smp_prepare_cpus(setup_max_cpus);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.194 commit 7e25cb1b22f81239ae3332e14a1d0cff7014bccd
--------------------------------
commit 7d65f9e80646c595e8c853640a9d0768a33e204c upstream.
PIC interrupts do not support affinity setting and they can end up on any online CPU. Therefore, it's required to mark the associated vectors as system-wide reserved. Otherwise, the corresponding irq descriptors are copied to the secondary CPUs but the vectors are not marked as assigned or reserved. This works correctly for the IO/APIC case.
When the IO/APIC is disabled via config, kernel command line or lack of enumeration then all legacy interrupts are routed through the PIC, but nothing marks them as system-wide reserved vectors.
As a consequence, a subsequent allocation on a secondary CPU can result in allocating one of these vectors, which triggers the BUG() in apic_update_vector() because the interrupt descriptor slot is not empty.
Imran tried to work around that by marking those interrupts as allocated when a CPU comes online. But that's wrong in case that the IO/APIC is available and one of the legacy interrupts, e.g. IRQ0, has been switched to PIC mode because then marking them as allocated will fail as they are already marked as system vectors.
Stay consistent and update the legacy vectors after attempting IO/APIC initialization and mark them as system vectors in case that no IO/APIC is available.
Fixes: 69cde0004a4b ("x86/vector: Use matrix allocator for vector assignment") Reported-by: Imran Khan imran.f.khan@oracle.com Signed-off-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Borislav Petkov bp@suse.de Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210519233928.2157496-1-imran.f.khan@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/include/asm/apic.h | 1 + arch/x86/kernel/apic/apic.c | 1 + arch/x86/kernel/apic/vector.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+)
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index b7bd996a831ce..75bf0faad30d1 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -173,6 +173,7 @@ static inline int apic_is_clustered_box(void) extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask); extern void lapic_assign_system_vectors(void); extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace); +extern void lapic_update_legacy_vectors(void); extern void lapic_online(void); extern void lapic_offline(void); extern bool apic_needs_pit(void); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 57fdc2fbfac59..cfe893ccd6ab3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -2551,6 +2551,7 @@ void __init apic_bsp_setup(bool upmode) end_local_APIC_setup(); irq_remap_enable_fault_handling(); setup_IO_APIC(); + lapic_update_legacy_vectors(); }
#ifdef CONFIG_UP_LATE_INIT diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 3d6f747ea9bc7..e7fd029e23fbf 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -684,6 +684,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace) irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace); }
+void __init lapic_update_legacy_vectors(void) +{ + unsigned int i; + + if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0) + return; + + /* + * If the IO/APIC is disabled via config, kernel command line or + * lack of enumeration then all legacy interrupts are routed + * through the PIC. Make sure that they are marked as legacy + * vectors. PIC_CASCADE_IRQ has already been marked in + * lapic_assign_system_vectors(). + */ + for (i = 0; i < nr_legacy_irqs(); i++) { + if (i != PIC_CASCADE_IR) + lapic_assign_legacy_vector(i, true); + } +} + void __init lapic_assign_system_vectors(void) { unsigned int i, vector = 0;
From: Mina Almasry almasrymina@google.com
stable inclusion from linux-4.19.194 commit 7de60c2d5a2a66ef2c5d76952a5a3a9a4ea4d436
--------------------------------
[ Upstream commit d84cf06e3dd8c5c5b547b5d8931015fc536678e5 ]
The userfaultfd hugetlb tests cause a resv_huge_pages underflow. This happens when hugetlb_mcopy_atomic_pte() is called with !is_continue on an index for which we already have a page in the cache. When this happens, we allocate a second page, double consuming the reservation, and then fail to insert the page into the cache and return -EEXIST.
To fix this, we first check if there is a page in the cache which already consumed the reservation, and return -EEXIST immediately if so.
There is still a rare condition where we fail to copy the page contents AND race with a call for hugetlb_no_page() for this index and again we will underflow resv_huge_pages. That is fixed in a more complicated patch not targeted for -stable.
Test:
Hacked the code locally such that resv_huge_pages underflows produce a warning, then:
./tools/testing/selftests/vm/userfaultfd hugetlb_shared 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success ./tools/testing/selftests/vm/userfaultfd hugetlb 10 2 /tmp/kokonut_test/huge/userfaultfd_test && echo test success
Both tests succeed and produce no warnings. After the test runs number of free/resv hugepages is correct.
[mike.kravetz@oracle.com: changelog fixes]
Link: https://lkml.kernel.org/r/20210528004649.85298-1-almasrymina@google.com Fixes: 8fb5debc5fcd ("userfaultfd: hugetlbfs: add hugetlb_mcopy_atomic_pte for userfaultfd support") Signed-off-by: Mina Almasry almasrymina@google.com Reviewed-by: Mike Kravetz mike.kravetz@oracle.com Cc: Axel Rasmussen axelrasmussen@google.com Cc: Peter Xu peterx@redhat.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/hugetlb.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f542a32e82f55..6e84c46180ecd 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4314,10 +4314,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, struct page *page;
if (!*pagep) { - ret = -ENOMEM; + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } + page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) + if (IS_ERR(page)) { + ret = -ENOMEM; goto out; + }
ret = copy_huge_page_from_user(page, (const void __user *) src_addr,
From: Erik Schmauss erik.schmauss@intel.com
stable inclusion from linux-4.19.194 commit eeb48f5ca3ab66d793a911917712929875385059
--------------------------------
commit d737f333b211361b6e239fc753b84c3be2634aaa upstream.
It was discovered that AML tables were loaded before or after the ECDT depending on acpi_gbl_execute_tables_as_methods. According to the ACPI spec, the ECDT should be loaded before the namespace is populated by loading AML tables (DSDT and SSDT). Since the ECDT should be loaded early in the boot process, this change moves the ECDT probing to acpi_early_init.
Signed-off-by: Erik Schmauss erik.schmauss@intel.com Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Cc: Laurențiu Păncescu lpancescu@gmail.com Cc: Salvatore Bonaccorso carnil@debian.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/acpi/bus.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-)
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 92a1468610867..1063b762dda2b 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -1054,15 +1054,17 @@ void __init acpi_early_init(void) goto error0; }
- if (!acpi_gbl_execute_tables_as_methods && - acpi_gbl_group_module_level_code) { - status = acpi_load_tables(); - if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to load the System Description Tables\n"); - goto error0; - } - } + /* + * ACPI 2.0 requires the EC driver to be loaded and work before + * the EC device is found in the namespace (i.e. before + * acpi_load_tables() is called). + * + * This is accomplished by looking for the ECDT table, and getting + * the EC parameters out of that. + * + * Ignore the result. Not having an ECDT is not fatal. + */ + status = acpi_ec_ecdt_probe();
#ifdef CONFIG_X86 if (!acpi_ioapic) { @@ -1133,25 +1135,11 @@ static int __init acpi_bus_init(void)
acpi_os_initialize1();
- /* - * ACPI 2.0 requires the EC driver to be loaded and work before - * the EC device is found in the namespace (i.e. before - * acpi_load_tables() is called). - * - * This is accomplished by looking for the ECDT table, and getting - * the EC parameters out of that. - */ - status = acpi_ec_ecdt_probe(); - /* Ignore result. Not having an ECDT is not fatal. */ - - if (acpi_gbl_execute_tables_as_methods || - !acpi_gbl_group_module_level_code) { - status = acpi_load_tables(); - if (ACPI_FAILURE(status)) { - printk(KERN_ERR PREFIX - "Unable to load the System Description Tables\n"); - goto error1; - } + status = acpi_load_tables(); + if (ACPI_FAILURE(status)) { + printk(KERN_ERR PREFIX + "Unable to load the System Description Tables\n"); + goto error1; }
status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE);
From: "Rafael J. Wysocki" rafael.j.wysocki@intel.com
stable inclusion from linux-4.19.194 commit f47c2c06982efbeb8f3aaa418fca1e3cf3b9792c
--------------------------------
commit b1c0330823fe842dbb34641f1410f0afa51c29d3 upstream.
Some systems have had functional issues since commit 5a8361f7ecce (ACPICA: Integrate package handling with module-level code) that, among other things, changed the initial values of the acpi_gbl_group_module_level_code and acpi_gbl_parse_table_as_term_list global flags in ACPICA which implicitly caused acpi_ec_ecdt_probe() to be called before acpi_load_tables() on the vast majority of platforms.
Namely, before commit 5a8361f7ecce, acpi_load_tables() was called from acpi_early_init() if acpi_gbl_parse_table_as_term_list was FALSE and acpi_gbl_group_module_level_code was TRUE, which almost always was the case as FALSE and TRUE were their initial values, respectively. The acpi_gbl_parse_table_as_term_list value would be changed to TRUE for a couple of platforms in acpi_quirks_dmi_table[], but it remained FALSE in the vast majority of cases.
After commit 5a8361f7ecce, the initial values of the two flags have been reversed, so in effect acpi_load_tables() has not been called from acpi_early_init() any more. That, in turn, affects acpi_ec_ecdt_probe() which is invoked before acpi_load_tables() now and it is not possible to evaluate the _REG method for the EC address space handler installed by it. That effectively causes the EC address space to be inaccessible to AML on platforms with an ECDT matching the EC device definition in the DSDT and functional problems ensue in there.
Because the default behavior before commit 5a8361f7ecce was to call acpi_ec_ecdt_probe() after acpi_load_tables(), it should be safe to do that again. Moreover, the EC address space handler installed by acpi_ec_ecdt_probe() is only needed for AML to be able to access the EC address space and the only AML that can run during acpi_load_tables() is module-level code which only is allowed to access address spaces with default handlers (memory, I/O and PCI config space).
For this reason, move the acpi_ec_ecdt_probe() invocation back to acpi_bus_init(), from where it was taken away by commit d737f333b211 (ACPI: probe ECDT before loading AML tables regardless of module-level code flag), and put it after the invocation of acpi_load_tables() to restore the original code ordering from before commit 5a8361f7ecce.
Fixes: 5a8361f7ecce ("ACPICA: Integrate package handling with module-level code") Link: https://bugzilla.kernel.org/show_bug.cgi?id=199981 Reported-by: step-ali sunmooon15@gmail.com Reported-by: Charles Stanhope charles.stanhope@gmail.com Tested-by: Charles Stanhope charles.stanhope@gmail.com Reported-by: Paulo Nascimento paulo.ulusu@googlemail.com Reported-by: David Purton dcpurton@marshwiggle.net Reported-by: Adam Harvey adam@adamharvey.name Reported-by: Zhang Rui rui.zhang@intel.com Tested-by: Zhang Rui rui.zhang@intel.com Tested-by: Jean-Marc Lenoir archlinux@jihemel.com Signed-off-by: Rafael J. Wysocki rafael.j.wysocki@intel.com Cc: Laurențiu Păncescu lpancescu@gmail.com Cc: Salvatore Bonaccorso carnil@debian.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/acpi/bus.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 1063b762dda2b..d60e57d14c859 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -1054,18 +1054,6 @@ void __init acpi_early_init(void) goto error0; }
- /* - * ACPI 2.0 requires the EC driver to be loaded and work before - * the EC device is found in the namespace (i.e. before - * acpi_load_tables() is called). - * - * This is accomplished by looking for the ECDT table, and getting - * the EC parameters out of that. - * - * Ignore the result. Not having an ECDT is not fatal. - */ - status = acpi_ec_ecdt_probe(); - #ifdef CONFIG_X86 if (!acpi_ioapic) { /* compatible (0) means level (3) */ @@ -1142,6 +1130,18 @@ static int __init acpi_bus_init(void) goto error1; }
+ /* + * ACPI 2.0 requires the EC driver to be loaded and work before the EC + * device is found in the namespace. + * + * This is accomplished by looking for the ECDT table and getting the EC + * parameters out of that. + * + * Do that before calling acpi_initialize_objects() which may trigger EC + * address space accesses. + */ + acpi_ec_ecdt_probe(); + status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { printk(KERN_ERR PREFIX
From: Kees Cook keescook@chromium.org
stable inclusion from linux-4.19.195 commit 1f41b8f9577907fba56684231c7be89c8243d960
--------------------------------
commit 591a22c14d3f45cc38bd1931c593c221df2f1881 upstream.
Commit bfb819ea20ce ("proc: Check /proc/$pid/attr/ writes against file opener") tried to make sure that there could not be a confusion between the opener of a /proc/$pid/attr/ file and the writer. It used struct cred to make sure the privileges didn't change. However, there were existing cases where a more privileged thread was passing the opened fd to a differently privileged thread (during container setup). Instead, use mm_struct to track whether the opener and writer are still the same process. (This is what several other proc files already do, though for different reasons.)
Reported-by: Christian Brauner christian.brauner@ubuntu.com Reported-by: Andrea Righi andrea.righi@canonical.com Tested-by: Andrea Righi andrea.righi@canonical.com Fixes: bfb819ea20ce ("proc: Check /proc/$pid/attr/ writes against file opener") Cc: stable@vger.kernel.org Signed-off-by: Kees Cook keescook@chromium.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/base.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index e43c1c9480cba..e46c10acb784b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2535,6 +2535,11 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, }
#ifdef CONFIG_SECURITY +static int proc_pid_attr_open(struct inode *inode, struct file *file) +{ + return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); +} + static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -2565,7 +2570,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, int rv;
/* A task may only write when it was the opener. */ - if (file->f_cred != current_real_cred()) + if (file->private_data != current->mm) return -EPERM;
rcu_read_lock(); @@ -2613,9 +2618,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, }
static const struct file_operations proc_pid_attr_operations = { + .open = proc_pid_attr_open, .read = proc_pid_attr_read, .write = proc_pid_attr_write, .llseek = generic_file_llseek, + .release = mem_release, };
static const struct pid_entry attr_dir_stuff[] = {
From: Johannes Berg johannes.berg@intel.com
stable inclusion from linux-4.19.195 commit f583748c2a4a1dc731812ae2c12cadca5c1a88b5
--------------------------------
[ Upstream commit 35d96e631860226d5dc4de0fad0a415362ec2457 ]
If bond_kobj_init() or later kzalloc() in bond_alloc_slave() fail, then we call kobject_put() on the slave->kobj. This in turn calls the release function slave_kobj_release() which will always try to cancel_delayed_work_sync(&slave->notify_work), which shouldn't be done on an uninitialized work struct.
Always initialize the work struct earlier to avoid problems here.
Syzbot bisected this down to a completely pointless commit, some fault injection may have been at work here that caused the alloc failure in the first place, which may interact badly with bisect.
Reported-by: syzbot+bfda097c12a00c8cae67@syzkaller.appspotmail.com Signed-off-by: Johannes Berg johannes.berg@intel.com Acked-by: Jay Vosburgh jay.vosburgh@canonical.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b7db57e6c96ae..dd853d85afdd3 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1315,6 +1315,7 @@ static struct slave *bond_alloc_slave(struct bonding *bond,
slave->bond = bond; slave->dev = slave_dev; + INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);
if (bond_kobj_init(slave)) return NULL; @@ -1327,7 +1328,6 @@ static struct slave *bond_alloc_slave(struct bonding *bond, return NULL; } } - INIT_DELAYED_WORK(&slave->notify_work, bond_netdev_notify_work);
return slave; }
From: Johannes Berg johannes.berg@intel.com
stable inclusion from linux-4.19.195 commit 59fba11d649854134c75ad88c8adafa9304ac419
--------------------------------
[ Upstream commit 1d482e666b8e74c7555dbdfbfb77205eeed3ff2d ]
Syzbot reports that in mac80211 we have a potential deadlock between our "local->stop_queue_reasons_lock" (spinlock) and netlink's nl_table_lock (rwlock). This is because there's at least one situation in which we might try to send a netlink message with this spinlock held while it is also possible to take the spinlock from a hardirq context, resulting in the following deadlock scenario reported by lockdep:
CPU0 CPU1 ---- ---- lock(nl_table_lock); local_irq_disable(); lock(&local->queue_stop_reason_lock); lock(nl_table_lock); <Interrupt> lock(&local->queue_stop_reason_lock);
This seems valid, we can take the queue_stop_reason_lock in any kind of context ("CPU0"), and call ieee80211_report_ack_skb() with the spinlock held and IRQs disabled ("CPU1") in some code path (ieee80211_do_stop() via ieee80211_free_txskb()).
Short of disallowing netlink use in scenarios like these (which would be rather complex in mac80211's case due to the deep callchain), it seems the only fix for this is to disable IRQs while nl_table_lock is held to avoid hitting this scenario, this disallows the "CPU0" portion of the reported deadlock.
Note that the writer side (netlink_table_grab()) already disables IRQs for this lock.
Unfortunately though, this seems like a huge hammer, and maybe the whole netlink table locking should be reworked.
Reported-by: syzbot+69ff9dff50dcfe14ddd4@syzkaller.appspotmail.com Signed-off-by: Johannes Berg johannes.berg@intel.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netlink/af_netlink.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index ad354a88364a9..7c64ae48c0194 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -461,11 +461,13 @@ void netlink_table_ungrab(void) static inline void netlink_lock_table(void) { + unsigned long flags; + /* read_lock() synchronizes us to netlink_table_grab */
- read_lock(&nl_table_lock); + read_lock_irqsave(&nl_table_lock, flags); atomic_inc(&nl_table_users); - read_unlock(&nl_table_lock); + read_unlock_irqrestore(&nl_table_lock, flags); }
static inline void
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.195 commit acb503deb002ca30e1544dade6059d9cd5d746bd
--------------------------------
[ Upstream commit 1dde47a66d4fb181830d6fa000e5ea86907b639e ]
We spotted a bug recently during a review where a driver was unregistering a bus that wasn't registered, which would trigger this BUG_ON(). Let's handle that situation more gracefully, and just print a warning and return.
Reported-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Reviewed-by: Russell King (Oracle) rmk+kernel@armlinux.org.uk Reviewed-by: Andrew Lunn andrew@lunn.ch Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/phy/mdio_bus.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c index c5588d4508f97..5f25da904eba8 100644 --- a/drivers/net/phy/mdio_bus.c +++ b/drivers/net/phy/mdio_bus.c @@ -445,7 +445,8 @@ void mdiobus_unregister(struct mii_bus *bus) struct mdio_device *mdiodev; int i;
- BUG_ON(bus->state != MDIOBUS_REGISTERED); + if (WARN_ON_ONCE(bus->state != MDIOBUS_REGISTERED)) + return; bus->state = MDIOBUS_UNREGISTERED;
for (i = 0; i < PHY_MAX_ADDR; i++) {
From: Shakeel Butt shakeelb@google.com
stable inclusion from linux-4.19.195 commit ad223fe247dfc23e331f86351bfd84a16f4d955f
--------------------------------
[ Upstream commit 45e1ba40837ac2f6f4d4716bddb8d44bd7e4a251 ]
This patch effectively reverts the commit a3e72739b7a7 ("cgroup: fix too early usage of static_branch_disable()"). The commit 6041186a3258 ("init: initialize jump labels before command line option parsing") has moved the jump_label_init() before parse_args() which has made the commit a3e72739b7a7 unnecessary. On the other hand there are consequences of disabling the controllers later as there are subsystems doing the controller checks for different decisions. One such incident is reported [1] regarding the memory controller and its impact on memory reclaim code.
[1] https://lore.kernel.org/linux-mm/921e53f3-4b13-aab8-4a9e-e83ff15371e4@nec.co...
Signed-off-by: Shakeel Butt shakeelb@google.com Reported-by: NOMURA JUNICHI(野村 淳一) junichi.nomura@nec.com Signed-off-by: Tejun Heo tj@kernel.org Tested-by: Jun'ichi Nomura junichi.nomura@nec.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/cgroup/cgroup.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 79fa4e90499a3..682c5e231bddc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5351,8 +5351,6 @@ int __init cgroup_init_early(void) return 0; }
-static u16 cgroup_disable_mask __initdata; - /** * cgroup_init - cgroup initialization * @@ -5412,12 +5410,8 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (cgroup_disable_mask & (1 << ssid)) { - static_branch_disable(cgroup_subsys_enabled_key[ssid]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); + if (!cgroup_ssid_enabled(ssid)) continue; - }
if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", @@ -5776,7 +5770,10 @@ static int __init cgroup_disable(char *str) if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; - cgroup_disable_mask |= 1 << i; + + static_branch_disable(cgroup_subsys_enabled_key[i]); + pr_info("Disabling %s control group subsystem\n", + ss->name); } } return 1;
From: Sergey Senozhatsky senozhatsky@chromium.org
stable inclusion from linux-4.19.195 commit 50316635e644a0b9e62d3263fb4e8be2104605b6
--------------------------------
[ Upstream commit 940d71c6462e8151c78f28e4919aa8882ff2054e ]
If VCPU is suspended (VM suspend) in wq_watchdog_timer_fn() then once this VCPU resumes it will see the new jiffies value, while it may take a while before IRQ detects PVCLOCK_GUEST_STOPPED on this VCPU and updates all the watchdogs via pvclock_touch_watchdogs(). There is a small chance of misreported WQ stalls in the meantime, because new jiffies is time_after() old 'ts + thresh'.
wq_watchdog_timer_fn() { for_each_pool(pool, pi) { if (time_after(jiffies, ts + thresh)) { pr_emerg("BUG: workqueue lockup - pool"); } } }
Save jiffies at the beginning of this function and use that value for stall detection. If VM gets suspended then we continue using "old" jiffies value and old WQ touch timestamps. If IRQ at some point restarts the stall detection cycle (pvclock_touch_watchdogs()) then old jiffies will always be before new 'ts + thresh'.
Signed-off-by: Sergey Senozhatsky senozhatsky@chromium.org Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/workqueue.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2648c7a10bd47..dca1f9e44e687 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -49,6 +49,7 @@ #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/nmi.h> +#include <linux/kvm_para.h>
#include "workqueue_internal.h"
@@ -5646,6 +5647,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; + unsigned long now = jiffies; struct worker_pool *pool; int pi;
@@ -5660,6 +5662,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (list_empty(&pool->worklist)) continue;
+ /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a stall. + */ + kvm_check_and_clear_guest_paused(); + /* get the latest of pool and touched timestamps */ pool_ts = READ_ONCE(pool->watchdog_ts); touched = READ_ONCE(wq_watchdog_touched); @@ -5678,12 +5686,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) }
/* did we stall? */ - if (time_after(jiffies, ts + thresh)) { + if (time_after(now, ts + thresh)) { lockup_detected = true; pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", - jiffies_to_msecs(jiffies - pool_ts) / 1000); + jiffies_to_msecs(now - pool_ts) / 1000); } }
From: Alexander Kuznetsov wwfq@yandex-team.ru
stable inclusion from linux-4.19.195 commit 7e7ff4d058341ce8d1ab647585a73b38d3b493cf
--------------------------------
commit b7e24eb1caa5f8da20d405d262dba67943aedc42 upstream.
cgroup_mkdir() have restriction on newline usage in names: $ mkdir $'/sys/fs/cgroup/cpu/test\ntest2' mkdir: cannot create directory '/sys/fs/cgroup/cpu/test\ntest2': Invalid argument
But in cgroup1_rename() such check is missed. This allows us to make /proc/<pid>/cgroup unparsable: $ mkdir /sys/fs/cgroup/cpu/test $ mv /sys/fs/cgroup/cpu/test $'/sys/fs/cgroup/cpu/test\ntest2' $ echo $$ > $'/sys/fs/cgroup/cpu/test\ntest2' $ cat /proc/self/cgroup 11:pids:/ 10:freezer:/ 9:hugetlb:/ 8:cpuset:/ 7:blkio:/user.slice 6:memory:/user.slice 5:net_cls,net_prio:/ 4:perf_event:/ 3:devices:/user.slice 2:cpu,cpuacct:/test test2 1:name=systemd:/ 0::/
Signed-off-by: Alexander Kuznetsov wwfq@yandex-team.ru Reported-by: Andrey Krasichkov buglloc@yandex-team.ru Acked-by: Dmitry Yakunin zeil@yandex-team.ru Cc: stable@vger.kernel.org Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/cgroup/cgroup-v1.c | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 542bbe09fa207..055063c9c2a38 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -852,6 +852,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent struct cgroup *cgrp = kn->priv; int ret;
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent)
From: Marco Elver elver@google.com
stable inclusion from linux-4.19.195 commit db72bdb5c7981e7f1f7ffb46282e784292e7f112
--------------------------------
commit 6c605f8371159432ec61cbb1488dcf7ad24ad19a upstream.
KCSAN reports a data race between increment and decrement of pin_count:
write to 0xffff888237c2d4e0 of 4 bytes by task 15740 on cpu 1: find_get_context kernel/events/core.c:4617 __do_sys_perf_event_open kernel/events/core.c:12097 [inline] __se_sys_perf_event_open kernel/events/core.c:11933 ... read to 0xffff888237c2d4e0 of 4 bytes by task 15743 on cpu 0: perf_unpin_context kernel/events/core.c:1525 [inline] __do_sys_perf_event_open kernel/events/core.c:12328 [inline] __se_sys_perf_event_open kernel/events/core.c:11933 ...
Because neither read-modify-write here is atomic, this can lead to one of the operations being lost, resulting in an inconsistent pin_count. Fix it by adding the missing locking in the CPU-event case.
Fixes: fe4b04fa31a6 ("perf: Cure task_oncpu_function_call() races") Reported-by: syzbot+142c9018f5962db69c7e@syzkaller.appspotmail.com Signed-off-by: Marco Elver elver@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20210527104711.2671610-1-elver@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 071d0ac45412d..f6da9e11693dd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4155,7 +4155,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx; }
From: Vincent Guittot vincent.guittot@linaro.org
stable inclusion from linux-4.19.195 commit 4af84445075da26439598984d1a01c1cdadb0e2d
--------------------------------
commit 02da26ad5ed6ea8680e5d01f20661439611ed776 upstream.
During the update of fair blocked load (__update_blocked_fair()), we update the contribution of the cfs in tg->load_avg if cfs_rq's pelt has decayed. Nevertheless, the pelt values of a cfs_rq could have been recently updated while propagating the change of a child. In this case, cfs_rq's pelt will not decayed because it has already been updated and we don't update tg->load_avg.
__update_blocked_fair ... for_each_leaf_cfs_rq_safe: child cfs_rq update cfs_rq_load_avg() for child cfs_rq ... update_load_avg(cfs_rq_of(se), se, 0) ... update cfs_rq_load_avg() for parent cfs_rq -propagation of child's load makes parent cfs_rq->load_sum becoming null -UPDATE_TG is not set so it doesn't update parent cfs_rq->tg_load_avg_contrib .. for_each_leaf_cfs_rq_safe: parent cfs_rq update cfs_rq_load_avg() for parent cfs_rq - nothing to do because parent cfs_rq has already been updated recently so cfs_rq->tg_load_avg_contrib is not updated ... parent cfs_rq is decayed list_del_leaf_cfs_rq parent cfs_rq - but it still contibutes to tg->load_avg
we must set UPDATE_TG flags when propagting pending load to the parent
Fixes: 039ae8bcf7a5 ("sched/fair: Fix O(nr_cgroups) in the load balancing path") Reported-by: Odin Ugedal odin@uged.al Signed-off-by: Vincent Guittot vincent.guittot@linaro.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Odin Ugedal odin@uged.al Link: https://lkml.kernel.org/r/20210527122916.27683-3-vincent.guittot@linaro.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 08d877b57c4f2..193cce15f69b0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7706,7 +7706,7 @@ static void update_blocked_averages(int cpu) /* Propagate pending load changes to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
/* * There can be a lot of idle CPU cgroups. Don't let fully
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.195 commit 4b380a7d84ef2ce3f4f5bec5d8706ed937ac6502
--------------------------------
[ Upstream commit 09226e8303beeec10f2ff844d2e46d1371dc58e0 ]
None of the callers are expecting NULL returns from nfs_get_client() so this code will lead to an Oops. It's better to return an error pointer. I expect that this is dead code so hopefully no one is affected.
Fixes: 31434f496abb ("nfs: check hostname in nfs_get_client") Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25230de31364..b8adf3467786e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -407,7 +407,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init)
if (cl_init->hostname == NULL) { WARN_ON(1); - return NULL; + return ERR_PTR(-EINVAL); }
/* see if the client already exists */
From: Anna Schumaker Anna.Schumaker@Netapp.com
stable inclusion from linux-4.19.195 commit 42c10b0db064e45f5c5ae7019bbf2168ffab766c
--------------------------------
commit 476bdb04c501fc64bf3b8464ffddefc8dbe01577 upstream.
KASAN reports a use-after-free when attempting to mount two different exports through two different NICs that belong to the same server.
Olga was able to hit this with kernels starting somewhere between 5.7 and 5.10, but I traced the patch that introduced the clear_bit() call to 4.13. So something must have changed in the refcounting of the clp pointer to make this call to nfs_put_client() the very last one.
Fixes: 8dcbec6d20 ("NFSv41: Handle EXCHID4_FLAG_CONFIRMED_R during NFSv4.1 migration") Cc: stable@vger.kernel.org # 4.13+ Signed-off-by: Anna Schumaker Anna.Schumaker@Netapp.com Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/nfs4client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 48a1892f05854..cfc154a59c04f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -431,8 +431,8 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, */ nfs_mark_client_ready(clp, -EPERM); } - nfs_put_client(clp); clear_bit(NFS_CS_TSM_POSSIBLE, &clp->cl_flags); + nfs_put_client(clp); return old;
error:
From: Dai Ngo dai.ngo@oracle.com
stable inclusion from linux-4.19.195 commit 626928c3e3127867f078d986cc868c5e3c367bd9
--------------------------------
commit f8849e206ef52b584cd9227255f4724f0cc900bb upstream.
Currently if __nfs4_proc_set_acl fails with NFS4ERR_BADOWNER it re-enables the idmapper by clearing NFS_CAP_UIDGID_NOMAP before retrying again. The NFS_CAP_UIDGID_NOMAP remains cleared even if the retry fails. This causes problem for subsequent setattr requests for v4 server that does not have idmapping configured.
This patch modifies nfs4_proc_set_acl to detect NFS4ERR_BADOWNER and NFS4ERR_BADNAME and skips the retry, since the kernel isn't involved in encoding the ACEs, and return -EINVAL.
Steps to reproduce the problem:
# mount -o vers=4.1,sec=sys server:/export/test /tmp/mnt # touch /tmp/mnt/file1 # chown 99 /tmp/mnt/file1 # nfs4_setfacl -a A::unknown.user@xyz.com:wrtncy /tmp/mnt/file1 Failed setxattr operation: Invalid argument # chown 99 /tmp/mnt/file1 chown: changing ownership of ‘/tmp/mnt/file1’: Invalid argument # umount /tmp/mnt # mount -o vers=4.1,sec=sys server:/export/test /tmp/mnt # chown 99 /tmp/mnt/file1 #
v2: detect NFS4ERR_BADOWNER and NFS4ERR_BADNAME and skip retry in nfs4_proc_set_acl. Signed-off-by: Dai Ngo dai.ngo@oracle.com Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/nfs4proc.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 4257963f5d855..24df1c068ebad 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5659,6 +5659,14 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen do { err = __nfs4_proc_set_acl(inode, buf, buflen); trace_nfs4_set_acl(inode, err); + if (err == -NFS4ERR_BADOWNER || err == -NFS4ERR_BADNAME) { + /* + * no need to retry since the kernel + * isn't involved in encoding the ACEs. + */ + err = -EINVAL; + break; + } err = nfs4_handle_exception(NFS_SERVER(inode), err, &exception); } while (exception.retry);
From: Ming Lei ming.lei@redhat.com
stable inclusion from linux-4.19.195 commit 2dc85045ae65b9302a1d2e2ddd7ce4c030153a6a
--------------------------------
commit 66a834d092930cf41d809c0e989b13cd6f9ca006 upstream.
After device is initialized via device_initialize(), or its name is set via dev_set_name(), the device has to be freed via put_device(). Otherwise device name will be leaked because it is allocated dynamically in dev_set_name().
Fix the leak by replacing kfree() with put_device(). Since scsi_host_dev_release() properly handles IDA and kthread removal, remove special-casing these from the error handling as well.
Link: https://lore.kernel.org/r/20210602133029.2864069-2-ming.lei@redhat.com Cc: Bart Van Assche bvanassche@acm.org Cc: John Garry john.garry@huawei.com Cc: Hannes Reinecke hare@suse.de Tested-by: John Garry john.garry@huawei.com Reviewed-by: Bart Van Assche bvanassche@acm.org Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/hosts.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index d6751764e62c5..ace6f48be98bd 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -400,8 +400,10 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) mutex_init(&shost->scan_mutex);
index = ida_simple_get(&host_index_ida, 0, 0, GFP_KERNEL); - if (index < 0) - goto fail_kfree; + if (index < 0) { + kfree(shost); + return NULL; + } shost->host_no = index;
shost->dma_channel = 0xff; @@ -488,7 +490,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) shost_printk(KERN_WARNING, shost, "error handler thread failed to spawn, error = %ld\n", PTR_ERR(shost->ehandler)); - goto fail_index_remove; + goto fail; }
shost->tmf_work_q = alloc_workqueue("scsi_tmf_%d", @@ -497,17 +499,18 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize) if (!shost->tmf_work_q) { shost_printk(KERN_WARNING, shost, "failed to create tmf workq\n"); - goto fail_kthread; + goto fail; } scsi_proc_hostdir_add(shost->hostt); return shost; + fail: + /* + * Host state is still SHOST_CREATED and that is enough to release + * ->shost_gendev. scsi_host_dev_release() will free + * dev_name(&shost->shost_dev). + */ + put_device(&shost->shost_gendev);
- fail_kthread: - kthread_stop(shost->ehandler); - fail_index_remove: - ida_simple_remove(&host_index_ida, shost->host_no); - fail_kfree: - kfree(shost); return NULL; } EXPORT_SYMBOL(scsi_host_alloc);
From: Ming Lei ming.lei@redhat.com
stable inclusion from linux-4.19.195 commit 7dc0595263fd4294e5f355594eff74891d027e86
--------------------------------
commit 11714026c02d613c30a149c3f4c4a15047744529 upstream.
scsi_host_dev_release() only frees dev_name when host state is SHOST_CREATED. After host state has changed to SHOST_RUNNING, scsi_host_dev_release() no longer cleans up.
Fix this by doing a put_device(&shost->shost_dev) in the failure path when host state is SHOST_RUNNING. Move get_device(&shost->shost_gendev) before device_add(&shost->shost_dev) so that scsi_host_cls_release() can do a put on this reference.
Link: https://lore.kernel.org/r/20210602133029.2864069-4-ming.lei@redhat.com Cc: Bart Van Assche bvanassche@acm.org Cc: Hannes Reinecke hare@suse.de Reported-by: John Garry john.garry@huawei.com Tested-by: John Garry john.garry@huawei.com Reviewed-by: John Garry john.garry@huawei.com Reviewed-by: Hannes Reinecke hare@suse.de Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/hosts.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index ace6f48be98bd..94eec7dfcebcb 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -261,12 +261,11 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev,
device_enable_async_suspend(&shost->shost_dev);
+ get_device(&shost->shost_gendev); error = device_add(&shost->shost_dev); if (error) goto out_del_gendev;
- get_device(&shost->shost_gendev); - if (shost->transportt->host_size) { shost->shost_data = kzalloc(shost->transportt->host_size, GFP_KERNEL); @@ -302,6 +301,11 @@ int scsi_add_host_with_dma(struct Scsi_Host *shost, struct device *dev, out_del_dev: device_del(&shost->shost_dev); out_del_gendev: + /* + * Host state is SHOST_RUNNING so we have to explicitly release + * ->shost_dev. + */ + put_device(&shost->shost_dev); device_del(&shost->shost_gendev); out_disable_runtime_pm: device_disable_async_suspend(&shost->shost_gendev);
From: Ming Lei ming.lei@redhat.com
stable inclusion from linux-4.19.195 commit 681e5c84dd98a22019200a40b6e59ad9326c3a2b
--------------------------------
commit 1e0d4e6225996f05271de1ebcb1a7c9381af0b27 upstream.
get_device(shost->shost_gendev.parent) is called after host state has switched to SHOST_RUNNING. scsi_host_dev_release() shouldn't release the parent device if host state is still SHOST_CREATED.
Link: https://lore.kernel.org/r/20210602133029.2864069-5-ming.lei@redhat.com Cc: Bart Van Assche bvanassche@acm.org Cc: John Garry john.garry@huawei.com Cc: Hannes Reinecke hare@suse.de Tested-by: John Garry john.garry@huawei.com Reviewed-by: John Garry john.garry@huawei.com Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/hosts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/scsi/hosts.c b/drivers/scsi/hosts.c index 94eec7dfcebcb..3bbbeb8e79f3c 100644 --- a/drivers/scsi/hosts.c +++ b/drivers/scsi/hosts.c @@ -357,7 +357,7 @@ static void scsi_host_dev_release(struct device *dev)
ida_simple_remove(&host_index_ida, shost->host_no);
- if (parent) + if (shost->shost_state != SHOST_CREATED) put_device(parent); kfree(shost); }
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.195 commit 862dcc14f2803c556bdd73b43c27b023fafce2fb
--------------------------------
commit 6c14133d2d3f768e0a35128faac8aa6ed4815051 upstream.
It was reported that a bug on arm64 caused a bad ip address to be used for updating into a nop in ftrace_init(), but the error path (rightfully) returned -EINVAL and not -EFAULT, as the bug caused more than one error to occur. But because -EINVAL was returned, the ftrace_bug() tried to report what was at the location of the ip address, and read it directly. This caused the machine to panic, as the ip was not pointing to a valid memory address.
Instead, read the ip address with copy_from_kernel_nofault() to safely access the memory, and if it faults, report that the address faulted, otherwise report what was in that location.
Link: https://lore.kernel.org/lkml/20210607032329.28671-1-mark-pk.tsai@mediatek.co...
Cc: stable@vger.kernel.org Fixes: 05736a427f7e1 ("ftrace: warn on failure to disable mcount callers") Reported-by: Mark-PK Tsai mark-pk.tsai@mediatek.com Tested-by: Mark-PK Tsai mark-pk.tsai@mediatek.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/ftrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b296e94de2eb1..0da3460fac514 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1977,12 +1977,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, const unsigned char *p) { + char ins[MCOUNT_INSN_SIZE]; int i;
+ if (probe_kernel_read(ins, p, MCOUNT_INSN_SIZE)) { + printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); + return; + } + printk(KERN_CONT "%s", fmt);
for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); + printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]); }
enum ftrace_bug_type ftrace_bug_type;
From: Liangyan liangyan.peng@linux.alibaba.com
stable inclusion from linux-4.19.195 commit 31ceae385556c37e4d286cb6378696448f566883
--------------------------------
commit 3e08a9f9760f4a70d633c328a76408e62d6f80a3 upstream.
We've suffered from severe kernel crashes due to memory corruption on our production environment, like,
Call Trace: [1640542.554277] general protection fault: 0000 [#1] SMP PTI [1640542.554856] CPU: 17 PID: 26996 Comm: python Kdump: loaded Tainted:G [1640542.556629] RIP: 0010:kmem_cache_alloc+0x90/0x190 [1640542.559074] RSP: 0018:ffffb16faa597df8 EFLAGS: 00010286 [1640542.559587] RAX: 0000000000000000 RBX: 0000000000400200 RCX: 0000000006e931bf [1640542.560323] RDX: 0000000006e931be RSI: 0000000000400200 RDI: ffff9a45ff004300 [1640542.560996] RBP: 0000000000400200 R08: 0000000000023420 R09: 0000000000000000 [1640542.561670] R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff9a20608d [1640542.562366] R13: ffff9a45ff004300 R14: ffff9a45ff004300 R15: 696c662f65636976 [1640542.563128] FS: 00007f45d7c6f740(0000) GS:ffff9a45ff840000(0000) knlGS:0000000000000000 [1640542.563937] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1640542.564557] CR2: 00007f45d71311a0 CR3: 000000189d63e004 CR4: 00000000003606e0 [1640542.565279] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1640542.566069] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1640542.566742] Call Trace: [1640542.567009] anon_vma_clone+0x5d/0x170 [1640542.567417] __split_vma+0x91/0x1a0 [1640542.567777] do_munmap+0x2c6/0x320 [1640542.568128] vm_munmap+0x54/0x70 [1640542.569990] __x64_sys_munmap+0x22/0x30 [1640542.572005] do_syscall_64+0x5b/0x1b0 [1640542.573724] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [1640542.575642] RIP: 0033:0x7f45d6e61e27
James Wang has reproduced it stably on the latest 4.19 LTS. After some debugging, we finally proved that it's due to ftrace buffer out-of-bound access using a debug tool as follows: [ 86.775200] BUG: Out-of-bounds write at addr 0xffff88aefe8b7000 [ 86.780806] no_context+0xdf/0x3c0 [ 86.784327] __do_page_fault+0x252/0x470 [ 86.788367] do_page_fault+0x32/0x140 [ 86.792145] page_fault+0x1e/0x30 [ 86.795576] strncpy_from_unsafe+0x66/0xb0 [ 86.799789] fetch_memory_string+0x25/0x40 [ 86.804002] fetch_deref_string+0x51/0x60 [ 86.808134] kprobe_trace_func+0x32d/0x3a0 [ 86.812347] kprobe_dispatcher+0x45/0x50 [ 86.816385] kprobe_ftrace_handler+0x90/0xf0 [ 86.820779] ftrace_ops_assist_func+0xa1/0x140 [ 86.825340] 0xffffffffc00750bf [ 86.828603] do_sys_open+0x5/0x1f0 [ 86.832124] do_syscall_64+0x5b/0x1b0 [ 86.835900] entry_SYSCALL_64_after_hwframe+0x44/0xa9
commit b220c049d519 ("tracing: Check length before giving out the filter buffer") adds length check to protect trace data overflow introduced in 0fc1b09ff1ff, seems that this fix can't prevent overflow entirely, the length check should also take the sizeof entry->array[0] into account, since this array[0] is filled the length of trace data and occupy addtional space and risk overflow.
Link: https://lkml.kernel.org/r/20210607125734.1770447-1-liangyan.peng@linux.aliba...
Cc: stable@vger.kernel.org Cc: Ingo Molnar mingo@redhat.com Cc: Xunlei Pang xlpang@linux.alibaba.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Fixes: b220c049d519 ("tracing: Check length before giving out the filter buffer") Reviewed-by: Xunlei Pang xlpang@linux.alibaba.com Reviewed-by: yinbinbin yinbinbin@alibabacloud.com Reviewed-by: Wetp Zhang wetp.zy@linux.alibaba.com Tested-by: James Wang jnwang@linux.alibaba.com Signed-off-by: Liangyan liangyan.peng@linux.alibaba.com Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cec7e588c9a94..7d3cea9fb874c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2279,7 +2279,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { + if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { trace_event_setup(entry, type, flags, pc); entry->array[0] = len; return entry;
From: Linus Torvalds torvalds@linux-foundation.org
stable inclusion from linux-4.19.195 commit 2bc534caba6a9650123e7ddb89b3e34845df9c7b
--------------------------------
commit 94f0b2d4a1d0c52035aef425da5e022bd2cb1c71 upstream.
Commit 591a22c14d3f ("proc: Track /proc/$pid/attr/ opener mm_struct") we started using __mem_open() to track the mm_struct at open-time, so that we could then check it for writes.
But that also ended up making the permission checks at open time much stricter - and not just for writes, but for reads too. And that in turn caused a regression for at least Fedora 29, where NIC interfaces fail to start when using NetworkManager.
Since only the write side wanted the mm_struct test, ignore any failures by __mem_open() at open time, leaving reads unaffected. The write() time verification of the mm_struct pointer will then catch the failure case because a NULL pointer will not match a valid 'current->mm'.
Link: https://lore.kernel.org/netdev/YMjTlp2FSJYvoyFa@unreal/ Fixes: 591a22c14d3f ("proc: Track /proc/$pid/attr/ opener mm_struct") Reported-and-tested-by: Leon Romanovsky leon@kernel.org Cc: Kees Cook keescook@chromium.org Cc: Christian Brauner christian.brauner@ubuntu.com Cc: Andrea Righi andrea.righi@canonical.com Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/proc/base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index e46c10acb784b..a8679c244e40a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2537,7 +2537,9 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, #ifdef CONFIG_SECURITY static int proc_pid_attr_open(struct inode *inode, struct file *file) { - return __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + file->private_data = NULL; + __mem_open(inode, file, PTRACE_MODE_READ_FSCREDS); + return 0; }
static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
From: Maurizio Lombardi mlombard@redhat.com
stable inclusion from linux-4.19.196 commit 8fa2bb8837ea92f725e63ec38935195bc0266668
--------------------------------
[ Upstream commit 515da6f4295c2c42b8c54572cce3d2dd1167c41e ]
On realtime kernels, spin_lock_irq*(spinlock_t) do not disable the interrupts, a call to irqs_disabled() will return false thus firing a warning in __transport_wait_for_tasks().
Remove the warning and also replace assert_spin_locked() with lockdep_assert_held()
Link: https://lore.kernel.org/r/20210531121326.3649-1-mlombard@redhat.com Reviewed-by: Bart Van Assche bvanassche@acm.org Signed-off-by: Maurizio Lombardi mlombard@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/target/target_core_transport.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-)
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index bdada97cd4fe2..9c60a090cfd17 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -2982,9 +2982,7 @@ __transport_wait_for_tasks(struct se_cmd *cmd, bool fabric_stop, __releases(&cmd->t_state_lock) __acquires(&cmd->t_state_lock) { - - assert_spin_locked(&cmd->t_state_lock); - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_held(&cmd->t_state_lock);
if (fabric_stop) cmd->transport_state |= CMD_T_FABRIC_STOP;
From: Hannes Reinecke hare@suse.de
stable inclusion from linux-4.19.196 commit a0a60f6092b50a0cc261e829bb5fbb2ac33dac29
--------------------------------
[ Upstream commit a6c144f3d2e230f2b3ac5ed8c51e0f0391556197 ]
The queue count is increased in nvme_loop_init_io_queues(), so we need to reset it to 1 at the end of nvme_loop_destroy_io_queues(). Otherwise the function is not re-entrant safe, and crash will happen during concurrent reset and remove calls.
Signed-off-by: Hannes Reinecke hare@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index 122fbfc31889f..ae3d05a9ae28b 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -309,6 +309,7 @@ static void nvme_loop_destroy_io_queues(struct nvme_loop_ctrl *ctrl) clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[i].flags); nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); } + ctrl->ctrl.queue_count = 1; }
static int nvme_loop_init_io_queues(struct nvme_loop_ctrl *ctrl)
From: Hannes Reinecke hare@suse.de
stable inclusion from linux-4.19.196 commit ef244ccfd1c213fa8c05b58b6760640718e9b636
--------------------------------
[ Upstream commit 1c5f8e882a05de5c011e8c3fbeceb0d1c590eb53 ]
When the call to nvme_enable_ctrl() in nvme_loop_configure_admin_queue() fails the NVME_LOOP_Q_LIVE flag is not cleared.
Signed-off-by: Hannes Reinecke hare@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/target/loop.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index ae3d05a9ae28b..b0bf40e5c00b5 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -418,6 +418,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) return 0;
out_cleanup_queue: + clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); blk_cleanup_queue(ctrl->ctrl.admin_q); out_free_tagset: blk_mq_free_tag_set(&ctrl->admin_tag_set);
From: Hannes Reinecke hare@suse.de
stable inclusion from linux-4.19.196 commit 9ccbe18eb48a9b27b646931e01d84dfe0bc8a10d
--------------------------------
[ Upstream commit 4237de2f73a669e4f89ac0aa2b44fb1a1d9ec583 ]
We need to check the NVME_LOOP_Q_LIVE flag in nvme_loop_destroy_admin_queue() to protect against duplicate invocations eg during concurrent reset and remove calls.
Signed-off-by: Hannes Reinecke hare@suse.de Reviewed-by: Chaitanya Kulkarni chaitanya.kulkarni@wdc.com Signed-off-by: Christoph Hellwig hch@lst.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/nvme/target/loop.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index b0bf40e5c00b5..ffe7fa814d8c8 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -274,7 +274,8 @@ static const struct blk_mq_ops nvme_loop_admin_mq_ops = {
static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) { - clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags); + if (!test_and_clear_bit(NVME_LOOP_Q_LIVE, &ctrl->queues[0].flags)) + return; nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); blk_cleanup_queue(ctrl->ctrl.admin_q); blk_mq_free_tag_set(&ctrl->admin_tag_set);
From: Josh Triplett josh@joshtriplett.org
stable inclusion from linux-4.19.196 commit 51c0007f4c6b88f22c2a5bfb099d7654f0db973c
--------------------------------
[ Upstream commit b508d5fb69c2211a1b860fc058aafbefc3b3c3cd ]
If the user specifies a hostname or domain name as part of the ip= command-line option, preserve it and don't overwrite it with one supplied by DHCP/BOOTP.
For instance, ip=::::myhostname::dhcp will use "myhostname" rather than ignoring and overwriting it.
Fix the comment on ic_bootp_string that suggests it only copies a string "if not already set"; it doesn't have any such logic.
Signed-off-by: Josh Triplett josh@joshtriplett.org Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/ipconfig.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 88212615bf4ce..58719b9635d90 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -866,7 +866,7 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
/* - * Copy BOOTP-supplied string if not already set. + * Copy BOOTP-supplied string */ static int __init ic_bootp_string(char *dest, char *src, int len, int max) { @@ -915,12 +915,15 @@ static void __init ic_do_bootp_ext(u8 *ext) } break; case 12: /* Host name */ - ic_bootp_string(utsname()->nodename, ext+1, *ext, - __NEW_UTS_LEN); - ic_host_name_set = 1; + if (!ic_host_name_set) { + ic_bootp_string(utsname()->nodename, ext+1, *ext, + __NEW_UTS_LEN); + ic_host_name_set = 1; + } break; case 15: /* Domain name (DNS) */ - ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + if (!ic_domain[0]) + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); break; case 17: /* Root path */ if (!root_server_path[0])
From: Jiapeng Chong jiapeng.chong@linux.alibaba.com
stable inclusion from linux-4.19.196 commit 7e0147403ebae2fc60b1657114058d20e874a89a
--------------------------------
[ Upstream commit a8db57c1d285c758adc7fb43d6e2bad2554106e1 ]
The error code is missing in this code scenario, add the error code '-EINVAL' to the return value 'err'.
Eliminate the follow smatch warning:
net/core/rtnetlink.c:4834 rtnl_bridge_notify() warn: missing error code 'err'.
Reported-by: Abaci Robot abaci@linux.alibaba.com Signed-off-by: Jiapeng Chong jiapeng.chong@linux.alibaba.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/rtnetlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 935053ee7765d..7f2dda27f9e72 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4102,8 +4102,10 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout;
- if (!skb->len) + if (!skb->len) { + err = -EINVAL; goto errout; + }
rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0;
From: Zheng Yongjun zhengyongjun3@huawei.com
stable inclusion from linux-4.19.196 commit b966a0defc93c533c0f96410c20f50c42a5fed52
--------------------------------
[ Upstream commit 49251cd00228a3c983651f6bb2f33f6a0b8f152e ]
When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF.
Signed-off-by: Zheng Yongjun zhengyongjun3@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/compat.c b/net/compat.c index 42afe8f45ff88..1dcbbf481318a 100644 --- a/net/compat.c +++ b/net/compat.c @@ -175,7 +175,7 @@ int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, struct sock *sk, if (kcmlen > stackbuf_size) kcmsg_base = kcmsg = sock_kmalloc(sk, kcmlen, GFP_KERNEL); if (kcmsg == NULL) - return -ENOBUFS; + return -ENOMEM;
/* Now copy them over neatly. */ memset(kcmsg, 0, kcmlen);
From: Zheng Yongjun zhengyongjun3@huawei.com
stable inclusion from linux-4.19.196 commit 7e3f278d55b0677aa82d07ba521390c8b090ee69
--------------------------------
[ Upstream commit 59607863c54e9eb3f69afc5257dfe71c38bb751e ]
When kalloc or kmemdup failed, should return ENOMEM rather than ENOBUF.
Signed-off-by: Zheng Yongjun zhengyongjun3@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/fib_rules.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 8916c5d9b3b3a..46a13ed15c4e8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -1105,7 +1105,7 @@ static void notify_rule_change(int event, struct fib_rule *rule, { struct net *net; struct sk_buff *skb; - int err = -ENOBUFS; + int err = -ENOMEM;
net = ops->fro_net; skb = nlmsg_new(fib_rule_nlmsg_size(ops, rule), GFP_KERNEL);
From: Nanyong Sun sunnanyong@huawei.com
stable inclusion from linux-4.19.196 commit 5340858147e3dc60913fb3dd0cbb758ec4a26e66
--------------------------------
[ Upstream commit d612c3f3fae221e7ea736d196581c2217304bbbc ]
Reported by syzkaller: BUG: memory leak unreferenced object 0xffff888105df7000 (size 64): comm "syz-executor842", pid 360, jiffies 4294824824 (age 22.546s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000e67ed558>] kmalloc include/linux/slab.h:590 [inline] [<00000000e67ed558>] kzalloc include/linux/slab.h:720 [inline] [<00000000e67ed558>] netlbl_cipsov4_add_std net/netlabel/netlabel_cipso_v4.c:145 [inline] [<00000000e67ed558>] netlbl_cipsov4_add+0x390/0x2340 net/netlabel/netlabel_cipso_v4.c:416 [<0000000006040154>] genl_family_rcv_msg_doit.isra.0+0x20e/0x320 net/netlink/genetlink.c:739 [<00000000204d7a1c>] genl_family_rcv_msg net/netlink/genetlink.c:783 [inline] [<00000000204d7a1c>] genl_rcv_msg+0x2bf/0x4f0 net/netlink/genetlink.c:800 [<00000000c0d6a995>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000d78b9d2c>] genl_rcv+0x24/0x40 net/netlink/genetlink.c:811 [<000000009733081b>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<000000009733081b>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000d5fd43b8>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000a2d1e40>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000a2d1e40>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000321d1969>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000964e16bc>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000001615e288>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<000000004ee8b6a5>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<00000000171c7cee>] entry_SYSCALL_64_after_hwframe+0x44/0xae
The memory of doi_def->map.std pointing is allocated in netlbl_cipsov4_add_std, but no place has freed it. It should be freed in cipso_v4_doi_free which frees the cipso DOI resource.
Fixes: 96cb8e3313c7a ("[NetLabel]: CIPSOv4 and Unlabeled packet integration") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Nanyong Sun sunnanyong@huawei.com Acked-by: Paul Moore paul@paul-moore.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/cipso_ipv4.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 6a1b52b34e205..e8b8dd1cb1576 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -486,6 +486,7 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); break; } kfree(doi_def);
From: Paolo Abeni pabeni@redhat.com
stable inclusion from linux-4.19.196 commit 2f73448041bd0682d4b552cfd314ace66107f1ad
--------------------------------
[ Upstream commit a8b897c7bcd47f4147d066e22cc01d1026d7640e ]
Kaustubh reported and diagnosed a panic in udp_lib_lookup(). The root cause is udp_abort() racing with close(). Both racing functions acquire the socket lock, but udp{v6}_destroy_sock() release it before performing destructive actions.
We can't easily extend the socket lock scope to avoid the race, instead use the SOCK_DEAD flag to prevent udp_abort from doing any action when the critical race happens.
Diagnosed-and-tested-by: Kaustubh Pandey kapandey@codeaurora.org Fixes: 5d77dca82839 ("net: diag: support SOCK_DESTROY for UDP sockets") Signed-off-by: Paolo Abeni pabeni@redhat.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/udp.c | 10 ++++++++++ net/ipv6/udp.c | 3 +++ 2 files changed, 13 insertions(+)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 871f41b7ab70c..7c68b9e213ef3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2429,6 +2429,9 @@ void udp_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); bool slow = lock_sock_fast(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { @@ -2670,10 +2673,17 @@ int udp_abort(struct sock *sk, int err) { lock_sock(sk);
+ /* udp{v6}_destroy_sock() sets it under the sk lock, avoid racing + * with close() + */ + if (sock_flag(sk, SOCK_DEAD)) + goto out; + sk->sk_err = err; sk->sk_error_report(sk); __udp_disconnect(sk, 0);
+out: release_sock(sk);
return 0; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6609f7b6994a9..1978a6d724c9f 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1476,6 +1476,9 @@ void udpv6_destroy_sock(struct sock *sk) { struct udp_sock *up = udp_sk(sk); lock_sock(sk); + + /* protects from races with udp_abort() */ + sock_set_flag(sk, SOCK_DEAD); udp_v6_flush_pending_frames(sk); release_sock(sk);
From: Ido Schimmel idosch@nvidia.com
stable inclusion from linux-4.19.196 commit ba14e0b49388ef9f02787fe3dc719e0621169e64
--------------------------------
[ Upstream commit d2e381c4963663bca6f30c3b996fa4dbafe8fcb5 ]
Cited commit started returning errors when notification info is not filled by the bridge driver, resulting in the following regression:
# ip link add name br1 type bridge vlan_filtering 1 # bridge vlan add dev br1 vid 555 self pvid untagged RTNETLINK answers: Invalid argument
As long as the bridge driver does not fill notification info for the bridge device itself, an empty notification should not be considered as an error. This is explained in commit 59ccaaaa49b5 ("bridge: dont send notification when skb->len == 0 in rtnl_bridge_notify").
Fix by removing the error and add a comment to avoid future bugs.
Fixes: a8db57c1d285 ("rtnetlink: Fix missing error code in rtnl_bridge_notify()") Signed-off-by: Ido Schimmel idosch@nvidia.com Reviewed-by: Nikolay Aleksandrov nikolay@nvidia.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/rtnetlink.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7f2dda27f9e72..055fd09ac1114 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4102,10 +4102,12 @@ static int rtnl_bridge_notify(struct net_device *dev) if (err < 0) goto errout;
- if (!skb->len) { - err = -EINVAL; + /* Notification info is only filled for bridge ports, not the bridge + * device itself. Therefore, a zero notification length is valid and + * should not result in an error. + */ + if (!skb->len) goto errout; - }
rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, GFP_ATOMIC); return 0;
From: Maxim Mikityanskiy maximmi@nvidia.com
stable inclusion from linux-4.19.196 commit 7d9a9a1a88a3da574e019b4de756bc73337b3b0b
--------------------------------
[ Upstream commit 5fc177ab759418c9537433e63301096e733fb915 ]
The TCP option parser in synproxy (synproxy_parse_options) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1.
This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options.").
v2 changes:
Added an early return when length < 0 to avoid calling skb_header_pointer with negative length.
Cc: Young Xiao 92siuyang@gmail.com Fixes: 48b1de4c110a ("netfilter: add SYNPROXY core/target") Signed-off-by: Maxim Mikityanskiy maximmi@nvidia.com Reviewed-by: Florian Westphal fw@strlen.de Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nf_synproxy_core.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 353a2aa80c3cc..04b07b63c5408 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -34,6 +34,9 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, int length = (th->doff * 4) - sizeof(*th); u8 buf[40], *ptr;
+ if (unlikely(length < 0)) + return false; + ptr = skb_header_pointer(skb, doff + sizeof(*th), length, buf); if (ptr == NULL) return false; @@ -50,6 +53,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, length--; continue; default: + if (length < 2) + return true; opsize = *ptr++; if (opsize < 2) return true;
From: Maxim Mikityanskiy maximmi@nvidia.com
stable inclusion from linux-4.19.196 commit 595897ef118d6fe66690c4fc5b572028c9da95b7
--------------------------------
[ Upstream commit ba91c49dedbde758ba0b72f57ac90b06ddf8e548 ]
The TCP option parser in cake qdisc (cake_get_tcpopt and cake_tcph_may_drop) could read one byte out of bounds. When the length is 1, the execution flow gets into the loop, reads one byte of the opcode, and if the opcode is neither TCPOPT_EOL nor TCPOPT_NOP, it reads one more byte, which exceeds the length of 1.
This fix is inspired by commit 9609dad263f8 ("ipv4: tcp_input: fix stack out of bounds when parsing TCP options.").
v2 changes:
Added doff validation in cake_get_tcphdr to avoid parsing garbage as TCP header. Although it wasn't strictly an out-of-bounds access (memory was allocated), garbage values could be read where CAKE expected the TCP header if doff was smaller than 5.
Cc: Young Xiao 92siuyang@gmail.com Fixes: 8b7138814f29 ("sch_cake: Add optional ACK filter") Signed-off-by: Maxim Mikityanskiy maximmi@nvidia.com Acked-by: Toke Høiland-Jørgensen toke@toke.dk Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/sched/sch_cake.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 32712e7dcbdc2..2025f0f559deb 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -900,7 +900,7 @@ static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb, }
tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); - if (!tcph) + if (!tcph || tcph->doff < 5) return NULL;
return skb_header_pointer(skb, offset, @@ -924,6 +924,8 @@ static const void *cake_get_tcpopt(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break; @@ -1061,6 +1063,8 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph, length--; continue; } + if (length < 2) + break; opsize = *ptr++; if (opsize < 2 || opsize > length) break;
From: Pedro Tammela pctammela@gmail.com
stable inclusion from linux-4.19.196 commit 137e2b34d1c6304b1fa2d55d5232a457bf8b2655
--------------------------------
[ Upstream commit 8a3c245c031944f2176118270e7bc5d4fd4a1075 ]
Adds missing sphinx documentation to the socket.c's functions. Also fixes some whitespaces.
I also changed the style of older documentation as an effort to have an uniform documentation style.
Signed-off-by: Pedro Tammela pctammela@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/net.h | 6 + include/linux/socket.h | 12 +- net/socket.c | 277 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 271 insertions(+), 24 deletions(-)
diff --git a/include/linux/net.h b/include/linux/net.h index f55bee9726fd0..a9c238656424f 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -84,6 +84,12 @@ enum sock_type {
#endif /* ARCH_HAS_SOCKET_TYPES */
+/** + * enum sock_shutdown_cmd - Shutdown types + * @SHUT_RD: shutdown receptions + * @SHUT_WR: shutdown transmissions + * @SHUT_RDWR: shutdown receptions/transmissions + */ enum sock_shutdown_cmd { SHUT_RD, SHUT_WR, diff --git a/include/linux/socket.h b/include/linux/socket.h index 05c87e849a87c..2b06d8ccbd168 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -28,7 +28,7 @@ typedef __kernel_sa_family_t sa_family_t; /* * 1003.1g requires sa_family_t and that sa_data is char. */ - + struct sockaddr { sa_family_t sa_family; /* address family, AF_xxx */ char sa_data[14]; /* 14 bytes of protocol address */ @@ -46,7 +46,7 @@ struct linger { * system, not 4.3. Thus msg_accrights(len) are now missing. They * belong in an obscure libc emulation or the bin. */ - + struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ @@ -56,7 +56,7 @@ struct msghdr { unsigned int msg_flags; /* flags on received message */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ }; - + struct user_msghdr { void __user *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ @@ -124,7 +124,7 @@ struct cmsghdr { * inside range, given by msg->msg_controllen before using * ancillary object DATA. --ANK (980731) */ - + static inline struct cmsghdr * __cmsg_nxthdr(void *__ctl, __kernel_size_t __size, struct cmsghdr *__cmsg) { @@ -266,10 +266,10 @@ struct ucred { /* Maximum queue length specifiable by listen. */ #define SOMAXCONN 128
-/* Flags we can use with send/ and recv. +/* Flags we can use with send/ and recv. Added those for 1003.1g not all are supported yet */ - + #define MSG_OOB 1 #define MSG_PEEK 2 #define MSG_DONTROUTE 4 diff --git a/net/socket.c b/net/socket.c index 50403ebdd8f65..bf943e6d47fe8 100644 --- a/net/socket.c +++ b/net/socket.c @@ -384,6 +384,18 @@ static struct file_system_type sock_fs_type = { * but we take care of internal coherence yet. */
+/** + * sock_alloc_file - Bind a &socket to a &file + * @sock: socket + * @flags: file status flags + * @dname: protocol name + * + * Returns the &file bound with @sock, implicitly storing it + * in sock->file. If dname is %NULL, sets to "". + * On failure the return is a ERR pointer (see linux/err.h). + * This function uses GFP_KERNEL internally. + */ + struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname) { struct file *file; @@ -424,6 +436,14 @@ static int sock_map_fd(struct socket *sock, int flags) return PTR_ERR(newfile); }
+/** + * sock_from_file - Return the &socket bounded to @file. + * @file: file + * @err: pointer to an error code return + * + * On failure returns %NULL and assigns -ENOTSOCK to @err. + */ + struct socket *sock_from_file(struct file *file, int *err) { if (file->f_op == &socket_file_ops) @@ -532,11 +552,11 @@ static const struct inode_operations sockfs_inode_ops = { };
/** - * sock_alloc - allocate a socket + * sock_alloc - allocate a socket * * Allocate a new inode and socket object. The two are bound together * and initialised. The socket is then returned. If we are out of inodes - * NULL is returned. + * NULL is returned. This functions uses GFP_KERNEL internally. */
struct socket *sock_alloc(void) @@ -561,7 +581,7 @@ struct socket *sock_alloc(void) EXPORT_SYMBOL(sock_alloc);
/** - * sock_release - close a socket + * sock_release - close a socket * @sock: socket to close * * The socket is released from the protocol stack if it has a release @@ -617,6 +637,15 @@ void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags) } EXPORT_SYMBOL(__sock_tx_timestamp);
+/** + * sock_sendmsg - send a message through @sock + * @sock: socket + * @msg: message to send + * + * Sends @msg through @sock, passing through LSM. + * Returns the number of bytes sent, or an error code. + */ + static inline int sock_sendmsg_nosec(struct socket *sock, struct msghdr *msg) { int ret = sock->ops->sendmsg(sock, msg, msg_data_left(msg)); @@ -633,6 +662,18 @@ int sock_sendmsg(struct socket *sock, struct msghdr *msg) } EXPORT_SYMBOL(sock_sendmsg);
+/** + * kernel_sendmsg - send a message through @sock (kernel-space) + * @sock: socket + * @msg: message header + * @vec: kernel vec + * @num: vec array length + * @size: total message data size + * + * Builds the message data with @vec and sends it through @sock. + * Returns the number of bytes sent, or an error code. + */ + int kernel_sendmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { @@ -641,6 +682,19 @@ int kernel_sendmsg(struct socket *sock, struct msghdr *msg, } EXPORT_SYMBOL(kernel_sendmsg);
+/** + * kernel_sendmsg_locked - send a message through @sock (kernel-space) + * @sk: sock + * @msg: message header + * @vec: output s/g array + * @num: output s/g array length + * @size: total message data size + * + * Builds the message data with @vec and sends it through @sock. + * Returns the number of bytes sent, or an error code. + * Caller must hold @sk. + */ + int kernel_sendmsg_locked(struct sock *sk, struct msghdr *msg, struct kvec *vec, size_t num, size_t size) { @@ -789,6 +843,16 @@ void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, } EXPORT_SYMBOL_GPL(__sock_recv_ts_and_drops);
+/** + * sock_recvmsg - receive a message from @sock + * @sock: socket + * @msg: message to receive + * @flags: message flags + * + * Receives @msg from @sock, passing through LSM. Returns the total number + * of bytes received, or an error. + */ + static inline int sock_recvmsg_nosec(struct socket *sock, struct msghdr *msg, int flags) { @@ -804,20 +868,21 @@ int sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags) EXPORT_SYMBOL(sock_recvmsg);
/** - * kernel_recvmsg - Receive a message from a socket (kernel space) - * @sock: The socket to receive the message from - * @msg: Received message - * @vec: Input s/g array for message data - * @num: Size of input s/g array - * @size: Number of bytes to read - * @flags: Message flags (MSG_DONTWAIT, etc...) + * kernel_recvmsg - Receive a message from a socket (kernel space) + * @sock: The socket to receive the message from + * @msg: Received message + * @vec: Input s/g array for message data + * @num: Size of input s/g array + * @size: Number of bytes to read + * @flags: Message flags (MSG_DONTWAIT, etc...) * - * On return the msg structure contains the scatter/gather array passed in the - * vec argument. The array is modified so that it consists of the unfilled - * portion of the original array. + * On return the msg structure contains the scatter/gather array passed in the + * vec argument. The array is modified so that it consists of the unfilled + * portion of the original array. * - * The returned value is the total number of bytes received, or an error. + * The returned value is the total number of bytes received, or an error. */ + int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { @@ -983,6 +1048,13 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. */
+/** + * get_net_ns - increment the refcount of the network namespace + * @ns: common namespace (net) + * + * Returns the net's common namespace. + */ + struct ns_common *get_net_ns(struct ns_common *ns) { return &get_net(container_of(ns, struct net, ns))->ns; @@ -1077,6 +1149,19 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) return err; }
+/** + * sock_create_lite - creates a socket + * @family: protocol family (AF_INET, ...) + * @type: communication type (SOCK_STREAM, ...) + * @protocol: protocol (0, ...) + * @res: new socket + * + * Creates a new socket and assigns it to @res, passing through LSM. + * The new socket initialization is not complete, see kernel_accept(). + * Returns 0 or an error. On failure @res is set to %NULL. + * This function internally uses GFP_KERNEL. + */ + int sock_create_lite(int family, int type, int protocol, struct socket **res) { int err; @@ -1202,6 +1287,21 @@ int sock_wake_async(struct socket_wq *wq, int how, int band) } EXPORT_SYMBOL(sock_wake_async);
+/** + * __sock_create - creates a socket + * @net: net namespace + * @family: protocol family (AF_INET, ...) + * @type: communication type (SOCK_STREAM, ...) + * @protocol: protocol (0, ...) + * @res: new socket + * @kern: boolean for kernel space sockets + * + * Creates a new socket and assigns it to @res, passing through LSM. + * Returns 0 or an error. On failure @res is set to %NULL. @kern must + * be set to true if the socket resides in kernel space. + * This function internally uses GFP_KERNEL. + */ + int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern) { @@ -1311,12 +1411,35 @@ int __sock_create(struct net *net, int family, int type, int protocol, } EXPORT_SYMBOL(__sock_create);
+/** + * sock_create - creates a socket + * @family: protocol family (AF_INET, ...) + * @type: communication type (SOCK_STREAM, ...) + * @protocol: protocol (0, ...) + * @res: new socket + * + * A wrapper around __sock_create(). + * Returns 0 or an error. This function internally uses GFP_KERNEL. + */ + int sock_create(int family, int type, int protocol, struct socket **res) { return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0); } EXPORT_SYMBOL(sock_create);
+/** + * sock_create_kern - creates a socket (kernel space) + * @net: net namespace + * @family: protocol family (AF_INET, ...) + * @type: communication type (SOCK_STREAM, ...) + * @protocol: protocol (0, ...) + * @res: new socket + * + * A wrapper around __sock_create(). + * Returns 0 or an error. This function internally uses GFP_KERNEL. + */ + int sock_create_kern(struct net *net, int family, int type, int protocol, struct socket **res) { return __sock_create(net, family, type, protocol, res, 1); @@ -3393,18 +3516,46 @@ static long compat_sock_ioctl(struct file *file, unsigned int cmd, } #endif
+/** + * kernel_bind - bind an address to a socket (kernel space) + * @sock: socket + * @addr: address + * @addrlen: length of address + * + * Returns 0 or an error. + */ + int kernel_bind(struct socket *sock, struct sockaddr *addr, int addrlen) { return sock->ops->bind(sock, addr, addrlen); } EXPORT_SYMBOL(kernel_bind);
+/** + * kernel_listen - move socket to listening state (kernel space) + * @sock: socket + * @backlog: pending connections queue size + * + * Returns 0 or an error. + */ + int kernel_listen(struct socket *sock, int backlog) { return sock->ops->listen(sock, backlog); } EXPORT_SYMBOL(kernel_listen);
+/** + * kernel_accept - accept a connection (kernel space) + * @sock: listening socket + * @newsock: new connected socket + * @flags: flags + * + * @flags must be SOCK_CLOEXEC, SOCK_NONBLOCK or 0. + * If it fails, @newsock is guaranteed to be %NULL. + * Returns 0 or an error. + */ + int kernel_accept(struct socket *sock, struct socket **newsock, int flags) { struct sock *sk = sock->sk; @@ -3430,6 +3581,19 @@ int kernel_accept(struct socket *sock, struct socket **newsock, int flags) } EXPORT_SYMBOL(kernel_accept);
+/** + * kernel_connect - connect a socket (kernel space) + * @sock: socket + * @addr: address + * @addrlen: address length + * @flags: flags (O_NONBLOCK, ...) + * + * For datagram sockets, @addr is the addres to which datagrams are sent + * by default, and the only address from which datagrams are received. + * For stream sockets, attempts to connect to @addr. + * Returns 0 or an error code. + */ + int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, int flags) { @@ -3437,18 +3601,48 @@ int kernel_connect(struct socket *sock, struct sockaddr *addr, int addrlen, } EXPORT_SYMBOL(kernel_connect);
+/** + * kernel_getsockname - get the address which the socket is bound (kernel space) + * @sock: socket + * @addr: address holder + * + * Fills the @addr pointer with the address which the socket is bound. + * Returns 0 or an error code. + */ + int kernel_getsockname(struct socket *sock, struct sockaddr *addr) { return sock->ops->getname(sock, addr, 0); } EXPORT_SYMBOL(kernel_getsockname);
+/** + * kernel_peername - get the address which the socket is connected (kernel space) + * @sock: socket + * @addr: address holder + * + * Fills the @addr pointer with the address which the socket is connected. + * Returns 0 or an error code. + */ + int kernel_getpeername(struct socket *sock, struct sockaddr *addr) { return sock->ops->getname(sock, addr, 1); } EXPORT_SYMBOL(kernel_getpeername);
+/** + * kernel_getsockopt - get a socket option (kernel space) + * @sock: socket + * @level: API level (SOL_SOCKET, ...) + * @optname: option tag + * @optval: option value + * @optlen: option length + * + * Assigns the option length to @optlen. + * Returns 0 or an error. + */ + int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { @@ -3471,6 +3665,17 @@ int kernel_getsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(kernel_getsockopt);
+/** + * kernel_setsockopt - set a socket option (kernel space) + * @sock: socket + * @level: API level (SOL_SOCKET, ...) + * @optname: option tag + * @optval: option value + * @optlen: option length + * + * Returns 0 or an error. + */ + int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen) { @@ -3491,6 +3696,17 @@ int kernel_setsockopt(struct socket *sock, int level, int optname, } EXPORT_SYMBOL(kernel_setsockopt);
+/** + * kernel_sendpage - send a &page through a socket (kernel space) + * @sock: socket + * @page: page + * @offset: page offset + * @size: total size in bytes + * @flags: flags (MSG_DONTWAIT, ...) + * + * Returns the total amount sent in bytes or an error. + */ + int kernel_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { @@ -3501,6 +3717,18 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset, } EXPORT_SYMBOL(kernel_sendpage);
+/** + * kernel_sendpage_locked - send a &page through the locked sock (kernel space) + * @sk: sock + * @page: page + * @offset: page offset + * @size: total size in bytes + * @flags: flags (MSG_DONTWAIT, ...) + * + * Returns the total amount sent in bytes or an error. + * Caller must hold @sk. + */ + int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags) { @@ -3514,17 +3742,30 @@ int kernel_sendpage_locked(struct sock *sk, struct page *page, int offset, } EXPORT_SYMBOL(kernel_sendpage_locked);
+/** + * kernel_shutdown - shut down part of a full-duplex connection (kernel space) + * @sock: socket + * @how: connection part + * + * Returns 0 or an error. + */ + int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) { return sock->ops->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown);
-/* This routine returns the IP overhead imposed by a socket i.e. - * the length of the underlying IP header, depending on whether - * this is an IPv4 or IPv6 socket and the length from IP options turned - * on at the socket. Assumes that the caller has a lock on the socket. +/** + * kernel_sock_ip_overhead - returns the IP overhead imposed by a socket + * @sk: socket + * + * This routine returns the IP overhead imposed by a socket i.e. + * the length of the underlying IP header, depending on whether + * this is an IPv4 or IPv6 socket and the length from IP options turned + * on at the socket. Assumes that the caller has a lock on the socket. */ + u32 kernel_sock_ip_overhead(struct sock *sk) { struct inet_sock *inet;
From: Changbin Du changbin.du@gmail.com
stable inclusion from linux-4.19.196 commit 29174c883e493b612170b7874e560e006d18985a
--------------------------------
[ Upstream commit ea6932d70e223e02fea3ae20a4feff05d7c1ea9a ]
There is a panic in socket ioctl cmd SIOCGSKNS when NET_NS is not enabled. The reason is that nsfs tries to access ns->ops but the proc_ns_operations is not implemented in this case.
[7.670023] Unable to handle kernel NULL pointer dereference at virtual address 00000010 [7.670268] pgd = 32b54000 [7.670544] [00000010] *pgd=00000000 [7.671861] Internal error: Oops: 5 [#1] SMP ARM [7.672315] Modules linked in: [7.672918] CPU: 0 PID: 1 Comm: systemd Not tainted 5.13.0-rc3-00375-g6799d4f2da49 #16 [7.673309] Hardware name: Generic DT based system [7.673642] PC is at nsfs_evict+0x24/0x30 [7.674486] LR is at clear_inode+0x20/0x9c
The same to tun SIOCGSKNS command.
To fix this problem, we make get_net_ns() return -EINVAL when NET_NS is disabled. Meanwhile move it to right place net/core/net_namespace.c.
Signed-off-by: Changbin Du changbin.du@gmail.com Fixes: c62cce2caee5 ("net: add an ioctl to get a socket network namespace") Cc: Cong Wang xiyou.wangcong@gmail.com Cc: Jakub Kicinski kuba@kernel.org Cc: David Laight David.Laight@ACULAB.COM Cc: Christian Brauner christian.brauner@ubuntu.com Suggested-by: Jakub Kicinski kuba@kernel.org Acked-by: Christian Brauner christian.brauner@ubuntu.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/socket.h | 2 -- include/net/net_namespace.h | 7 +++++++ net/core/net_namespace.c | 12 ++++++++++++ net/socket.c | 13 ------------- 4 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/include/linux/socket.h b/include/linux/socket.h index 2b06d8ccbd168..c37812d9771ca 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -409,6 +409,4 @@ extern int __sys_getpeername(int fd, struct sockaddr __user *usockaddr, extern int __sys_socketpair(int family, int type, int protocol, int __user *usockvec); extern int __sys_shutdown(int fd, int how); - -extern struct ns_common *get_net_ns(struct ns_common *ns); #endif /* _LINUX_SOCKET_H */ diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 5007eaba207d5..bc88ac6c2e1d7 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -175,6 +175,8 @@ struct net *copy_net_ns(unsigned long flags, struct user_namespace *user_ns, void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid);
void net_ns_barrier(void); + +struct ns_common *get_net_ns(struct ns_common *ns); #else /* CONFIG_NET_NS */ #include <linux/sched.h> #include <linux/nsproxy.h> @@ -194,6 +196,11 @@ static inline void net_ns_get_ownership(const struct net *net, }
static inline void net_ns_barrier(void) {} + +static inline struct ns_common *get_net_ns(struct ns_common *ns) +{ + return ERR_PTR(-EINVAL); +} #endif /* CONFIG_NET_NS */
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index c60123dff8039..939d8a31eb82a 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -598,6 +598,18 @@ void __put_net(struct net *net) } EXPORT_SYMBOL_GPL(__put_net);
+/** + * get_net_ns - increment the refcount of the network namespace + * @ns: common namespace (net) + * + * Returns the net's common namespace. + */ +struct ns_common *get_net_ns(struct ns_common *ns) +{ + return &get_net(container_of(ns, struct net, ns))->ns; +} +EXPORT_SYMBOL_GPL(get_net_ns); + struct net *get_net_ns_by_fd(int fd) { struct file *file; diff --git a/net/socket.c b/net/socket.c index bf943e6d47fe8..9e4ef5342dd81 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1048,19 +1048,6 @@ static long sock_do_ioctl(struct net *net, struct socket *sock, * what to do with it - that's up to the protocol still. */
-/** - * get_net_ns - increment the refcount of the network namespace - * @ns: common namespace (net) - * - * Returns the net's common namespace. - */ - -struct ns_common *get_net_ns(struct ns_common *ns) -{ - return &get_net(container_of(ns, struct net, ns))->ns; -} -EXPORT_SYMBOL_GPL(get_net_ns); - static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct socket *sock;
From: Chengyang Fan cy.fan@huawei.com
stable inclusion from linux-4.19.196 commit 1e28018b5c83d5073f74a6fb72eabe8370b2f501
--------------------------------
[ Upstream commit d8e2973029b8b2ce477b564824431f3385c77083 ]
BUG: memory leak unreferenced object 0xffff888101bc4c00 (size 32): comm "syz-executor527", pid 360, jiffies 4294807421 (age 19.329s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 01 00 00 00 00 00 00 00 ac 14 14 bb 00 00 02 00 ................ backtrace: [<00000000f17c5244>] kmalloc include/linux/slab.h:558 [inline] [<00000000f17c5244>] kzalloc include/linux/slab.h:688 [inline] [<00000000f17c5244>] ip_mc_add1_src net/ipv4/igmp.c:1971 [inline] [<00000000f17c5244>] ip_mc_add_src+0x95f/0xdb0 net/ipv4/igmp.c:2095 [<000000001cb99709>] ip_mc_source+0x84c/0xea0 net/ipv4/igmp.c:2416 [<0000000052cf19ed>] do_ip_setsockopt net/ipv4/ip_sockglue.c:1294 [inline] [<0000000052cf19ed>] ip_setsockopt+0x114b/0x30c0 net/ipv4/ip_sockglue.c:1423 [<00000000477edfbc>] raw_setsockopt+0x13d/0x170 net/ipv4/raw.c:857 [<00000000e75ca9bb>] __sys_setsockopt+0x158/0x270 net/socket.c:2117 [<00000000bdb993a8>] __do_sys_setsockopt net/socket.c:2128 [inline] [<00000000bdb993a8>] __se_sys_setsockopt net/socket.c:2125 [inline] [<00000000bdb993a8>] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2125 [<000000006a1ffdbd>] do_syscall_64+0x40/0x80 arch/x86/entry/common.c:47 [<00000000b11467c4>] entry_SYSCALL_64_after_hwframe+0x44/0xae
In commit 24803f38a5c0 ("igmp: do not remove igmp souce list info when set link down"), the ip_mc_clear_src() in ip_mc_destroy_dev() was removed, because it was also called in igmpv3_clear_delrec().
Rough callgraph:
inetdev_destroy -> ip_mc_destroy_dev -> igmpv3_clear_delrec -> ip_mc_clear_src -> RCU_INIT_POINTER(dev->ip_ptr, NULL)
However, ip_mc_clear_src() called in igmpv3_clear_delrec() doesn't release in_dev->mc_list->sources. And RCU_INIT_POINTER() assigns the NULL to dev->ip_ptr. As a result, in_dev cannot be obtained through inetdev_by_index() and then in_dev->mc_list->sources cannot be released by ip_mc_del1_src() in the sock_close. Rough call sequence goes like:
sock_close -> __sock_release -> inet_release -> ip_mc_drop_socket -> inetdev_by_index -> ip_mc_leave_src -> ip_mc_del_src -> ip_mc_del1_src
So we still need to call ip_mc_clear_src() in ip_mc_destroy_dev() to free in_dev->mc_list->sources.
Fixes: 24803f38a5c0 ("igmp: do not remove igmp souce list info ...") Reported-by: Hulk Robot hulkci@huawei.com Signed-off-by: Chengyang Fan cy.fan@huawei.com Acked-by: Hangbin Liu liuhangbin@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/igmp.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 228315321fd09..1a0953b2b1396 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1816,6 +1816,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) while ((i = rtnl_dereference(in_dev->mc_list)) != NULL) { in_dev->mc_list = i->next_rcu; in_dev->mc_count--; + ip_mc_clear_src(i); ip_ma_put(i); } }
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.196 commit 0693d384ff0de10722e4950333adfc299086b472
--------------------------------
[ Upstream commit a494bd642d9120648b06bb7d28ce6d05f55a7819 ]
While unix_may_send(sk, osk) is called while osk is locked, it appears unix_release_sock() can overwrite unix_peer() after this lock has been released, making KCSAN unhappy.
Changing unix_release_sock() to access/change unix_peer() before lock is released should fix this issue.
BUG: KCSAN: data-race in unix_dgram_sendmsg / unix_release_sock
write to 0xffff88810465a338 of 8 bytes by task 20852 on cpu 1: unix_release_sock+0x4ed/0x6e0 net/unix/af_unix.c:558 unix_release+0x2f/0x50 net/unix/af_unix.c:859 __sock_release net/socket.c:599 [inline] sock_close+0x6c/0x150 net/socket.c:1258 __fput+0x25b/0x4e0 fs/file_table.c:280 ____fput+0x11/0x20 fs/file_table.c:313 task_work_run+0xae/0x130 kernel/task_work.c:164 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:175 [inline] exit_to_user_mode_prepare+0x156/0x190 kernel/entry/common.c:209 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x20/0x40 kernel/entry/common.c:302 do_syscall_64+0x56/0x90 arch/x86/entry/common.c:57 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff88810465a338 of 8 bytes by task 20888 on cpu 0: unix_may_send net/unix/af_unix.c:189 [inline] unix_dgram_sendmsg+0x923/0x1610 net/unix/af_unix.c:1712 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0xffff888167905400 -> 0x0000000000000000
Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 20888 Comm: syz-executor.0 Not tainted 5.13.0-rc5-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/unix/af_unix.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index b09f0b567db51..337c4797ab167 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -542,12 +542,14 @@ static void unix_release_sock(struct sock *sk, int embrion) u->path.mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; + + skpair = unix_peer(sk); + unix_peer(sk) = NULL; + unix_state_unlock(sk);
wake_up_interruptible_all(&u->peer_wait);
- skpair = unix_peer(sk); - if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { unix_state_lock(skpair); @@ -562,7 +564,6 @@ static void unix_release_sock(struct sock *sk, int embrion)
unix_dgram_peer_wake_disconnect(sk, skpair); sock_put(skpair); /* It may now die */ - unix_peer(sk) = NULL; }
/* Try to flush out this socket. Throw out buffers at least */
From: Toke Høiland-Jørgensen toke@redhat.com
stable inclusion from linux-4.19.196 commit 3bf50dc858f62b58f0aea9f62322e00d22115260
--------------------------------
[ Upstream commit 321827477360934dc040e9d3c626bf1de6c3ab3c ]
When constructing ICMP response messages, the kernel will try to pick a suitable source address for the outgoing packet. However, if no IPv4 addresses are configured on the system at all, this will fail and we end up producing an ICMP message with a source address of 0.0.0.0. This can happen on a box routing IPv4 traffic via v6 nexthops, for instance.
Since 0.0.0.0 is not generally routable on the internet, there's a good chance that such ICMP messages will never make it back to the sender of the original packet that the ICMP message was sent in response to. This, in turn, can create connectivity and PMTUd problems for senders. Fortunately, RFC7600 reserves a dummy address to be used as a source for ICMP messages (192.0.0.8/32), so let's teach the kernel to substitute that address as a last resort if the regular source address selection procedure fails.
Below is a quick example reproducing this issue with network namespaces:
ip netns add ns0 ip l add type veth peer netns ns0 ip l set dev veth0 up ip a add 10.0.0.1/24 dev veth0 ip a add fc00:dead:cafe:42::1/64 dev veth0 ip r add 10.1.0.0/24 via inet6 fc00:dead:cafe:42::2 ip -n ns0 l set dev veth0 up ip -n ns0 a add fc00:dead:cafe:42::2/64 dev veth0 ip -n ns0 r add 10.0.0.0/24 via inet6 fc00:dead:cafe:42::1 ip netns exec ns0 sysctl -w net.ipv4.icmp_ratelimit=0 ip netns exec ns0 sysctl -w net.ipv4.ip_forward=1 tcpdump -tpni veth0 -c 2 icmp & ping -w 1 10.1.0.1 > /dev/null tcpdump: verbose output suppressed, use -v[v]... for full protocol decode listening on veth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 29, seq 1, length 64 IP 0.0.0.0 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92 2 packets captured 2 packets received by filter 0 packets dropped by kernel
With this patch the above capture changes to: IP 10.0.0.1 > 10.1.0.1: ICMP echo request, id 31127, seq 1, length 64 IP 192.0.0.8 > 10.0.0.1: ICMP net 10.1.0.1 unreachable, length 92
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Juliusz Chroboczek jch@irif.fr Reviewed-by: David Ahern dsahern@kernel.org Signed-off-by: Toke Høiland-Jørgensen toke@redhat.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/uapi/linux/in.h | 3 +++ net/ipv4/icmp.c | 7 +++++++ 2 files changed, 10 insertions(+)
diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index 48e8a225b985a..2a66ab49f14dd 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -280,6 +280,9 @@ struct sockaddr_in { /* Address indicating an error return. */ #define INADDR_NONE ((unsigned long int) 0xffffffff)
+/* Dummy address for src of ICMP replies if no real address is set (RFC7600). */ +#define INADDR_DUMMY ((unsigned long int) 0xc0000008) + /* Network number for local host loopback. */ #define IN_LOOPBACKNET 127
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 609a73fe6fbbe..88d676beda399 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -746,6 +746,13 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, icmp_param.data_len = room; icmp_param.head_len = sizeof(struct icmphdr);
+ /* if we don't have a source address at this point, fall back to the + * dummy address instead of sending out a packet with a source address + * of 0.0.0.0 + */ + if (!fl4.saddr) + fl4.saddr = htonl(INADDR_DUMMY); + icmp_push_reply(&icmp_param, &fl4, &ipc, &rt); ende: ip_rt_put(rt);
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.196 commit ae1fe292b314fbc1db9b1897d1f354c8f2e1931a
--------------------------------
commit 85550c83da421fb12dc1816c45012e1e638d2b38 upstream.
The saved_cmdlines is used to map pids to the task name, such that the output of the tracing does not just show pids, but also gives a human readable name for the task.
If the name is not mapped, the output looks like this:
<...>-1316 [005] ...2 132.044039: ...
Instead of this:
gnome-shell-1316 [005] ...2 132.044039: ...
The names are updated when tracing is running, but are skipped if tracing is stopped. Unfortunately, this stops the recording of the names if the top level tracer is stopped, and not if there's other tracers active.
The recording of a name only happens when a new event is written into a ring buffer, so there is no need to test if tracing is on or not. If tracing is off, then no event is written and no need to test if tracing is off or not.
Remove the check, as it hides the names of tasks for events in the instance buffers.
Cc: stable@vger.kernel.org Fixes: 7ffbd48d5cab2 ("tracing: Cache comms only after an event occurred") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace.c | 2 -- 1 file changed, 2 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d3cea9fb874c..9b03b8cff9285 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2029,8 +2029,6 @@ static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; - if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) - return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false;
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.196 commit 04e7a7c95027e72bba8973e1c4b9b4a1b29f7a0c
--------------------------------
commit 4fdd595e4f9a1ff6d93ec702eaecae451cfc6591 upstream.
A while ago, when the "trace" file was opened, tracing was stopped, and code was added to stop recording the comms to saved_cmdlines, for mapping of the pids to the task name.
Code has been added that only records the comm if a trace event occurred, and there's no reason to not trace it if the trace file is opened.
Cc: stable@vger.kernel.org Fixes: 7ffbd48d5cab2 ("tracing: Cache comms only after an event occurred") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace.c | 9 --------- 1 file changed, 9 deletions(-)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9b03b8cff9285..475577163c0e9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1741,9 +1741,6 @@ struct saved_cmdlines_buffer { }; static struct saved_cmdlines_buffer *savedcmd;
-/* temporary disable recording */ -static atomic_t trace_record_taskinfo_disabled __read_mostly; - static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; @@ -3254,9 +3251,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif
- if (!iter->snapshot) - atomic_inc(&trace_record_taskinfo_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -3299,9 +3293,6 @@ static void s_stop(struct seq_file *m, void *p) return; #endif
- if (!iter->snapshot) - atomic_dec(&trace_record_taskinfo_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); }
From: "Steven Rostedt (VMware)" rostedt@goodmis.org
stable inclusion from linux-4.19.196 commit 79197fb7aea98e5b5084c9c1489064a4b4d51494
--------------------------------
commit 89529d8b8f8daf92d9979382b8d2eb39966846ea upstream.
The trace_clock_global() tries to make sure the events between CPUs is somewhat in order. A global value is used and updated by the latest read of a clock. If one CPU is ahead by a little, and is read by another CPU, a lock is taken, and if the timestamp of the other CPU is behind, it will simply use the other CPUs timestamp.
The lock is also only taken with a "trylock" due to tracing, and strange recursions can happen. The lock is not taken at all in NMI context.
In the case where the lock is not able to be taken, the non synced timestamp is returned. But it will not be less than the saved global timestamp.
The problem arises because when the time goes "backwards" the time returned is the saved timestamp plus 1. If the lock is not taken, and the plus one to the timestamp is returned, there's a small race that can cause the time to go backwards!
CPU0 CPU1 ---- ---- trace_clock_global() { ts = clock() [ 1000 ] trylock(clock_lock) [ success ] global_ts = ts; [ 1000 ]
<interrupted by NMI> trace_clock_global() { ts = clock() [ 999 ] if (ts < global_ts) ts = global_ts + 1 [ 1001 ]
trylock(clock_lock) [ fail ]
return ts [ 1001] } unlock(clock_lock); return ts; [ 1000 ] }
trace_clock_global() { ts = clock() [ 1000 ] if (ts < global_ts) [ false 1000 == 1000 ]
trylock(clock_lock) [ success ] global_ts = ts; [ 1000 ] unlock(clock_lock)
return ts; [ 1000 ] }
The above case shows to reads of trace_clock_global() on the same CPU, but the second read returns one less than the first read. That is, time when backwards, and this is not what is allowed by trace_clock_global().
This was triggered by heavy tracing and the ring buffer checker that tests for the clock going backwards:
Ring buffer clock went backwards: 20613921464 -> 20613921463 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 0 at kernel/trace/ring_buffer.c:3412 check_buffer+0x1b9/0x1c0 Modules linked in: [..] [CPU: 2]TIME DOES NOT MATCH expected:20620711698 actual:20620711697 delta:6790234 before:20613921463 after:20613921463 [20613915818] PAGE TIME STAMP [20613915818] delta:0 [20613915819] delta:1 [20613916035] delta:216 [20613916465] delta:430 [20613916575] delta:110 [20613916749] delta:174 [20613917248] delta:499 [20613917333] delta:85 [20613917775] delta:442 [20613917921] delta:146 [20613918321] delta:400 [20613918568] delta:247 [20613918768] delta:200 [20613919306] delta:538 [20613919353] delta:47 [20613919980] delta:627 [20613920296] delta:316 [20613920571] delta:275 [20613920862] delta:291 [20613921152] delta:290 [20613921464] delta:312 [20613921464] delta:0 TIME EXTEND [20613921464] delta:0
This happened more than once, and always for an off by one result. It also started happening after commit aafe104aa9096 was added.
Cc: stable@vger.kernel.org Fixes: aafe104aa9096 ("tracing: Restructure trace_clock_global() to never block") Signed-off-by: Steven Rostedt (VMware) rostedt@goodmis.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/trace/trace_clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index c1637f90c8a38..4702efb00ff21 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -115,9 +115,9 @@ u64 notrace trace_clock_global(void) prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu);
- /* Make sure that now is always greater than prev_time */ + /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time;
/* * If in an NMI context then dont risk lockups and simply return @@ -131,7 +131,7 @@ u64 notrace trace_clock_global(void) /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time;
trace_clock_struct.prev_time = now;
From: Nikolay Aleksandrov nikolay@nvidia.com
stable inclusion from linux-4.19.196 commit 24a6e55f17aa123bc1fc54b7d3c410b41bc16530
--------------------------------
commit 58e2071742e38f29f051b709a5cca014ba51166f upstream.
This patch fixes a tunnel_dst null pointer dereference due to lockless access in the tunnel egress path. When deleting a vlan tunnel the tunnel_dst pointer is set to NULL without waiting a grace period (i.e. while it's still usable) and packets egressing are dereferencing it without checking. Use READ/WRITE_ONCE to annotate the lockless use of tunnel_id, use RCU for accessing tunnel_dst and make sure it is read only once and checked in the egress path. The dst is already properly RCU protected so we don't need to do anything fancy than to make sure tunnel_id and tunnel_dst are read only once and checked in the egress path.
Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov nikolay@nvidia.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/bridge/br_private.h | 4 ++-- net/bridge/br_vlan_tunnel.c | 38 +++++++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2c9dbd437306c..23fed55942fa5 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -100,8 +100,8 @@ struct br_vlan_stats { };
struct br_tunnel_info { - __be64 tunnel_id; - struct metadata_dst *tunnel_dst; + __be64 tunnel_id; + struct metadata_dst __rcu *tunnel_dst; };
/** diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 6d2c4eed2dc89..4d5100677c685 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -46,26 +46,33 @@ static struct net_bridge_vlan *br_vlan_tunnel_lookup(struct rhashtable *tbl, br_vlan_tunnel_rht_params); }
+static void vlan_tunnel_info_release(struct net_bridge_vlan *vlan) +{ + struct metadata_dst *tdst = rtnl_dereference(vlan->tinfo.tunnel_dst); + + WRITE_ONCE(vlan->tinfo.tunnel_id, 0); + RCU_INIT_POINTER(vlan->tinfo.tunnel_dst, NULL); + dst_release(&tdst->dst); +} + void vlan_tunnel_info_del(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan) { - if (!vlan->tinfo.tunnel_dst) + if (!rcu_access_pointer(vlan->tinfo.tunnel_dst)) return; rhashtable_remove_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); - vlan->tinfo.tunnel_id = 0; - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; + vlan_tunnel_info_release(vlan); }
static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, struct net_bridge_vlan *vlan, u32 tun_id) { - struct metadata_dst *metadata = NULL; + struct metadata_dst *metadata = rtnl_dereference(vlan->tinfo.tunnel_dst); __be64 key = key32_to_tunnel_id(cpu_to_be32(tun_id)); int err;
- if (vlan->tinfo.tunnel_dst) + if (metadata) return -EEXIST;
metadata = __ip_tun_set_dst(0, 0, 0, 0, 0, TUNNEL_KEY, @@ -74,8 +81,8 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg, return -EINVAL;
metadata->u.tun_info.mode |= IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_BRIDGE; - vlan->tinfo.tunnel_dst = metadata; - vlan->tinfo.tunnel_id = key; + rcu_assign_pointer(vlan->tinfo.tunnel_dst, metadata); + WRITE_ONCE(vlan->tinfo.tunnel_id, key);
err = rhashtable_lookup_insert_fast(&vg->tunnel_hash, &vlan->tnode, br_vlan_tunnel_rht_params); @@ -84,9 +91,7 @@ static int __vlan_tunnel_info_add(struct net_bridge_vlan_group *vg,
return 0; out: - dst_release(&vlan->tinfo.tunnel_dst->dst); - vlan->tinfo.tunnel_dst = NULL; - vlan->tinfo.tunnel_id = 0; + vlan_tunnel_info_release(vlan);
return err; } @@ -186,12 +191,15 @@ int br_handle_ingress_vlan_tunnel(struct sk_buff *skb, int br_handle_egress_vlan_tunnel(struct sk_buff *skb, struct net_bridge_vlan *vlan) { + struct metadata_dst *tunnel_dst; + __be64 tunnel_id; int err;
- if (!vlan || !vlan->tinfo.tunnel_id) + if (!vlan) return 0;
- if (unlikely(!skb_vlan_tag_present(skb))) + tunnel_id = READ_ONCE(vlan->tinfo.tunnel_id); + if (!tunnel_id || unlikely(!skb_vlan_tag_present(skb))) return 0;
skb_dst_drop(skb); @@ -199,7 +207,9 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, if (err) return err;
- skb_dst_set(skb, dst_clone(&vlan->tinfo.tunnel_dst->dst)); + tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); + if (tunnel_dst) + skb_dst_set(skb, dst_clone(&tunnel_dst->dst));
return 0; }
From: Nikolay Aleksandrov nikolay@nvidia.com
stable inclusion from linux-4.19.196 commit 84fc1c944e45ab317e2e70a0e7f76fa2a5e43b6e
--------------------------------
commit cfc579f9d89af4ada58c69b03bcaa4887840f3b3 upstream.
The egress tunnel code uses dst_clone() and directly sets the result which is wrong because the entry might have 0 refcnt or be already deleted, causing number of problems. It also triggers the WARN_ON() in dst_hold()[1] when a refcnt couldn't be taken. Fix it by using dst_hold_safe() and checking if a reference was actually taken before setting the dst.
[1] dmesg WARN_ON log and following refcnt errors WARNING: CPU: 5 PID: 38 at include/net/dst.h:230 br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Modules linked in: 8021q garp mrp bridge stp llc bonding ipv6 virtio_net CPU: 5 PID: 38 Comm: ksoftirqd/5 Kdump: loaded Tainted: G W 5.13.0-rc3+ #360 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-1.fc33 04/01/2014 RIP: 0010:br_handle_egress_vlan_tunnel+0x10b/0x134 [bridge] Code: e8 85 bc 01 e1 45 84 f6 74 90 45 31 f6 85 db 48 c7 c7 a0 02 19 a0 41 0f 94 c6 31 c9 31 d2 44 89 f6 e8 64 bc 01 e1 85 db 75 02 <0f> 0b 31 c9 31 d2 44 89 f6 48 c7 c7 70 02 19 a0 e8 4b bc 01 e1 49 RSP: 0018:ffff8881003d39e8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000001 RDI: ffffffffa01902a0 RBP: ffff8881040c6700 R08: 0000000000000000 R09: 0000000000000001 R10: 2ce93d0054fe0d00 R11: 54fe0d00000e0000 R12: ffff888109515000 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000401 FS: 0000000000000000(0000) GS:ffff88822bf40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f42ba70f030 CR3: 0000000109926000 CR4: 00000000000006e0 Call Trace: br_handle_vlan+0xbc/0xca [bridge] __br_forward+0x23/0x164 [bridge] deliver_clone+0x41/0x48 [bridge] br_handle_frame_finish+0x36f/0x3aa [bridge] ? skb_dst+0x2e/0x38 [bridge] ? br_handle_ingress_vlan_tunnel+0x3e/0x1c8 [bridge] ? br_handle_frame_finish+0x3aa/0x3aa [bridge] br_handle_frame+0x2c3/0x377 [bridge] ? __skb_pull+0x33/0x51 ? vlan_do_receive+0x4f/0x36a ? br_handle_frame_finish+0x3aa/0x3aa [bridge] __netif_receive_skb_core+0x539/0x7c6 ? __list_del_entry_valid+0x16e/0x1c2 __netif_receive_skb_list_core+0x6d/0xd6 netif_receive_skb_list_internal+0x1d9/0x1fa gro_normal_list+0x22/0x3e dev_gro_receive+0x55b/0x600 ? detach_buf_split+0x58/0x140 napi_gro_receive+0x94/0x12e virtnet_poll+0x15d/0x315 [virtio_net] __napi_poll+0x2c/0x1c9 net_rx_action+0xe6/0x1fb __do_softirq+0x115/0x2d8 run_ksoftirqd+0x18/0x20 smpboot_thread_fn+0x183/0x19c ? smpboot_unregister_percpu_thread+0x66/0x66 kthread+0x10a/0x10f ? kthread_mod_delayed_work+0xb6/0xb6 ret_from_fork+0x22/0x30 ---[ end trace 49f61b07f775fd2b ]--- dst_release: dst:00000000c02d677a refcnt:-1 dst_release underflow
Cc: stable@vger.kernel.org Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Signed-off-by: Nikolay Aleksandrov nikolay@nvidia.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/bridge/br_vlan_tunnel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/bridge/br_vlan_tunnel.c b/net/bridge/br_vlan_tunnel.c index 4d5100677c685..adb6845ceba46 100644 --- a/net/bridge/br_vlan_tunnel.c +++ b/net/bridge/br_vlan_tunnel.c @@ -208,8 +208,8 @@ int br_handle_egress_vlan_tunnel(struct sk_buff *skb, return err;
tunnel_dst = rcu_dereference(vlan->tinfo.tunnel_dst); - if (tunnel_dst) - skb_dst_set(skb, dst_clone(&tunnel_dst->dst)); + if (tunnel_dst && dst_hold_safe(&tunnel_dst->dst)) + skb_dst_set(skb, &tunnel_dst->dst);
return 0; }
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.196 commit 7f7e23df8509e072593200400a4b094cc44376d2
--------------------------------
commit aa6dd211e4b1dde9d5dc25d699d35f789ae7eeba upstream.
In commit 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") I used a very small hash table that could be abused by patient attackers to reveal sensitive information.
Switch to a dynamic sizing, depending on RAM size.
Typical big hosts will now use 128x more storage (2 MB) to get a similar increase in security and reduction of hash collisions.
As a bonus, use of alloc_large_system_hash() spreads allocated memory among all NUMA nodes.
Fixes: 73f156a6e8c1 ("inetpeer: get rid of ip_id_count") Reported-by: Amit Klein aksecurity@gmail.com Signed-off-by: Eric Dumazet edumazet@google.com Cc: Willy Tarreau w@1wt.eu Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/route.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-)
diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 106c970020883..958df3427c34f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -70,6 +70,7 @@ #include <linux/types.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/bootmem.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/sockios.h> @@ -470,8 +471,10 @@ static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr) __ipv4_confirm_neigh(dev, *(__force u32 *)pkey); }
-#define IP_IDENTS_SZ 2048u - +/* Hash tables of size 2048..262144 depending on RAM size. + * Each bucket uses 8 bytes. + */ +static u32 ip_idents_mask __read_mostly; static atomic_t *ip_idents __read_mostly; static u32 *ip_tstamps __read_mostly;
@@ -481,12 +484,16 @@ static u32 *ip_tstamps __read_mostly; */ u32 ip_idents_reserve(u32 hash, int segs) { - u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ; - atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ; - u32 old = READ_ONCE(*p_tstamp); - u32 now = (u32)jiffies; + u32 bucket, old, now = (u32)jiffies; + atomic_t *p_id; + u32 *p_tstamp; u32 delta = 0;
+ bucket = hash & ip_idents_mask; + p_tstamp = ip_tstamps + bucket; + p_id = ip_idents + bucket; + old = READ_ONCE(*p_tstamp); + if (old != now && cmpxchg(p_tstamp, old, now) == old) delta = prandom_u32_max(now - old);
@@ -3192,18 +3199,25 @@ struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
int __init ip_rt_init(void) { + void *idents_hash; int cpu;
- ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), - GFP_KERNEL); - if (!ip_idents) - panic("IP: failed to allocate ip_idents\n"); + /* For modern hosts, this will use 2 MB of memory */ + idents_hash = alloc_large_system_hash("IP idents", + sizeof(*ip_idents) + sizeof(*ip_tstamps), + 0, + 16, /* one bucket per 64 KB */ + HASH_ZERO, + NULL, + &ip_idents_mask, + 2048, + 256*1024); + + ip_idents = idents_hash;
- prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); + prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
- ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); - if (!ip_tstamps) - panic("IP: failed to allocate ip_tstamps\n"); + ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
for_each_possible_cpu(cpu) { struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.196 commit 55cfd22f9b5a051fa815d6987825e00aa15427af
--------------------------------
commit efa165504943f2128d50f63de0c02faf6dcceb0d upstream.
If access_ok() or fpregs_soft_set() fails in __fpu__restore_sig() then the function just returns but does not clear the FPU state as it does for all other fatal failures.
Clear the FPU state for these failures as well.
Fixes: 72a671ced66d ("x86, fpu: Unify signal handling code paths for x86 and x86_64 kernels") Signed-off-by: Thomas Gleixner tglx@linutronix.de Signed-off-by: Borislav Petkov bp@suse.de Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/87mtryyhhz.ffs@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: arch/x86/kernel/fpu/signal.c Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/kernel/fpu/signal.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-)
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index f6a1d299627c5..3d9f4b7b4121c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -272,6 +272,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) int state_size = fpu_kernel_xstate_size; u64 xfeatures = 0; int fx_only = 0; + int ret = 0;
ia32_fxstate &= (IS_ENABLED(CONFIG_X86_32) || IS_ENABLED(CONFIG_IA32_EMULATION)); @@ -281,15 +282,21 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) return 0; }
- if (!access_ok(buf, size)) - return -EACCES; + if (!access_ok(buf, size)) { + ret = -EACCES; + goto out_err; + }
fpu__initialize(fpu);
- if (!static_cpu_has(X86_FEATURE_FPU)) - return fpregs_soft_set(current, NULL, - 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + if (!static_cpu_has(X86_FEATURE_FPU)) { + ret = fpregs_soft_set(current, NULL, + 0, sizeof(struct user_i387_ia32_struct), + NULL, buf) != 0; + if (ret) + goto out_err; + return 0; + }
if (use_xsave()) { struct _fpx_sw_bytes fx_sw_user; @@ -349,6 +356,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__restore(fpu); local_bh_enable();
+ /* Failure is already handled */ return err; } else { /* @@ -356,13 +364,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) * state to the registers directly (with exceptions handled). */ user_fpu_begin(); - if (copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) { - fpu__clear(fpu); - return -1; - } + if (!copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only)) + return 0; + ret = -1; }
- return 0; +out_err: + fpu__clear(fpu); + return ret; }
static inline int xstate_sigframe_size(void)
From: Austin Kim austindh.kim@gmail.com
stable inclusion from linux-4.19.196 commit 8b0f8cf5b02df6447974eea1a456b9f642fa1241
--------------------------------
[ Upstream commit 80ec82e3d2c1fab42eeb730aaa7985494a963d3f ]
Several ethtool functions leave heap uncleared (potentially) by drivers. This will leave the unused portion of heap unchanged and might copy the full contents back to userspace.
Signed-off-by: Austin Kim austindh.kim@gmail.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/ethtool.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 83028017c26dd..4db9512feba83 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1594,7 +1594,7 @@ static int ethtool_get_any_eeprom(struct net_device *dev, void __user *useraddr, if (eeprom.offset + eeprom.len > total_len) return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER); + data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM;
@@ -1659,7 +1659,7 @@ static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) return -EINVAL;
- data = kmalloc(PAGE_SIZE, GFP_USER); + data = kzalloc(PAGE_SIZE, GFP_USER); if (!data) return -ENOMEM;
@@ -1840,7 +1840,7 @@ static int ethtool_self_test(struct net_device *dev, char __user *useraddr) return -EFAULT;
test.len = test_len; - data = kmalloc_array(test_len, sizeof(u64), GFP_USER); + data = kcalloc(test_len, sizeof(u64), GFP_USER); if (!data) return -ENOMEM;
@@ -2372,7 +2372,7 @@ static int ethtool_get_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); + data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; ret = ops->get_tunable(dev, &tuna, data); @@ -2552,7 +2552,7 @@ static int get_phy_tunable(struct net_device *dev, void __user *useraddr) ret = ethtool_phy_tunable_valid(&tuna); if (ret) return ret; - data = kmalloc(tuna.len, GFP_USER); + data = kzalloc(tuna.len, GFP_USER); if (!data) return -ENOMEM; mutex_lock(&phydev->lock);
From: Zheng Yongjun zhengyongjun3@huawei.com
stable inclusion from linux-4.19.196 commit 1d86498c252e6b680710a31c3a003491477dd50d
--------------------------------
[ Upstream commit 9d44fa3e50cc91691896934d106c86e4027e61ca ]
Function 'ping_queue_rcv_skb' not always return success, which will also return fail. If not check the wrong return value of it, lead to function `ping_rcv` return success.
Signed-off-by: Zheng Yongjun zhengyongjun3@huawei.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/ping.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index c59144502ee88..862744c285482 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -968,6 +968,7 @@ bool ping_rcv(struct sk_buff *skb) struct sock *sk; struct net *net = dev_net(skb->dev); struct icmphdr *icmph = icmp_hdr(skb); + bool rc = false;
/* We assume the packet has already been checked by icmp_rcv */
@@ -982,14 +983,15 @@ bool ping_rcv(struct sk_buff *skb) struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
pr_debug("rcv on socket %p\n", sk); - if (skb2) - ping_queue_rcv_skb(sk, skb2); + if (skb2 && !ping_queue_rcv_skb(sk, skb2)) + rc = true; sock_put(sk); - return true; } - pr_debug("no socket, dropping\n");
- return false; + if (!rc) + pr_debug("no socket, dropping\n"); + + return rc; } EXPORT_SYMBOL_GPL(ping_rcv);
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.196 commit a501d3006839162b34a64b7641027278c419235d
--------------------------------
[ Upstream commit b71eaed8c04f72a919a9c44e83e4ee254e69e7f3 ]
UDP sendmsg() path can be lockless, it is possible for another thread to re-connect an change sk->sk_txhash under us.
There is no serious impact, but we can use READ_ONCE()/WRITE_ONCE() pair to document the race.
BUG: KCSAN: data-race in __ip4_datagram_connect / skb_set_owner_w
write to 0xffff88813397920c of 4 bytes by task 30997 on cpu 1: sk_set_txhash include/net/sock.h:1937 [inline] __ip4_datagram_connect+0x69e/0x710 net/ipv4/datagram.c:75 __ip6_datagram_connect+0x551/0x840 net/ipv6/datagram.c:189 ip6_datagram_connect+0x2a/0x40 net/ipv6/datagram.c:272 inet_dgram_connect+0xfd/0x180 net/ipv4/af_inet.c:580 __sys_connect_file net/socket.c:1837 [inline] __sys_connect+0x245/0x280 net/socket.c:1854 __do_sys_connect net/socket.c:1864 [inline] __se_sys_connect net/socket.c:1861 [inline] __x64_sys_connect+0x3d/0x50 net/socket.c:1861 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff88813397920c of 4 bytes by task 31039 on cpu 0: skb_set_hash_from_sk include/net/sock.h:2211 [inline] skb_set_owner_w+0x118/0x220 net/core/sock.c:2101 sock_alloc_send_pskb+0x452/0x4e0 net/core/sock.c:2359 sock_alloc_send_skb+0x2d/0x40 net/core/sock.c:2373 __ip6_append_data+0x1743/0x21a0 net/ipv6/ip6_output.c:1621 ip6_make_skb+0x258/0x420 net/ipv6/ip6_output.c:1983 udpv6_sendmsg+0x160a/0x16b0 net/ipv6/udp.c:1527 inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:642 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2490 __do_sys_sendmmsg net/socket.c:2519 [inline] __se_sys_sendmmsg net/socket.c:2516 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2516 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0xbca3c43d -> 0xfdb309e0
Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 31039 Comm: syz-executor.2 Not tainted 5.13.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/sock.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/include/net/sock.h b/include/net/sock.h index 0a26b42a7b2f4..920094bb75920 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1867,7 +1867,8 @@ static inline u32 net_tx_rndhash(void)
static inline void sk_set_txhash(struct sock *sk) { - sk->sk_txhash = net_tx_rndhash(); + /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ + WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); }
static inline void sk_rethink_txhash(struct sock *sk) @@ -2138,9 +2139,12 @@ static inline void sock_poll_wait(struct file *filp, struct socket *sock,
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { - if (sk->sk_txhash) { + /* This pairs with WRITE_ONCE() in sk_set_txhash() */ + u32 txhash = READ_ONCE(sk->sk_txhash); + + if (txhash) { skb->l4_hash = 1; - skb->hash = sk->sk_txhash; + skb->hash = txhash; } }
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.196 commit 5079679471fb8535637e6c86609488eff1aac35e
--------------------------------
[ Upstream commit c7d2ef5dd4b03ed0ee1d13bc0c55f9cf62d49bd6 ]
tpacket_snd(), packet_snd(), packet_getname() and packet_seq_show() can read po->num without holding a lock. This means other threads can change po->num at the same time.
KCSAN complained about this known fact [1] Add READ_ONCE()/WRITE_ONCE() to address the issue.
[1] BUG: KCSAN: data-race in packet_do_bind / packet_sendmsg
write to 0xffff888131a0dcc0 of 2 bytes by task 24714 on cpu 0: packet_do_bind+0x3ab/0x7e0 net/packet/af_packet.c:3181 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff888131a0dcc0 of 2 bytes by task 24719 on cpu 1: packet_snd net/packet/af_packet.c:2899 [inline] packet_sendmsg+0x317/0x3570 net/packet/af_packet.c:3040 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg net/socket.c:674 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2350 ___sys_sendmsg net/socket.c:2404 [inline] __sys_sendmsg+0x1ed/0x270 net/socket.c:2433 __do_sys_sendmsg net/socket.c:2442 [inline] __se_sys_sendmsg net/socket.c:2440 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2440 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x0000 -> 0x1200
Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 24719 Comm: syz-executor.5 Not tainted 5.13.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/packet/af_packet.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c5a3256976426..36accd02b423f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2652,7 +2652,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -2865,7 +2865,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); - proto = po->num; + proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) @@ -3137,7 +3137,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, /* prevents packet_notifier() from calling * register_prot_hook() */ - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); dev_curr = po->prot_hook.dev; @@ -3147,7 +3147,7 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, }
BUG_ON(po->running); - po->num = proto; + WRITE_ONCE(po->num, proto); po->prot_hook.type = proto;
if (unlikely(unlisted)) { @@ -3493,7 +3493,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
sll->sll_family = AF_PACKET; sll->sll_ifindex = po->ifindex; - sll->sll_protocol = po->num; + sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); @@ -4391,7 +4391,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, was_running = po->running; num = po->num; if (was_running) { - po->num = 0; + WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, false); } spin_unlock(&po->bind_lock); @@ -4426,7 +4426,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
spin_lock(&po->bind_lock); if (was_running) { - po->num = num; + WRITE_ONCE(po->num, num); register_prot_hook(sk); } spin_unlock(&po->bind_lock); @@ -4597,7 +4597,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) s, refcount_read(&s->sk_refcnt), s->sk_type, - ntohs(po->num), + ntohs(READ_ONCE(po->num)), po->ifindex, po->running, atomic_read(&s->sk_rmem_alloc),
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.196 commit 47ee8bbf6c12a98dcb24f9014d32b283ca22a664
--------------------------------
[ Upstream commit e032f7c9c7cefffcfb79b9fc16c53011d2d9d11f ]
Like prior patch, we need to annotate lockless accesses to po->ifindex For instance, packet_getname() is reading po->ifindex (twice) while another thread is able to change po->ifindex.
KCSAN reported:
BUG: KCSAN: data-race in packet_do_bind / packet_getname
write to 0xffff888143ce3cbc of 4 bytes by task 25573 on cpu 1: packet_do_bind+0x420/0x7e0 net/packet/af_packet.c:3191 packet_bind+0xc3/0xd0 net/packet/af_packet.c:3255 __sys_bind+0x200/0x290 net/socket.c:1637 __do_sys_bind net/socket.c:1648 [inline] __se_sys_bind net/socket.c:1646 [inline] __x64_sys_bind+0x3d/0x50 net/socket.c:1646 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
read to 0xffff888143ce3cbc of 4 bytes by task 25578 on cpu 0: packet_getname+0x5b/0x1a0 net/packet/af_packet.c:3525 __sys_getsockname+0x10e/0x1a0 net/socket.c:1887 __do_sys_getsockname net/socket.c:1902 [inline] __se_sys_getsockname net/socket.c:1899 [inline] __x64_sys_getsockname+0x3e/0x50 net/socket.c:1899 do_syscall_64+0x4a/0x90 arch/x86/entry/common.c:47 entry_SYSCALL_64_after_hwframe+0x44/0xae
value changed: 0x00000000 -> 0x00000001
Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 25578 Comm: syz-executor.5 Not tainted 5.13.0-rc6-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/packet/af_packet.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 36accd02b423f..743193041ccee 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3153,11 +3153,11 @@ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, if (unlikely(unlisted)) { dev_put(dev); po->prot_hook.dev = NULL; - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; + WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } } @@ -3472,7 +3472,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); rcu_read_unlock(); @@ -3487,16 +3487,18 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr, struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); + int ifindex;
if (peer) return -EOPNOTSUPP;
+ ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; - sll->sll_ifindex = po->ifindex; + sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); - dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex); + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; @@ -4079,7 +4081,7 @@ static int packet_notifier(struct notifier_block *this, } if (msg == NETDEV_UNREGISTER) { packet_cached_dev_reset(po); - po->ifindex = -1; + WRITE_ONCE(po->ifindex, -1); if (po->prot_hook.dev) dev_put(po->prot_hook.dev); po->prot_hook.dev = NULL; @@ -4598,7 +4600,7 @@ static int packet_seq_show(struct seq_file *seq, void *v) refcount_read(&s->sk_refcnt), s->sk_type, ntohs(READ_ONCE(po->num)), - po->ifindex, + READ_ONCE(po->ifindex), po->running, atomic_read(&s->sk_rmem_alloc), from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),