From: Michael Chan michael.chan@broadcom.com
stable inclusion from linux-4.19.215 commit 02302cbd52264337630a32848ac03648648e9685
--------------------------------
commit 0c57eeecc559ca6bc18b8c4e2808bc78dbe769b0 upstream.
Drivers call netdev_set_num_tc() and then netdev_set_tc_queue() to set the queue count and offset for each TC. So the queue count and offset for the TCs may be zero for a short period after dev->num_tc has been set. If a TX packet is being transmitted at this time in the code path netdev_pick_tx() -> skb_tx_hash(), skb_tx_hash() may see nonzero dev->num_tc but zero qcount for the TC. The while loop that keeps looping while hash >= qcount will not end.
Fix it by checking that the TC's qcount is nonzero before using it.
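As an illustration only (not part of the patch), a small standalone C sketch of the hazard and the guard; pick_queue() and its values are invented for the example:

/* Illustration only: pick_queue() mimics the shape of the TX hash mapping;
 * all names and values are invented. */
#include <stdio.h>

static unsigned int pick_queue(unsigned int hash, unsigned int qoffset,
			       unsigned int qcount,
			       unsigned int real_num_tx_queues)
{
	/* The fix: fall back to the full queue range if the TC is not set up yet. */
	if (qcount == 0) {
		fprintf(stderr, "invalid qcount, qoffset %u\n", qoffset);
		qoffset = 0;
		qcount = real_num_tx_queues;
	}

	/* Without the guard above, qcount == 0 means hash never drops below
	 * qcount and this loop spins forever. */
	while (hash >= qcount)
		hash -= qcount;

	return hash + qoffset;
}

int main(void)
{
	/* Simulate the short window between netdev_set_num_tc() and
	 * netdev_set_tc_queue(): the TC exists but its qcount is still 0. */
	printf("queue = %u\n", pick_queue(12345, 0, 0, 8));
	return 0;
}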
Fixes: eadec877ce9c ("net: Add support for subordinate traffic classes to netdev_pick_tx") Reviewed-by: Andy Gospodarek gospo@broadcom.com Signed-off-by: Michael Chan michael.chan@broadcom.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+)
diff --git a/net/core/dev.c b/net/core/dev.c
index 5d9800804d4a4..6ed7810ed7bd4 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2846,6 +2846,12 @@ static u16 skb_tx_hash(const struct net_device *dev,
 		qoffset = sb_dev->tc_to_txq[tc].offset;
 		qcount = sb_dev->tc_to_txq[tc].count;
+		if (unlikely(!qcount)) {
+			net_warn_ratelimited("%s: invalid qcount, qoffset %u for tc %u\n",
+					     sb_dev->name, qoffset, tc);
+			qoffset = 0;
+			qcount = dev->real_num_tx_queues;
+		}
 	}
if (skb_rx_queue_recorded(skb)) {
From: Ming Lei ming.lei@redhat.com
stable inclusion from linux-4.19.216 commit c2df161f69fb1c67f63adbd193368b47f511edc0
--------------------------------
commit f2b85040acec9a928b4eb1b57a989324e8e38d3f upstream.
SCSI host release is triggered when the SCSI device is freed. We have to make sure that the low-level device driver module won't be unloaded before the SCSI host instance is released, because shost->hostt is required in the release handler.
Make sure to put the LLD module refcnt after the SCSI device is released.
Fixes a kernel panic of 'BUG: unable to handle page fault for address' reported by Changhui and Yi.
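For illustration (not the SCSI midlayer code), a standalone C sketch of the ordering idea the fix relies on; the stub types and names are made up:

/* Illustration only: stub types modelling "the pointer we still need lives
 * inside the object the final put may free", not the SCSI midlayer API. */
#include <stdio.h>
#include <stdlib.h>

struct module_stub { const char *name; };

struct device_stub {
	struct module_stub *owner;
	int refcount;
};

static void put_device_stub(struct device_stub *dev)
{
	if (--dev->refcount == 0) {
		printf("releasing device owned by %s\n", dev->owner->name);
		free(dev);		/* after this, dev->owner must not be read */
	}
}

static void device_put_fixed(struct device_stub *dev)
{
	struct module_stub *mod = dev->owner;	/* copy before the final put */

	put_device_stub(dev);			/* may free *dev */
	printf("module_put(%s)\n", mod->name);	/* safe: uses the local copy */
}

int main(void)
{
	static struct module_stub lld = { .name = "example_lld" };
	struct device_stub *dev = malloc(sizeof(*dev));

	if (!dev)
		return 1;
	dev->owner = &lld;
	dev->refcount = 1;
	device_put_fixed(dev);
	return 0;
}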
Link: https://lore.kernel.org/r/20211008050118.1440686-1-ming.lei@redhat.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Reported-by: Changhui Zhong czhong@redhat.com Reported-by: Yi Zhang yi.zhang@redhat.com Tested-by: Yi Zhang yi.zhang@redhat.com Signed-off-by: Ming Lei ming.lei@redhat.com Signed-off-by: Martin K. Petersen martin.petersen@oracle.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/scsi/scsi.c | 4 +++- drivers/scsi/scsi_sysfs.c | 9 +++++++++ 2 files changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index fc1356d101b0a..febe29a9b8b06 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -575,8 +575,10 @@ EXPORT_SYMBOL(scsi_device_get); */ void scsi_device_put(struct scsi_device *sdev) { - module_put(sdev->host->hostt->module); + struct module *mod = sdev->host->hostt->module; + put_device(&sdev->sdev_gendev); + module_put(mod); } EXPORT_SYMBOL(scsi_device_put);
diff --git a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c index 12d9dba346058..4030e1fa57e52 100644 --- a/drivers/scsi/scsi_sysfs.c +++ b/drivers/scsi/scsi_sysfs.c @@ -431,9 +431,12 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work) struct list_head *this, *tmp; struct scsi_vpd *vpd_pg80 = NULL, *vpd_pg83 = NULL; unsigned long flags; + struct module *mod;
sdev = container_of(work, struct scsi_device, ew.work);
+ mod = sdev->host->hostt->module; + scsi_dh_release_device(sdev);
parent = sdev->sdev_gendev.parent; @@ -474,11 +477,17 @@ static void scsi_device_dev_release_usercontext(struct work_struct *work)
if (parent) put_device(parent); + module_put(mod); }
static void scsi_device_dev_release(struct device *dev) { struct scsi_device *sdp = to_scsi_device(dev); + + /* Set module pointer as NULL in case of module unloading */ + if (!try_module_get(sdp->host->hostt->module)) + sdp->host->hostt->module = NULL; + execute_in_process_context(scsi_device_dev_release_usercontext, &sdp->ew); }
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.218 commit bf3a1a8c9120f0cc55dc65993674bc1ac1f2968a
--------------------------------
commit a0bcce2b2a169e10eb265c8f0ebdd5ae4c875670 upstream.
The "4 * be32_to_cpu(data->count)" multiplication can potentially overflow which would lead to memory corruption. Add a check for that.
Cc: stable@vger.kernel.org Fixes: 745b361e989a ("tpm: infrastructure for TPM spaces") Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Reviewed-by: Jarkko Sakkinen jarkko@kernel.org Signed-off-by: Jarkko Sakkinen jarkko@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/char/tpm/tpm2-space.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/char/tpm/tpm2-space.c b/drivers/char/tpm/tpm2-space.c
index 9f4e22dcde270..205d930d7ea60 100644
--- a/drivers/char/tpm/tpm2-space.c
+++ b/drivers/char/tpm/tpm2-space.c
@@ -419,6 +419,9 @@ static int tpm2_map_response_body(struct tpm_chip *chip, u32 cc, u8 *rsp,
 	if (be32_to_cpu(data->capability) != TPM2_CAP_HANDLES)
 		return 0;

+	if (be32_to_cpu(data->count) > (UINT_MAX - TPM_HEADER_SIZE - 9) / 4)
+		return -EFAULT;
+
 	if (len != TPM_HEADER_SIZE + 9 + 4 * be32_to_cpu(data->count))
 		return -EFAULT;
From: Tom Lendacky thomas.lendacky@amd.com
stable inclusion from linux-4.19.218 commit 9a8dd3aedd0dea69b61b3d853a15f1306d9bed6a
--------------------------------
commit e7d445ab26db833d6640d4c9a08bee176777cc82 upstream.
When runtime support for converting between 4-level and 5-level pagetables was added to the kernel, the SME code that built pagetables was updated to use the pagetable functions, e.g. p4d_offset(), etc., in order to simplify the code. However, the use of the pagetable functions in early boot code requires the use of the USE_EARLY_PGTABLE_L5 #define in order to ensure that the proper definition of pgtable_l5_enabled() is used.
Without the #define, pgtable_l5_enabled() is #defined as cpu_feature_enabled(X86_FEATURE_LA57). In early boot, the CPU features have not yet been discovered and populated, so pgtable_l5_enabled() will return false even when 5-level paging is enabled. This causes the SME code to always build 4-level pagetables to perform the in-place encryption. If 5-level paging is enabled, switching to the SME pagetables results in a page-fault that kills the boot.
Adding the #define results in pgtable_l5_enabled() using the __pgtable_l5_enabled variable set in early boot and the SME code building pagetables for the proper paging level.
Fixes: aad983913d77 ("x86/mm/encrypt: Simplify sme_populate_pgd() and sme_populate_pgd_large()") Signed-off-by: Tom Lendacky thomas.lendacky@amd.com Signed-off-by: Borislav Petkov bp@suse.de Acked-by: Kirill A. Shutemov kirill.shutemov@linux.intel.com Cc: stable@vger.kernel.org # 4.18.x Link: https://lkml.kernel.org/r/2cb8329655f5c753905812d951e212022a480475.163431865... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/mm/mem_encrypt_identity.c | 9 +++++++++ 1 file changed, 9 insertions(+)
diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index c9faf34cbb62e..4cbaacbe889a2 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -29,6 +29,15 @@ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS
+/* + * This code runs before CPU feature bits are set. By default, the + * pgtable_l5_enabled() function uses bit X86_FEATURE_LA57 to determine if + * 5-level paging is active, so that won't work here. USE_EARLY_PGTABLE_L5 + * is provided to handle this situation and, instead, use a variable that + * has been set by the early boot code. + */ +#define USE_EARLY_PGTABLE_L5 + #include <linux/kernel.h> #include <linux/mm.h> #include <linux/mem_encrypt.h>
From: Lorenz Bauer lmb@cloudflare.com
stable inclusion from linux-4.19.218 commit 5c6fb0e0c73b4b45b15e4c6089ca5ef03197e246
--------------------------------
[ Upstream commit fadb7ff1a6c2c565af56b4aacdd086b067eed440 ]
Restrict bpf_jit_limit to the maximum supported by the arch's JIT.
Signed-off-by: Lorenz Bauer lmb@cloudflare.com Signed-off-by: Alexei Starovoitov ast@kernel.org Link: https://lore.kernel.org/bpf/20211014142554.53120-4-lmb@cloudflare.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/filter.h | 1 + kernel/bpf/core.c | 4 +++- net/core/sysctl_net_core.c | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-)
diff --git a/include/linux/filter.h b/include/linux/filter.h index b541abf7f32fd..2fb8c67c063e1 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -877,6 +877,7 @@ extern int bpf_jit_enable; extern int bpf_jit_harden; extern int bpf_jit_kallsyms; extern long bpf_jit_limit; +extern long bpf_jit_limit_max;
typedef void (*bpf_jit_fill_hole_t)(void *area, unsigned int size);
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d4dc65f7bed51..641622b65de44 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -372,6 +372,7 @@ int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); int bpf_jit_harden __read_mostly; int bpf_jit_kallsyms __read_mostly; long bpf_jit_limit __read_mostly; +long bpf_jit_limit_max __read_mostly;
static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, @@ -598,7 +599,8 @@ u64 __weak bpf_jit_alloc_exec_limit(void) static int __init bpf_jit_charge_init(void) { /* Only used as heuristic here to derive limit. */ - bpf_jit_limit = min_t(u64, round_up(bpf_jit_alloc_exec_limit() >> 2, + bpf_jit_limit_max = bpf_jit_alloc_exec_limit(); + bpf_jit_limit = min_t(u64, round_up(bpf_jit_limit_max >> 2, PAGE_SIZE), LONG_MAX); return 0; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 6cec08cd0bb9a..aa254c609ca9b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -417,7 +417,7 @@ static struct ctl_table net_core_table[] = { .mode = 0600, .proc_handler = proc_dolongvec_minmax_bpf_restricted, .extra1 = &long_one, - .extra2 = &long_max, + .extra2 = &bpf_jit_limit_max, }, #endif {
From: "Eric W. Biederman" ebiederm@xmission.com
stable inclusion from linux-4.19.218 commit 09cdce82d6a8653f22b6c71f7b1529dc0415678b
--------------------------------
commit 7d613f9f72ec8f90ddefcae038fdae5adb8404b3 upstream.
The existence of sigkill_pending is a little silly as it is functionally a duplicate of fatal_signal_pending that is used in exactly one place.
Checking for pending fatal signals and returning early in ptrace_stop is actively harmful. It causes the ptrace_stop called by ptrace_signal to return early before setting current->exit_code. Later, when ptrace_signal reads the signal number from current->exit_code, the value is undefined, making it unpredictable what will happen.
Instead rely on the fact that schedule will not sleep if there is a pending signal that can awaken a task.
Removing the explicit sigkill_pending test fixes ptrace_signal when ptrace_stop does not stop, because current->exit_code is always set to signr.
Cc: stable@vger.kernel.org Fixes: 3d749b9e676b ("ptrace: simplify ptrace_stop()->sigkill_pending() path") Fixes: 1a669c2f16d4 ("Add arch_ptrace_stop") Link: https://lkml.kernel.org/r/87pmsyx29t.fsf@disp2133 Reviewed-by: Kees Cook keescook@chromium.org Signed-off-by: "Eric W. Biederman" ebiederm@xmission.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/signal.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-)
diff --git a/kernel/signal.c b/kernel/signal.c index dd8690d861729..bc558abbf4335 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2008,15 +2008,6 @@ static inline bool may_ptrace_stop(void) return true; }
-/* - * Return non-zero if there is a SIGKILL that should be waking us up. - * Called with the siglock held. - */ -static bool sigkill_pending(struct task_struct *tsk) -{ - return sigismember(&tsk->pending.signal, SIGKILL) || - sigismember(&tsk->signal->shared_pending.signal, SIGKILL); -}
/* * This must be called with current->sighand->siglock held. @@ -2043,17 +2034,16 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) * calling arch_ptrace_stop, so we must release it now. * To preserve proper semantics, we must do this before * any signal bookkeeping like checking group_stop_count. - * Meanwhile, a SIGKILL could come in before we retake the - * siglock. That must prevent us from sleeping in TASK_TRACED. - * So after regaining the lock, we must check for SIGKILL. */ spin_unlock_irq(¤t->sighand->siglock); arch_ptrace_stop(exit_code, info); spin_lock_irq(¤t->sighand->siglock); - if (sigkill_pending(current)) - return; }
+ /* + * schedule() will not sleep if there is a pending signal that + * can awaken the task. + */ set_special_state(TASK_TRACED);
/*
From: Pali Rohár pali@kernel.org
stable inclusion from linux-4.19.218 commit cac21c3eda6fc2940d59c1e72118cd7cde844f79
--------------------------------
commit 027b57170bf8bb6999a28e4a5f3d78bf1db0f90c upstream.
Since commit edc6afc54968 ("tty: switch to ktermios and new framework"), termios speed is no longer stored only in the c_cflag member but also in the new c_ispeed and c_ospeed members. If the BOTHER flag is set in c_cflag, then the termios speed is stored only in these new members.
Therefore, to correctly restore the termios speed, the ispeed and ospeed members have to be stored as well, not only the cflag member.
If only the cflag member with the BOTHER flag is restored, then tty_termios_baud_rate() and tty_termios_input_baud_rate() return the baud rate stored in the c_ospeed / c_ispeed member, which is zero because it was not restored either. If the reported baud rate is invalid (e.g. zero), the serial core functions fall back to 9600. So in this case the original baud rate is lost and the kernel changes it to 9600.
A simple reproducer of this issue is to boot the kernel with the following command line argument: "console=ttyXXX,86400" (where ttyXXX is the device name). For speed 86400 there is no Bnnn constant, therefore the kernel has to represent this speed via the BOTHER c_cflag, which means the speed is stored only in the c_ospeed and c_ispeed members, not in c_cflag anymore.
If the bootloader correctly configures the serial device to speed 86400 then the kernel prints the boot log to the early console at speed 86400 without any issue. But once the kernel starts initializing the real console device ttyXXX, the speed is changed to the fallback value 9600 because the information about the speed was lost.
This patch fixes the above issue by also storing and restoring the ispeed and ospeed members, which are required for the BOTHER flag.
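For illustration only, a small Linux userspace sketch of why a BOTHER rate is lost when only cflag is saved; console_stub is a made-up stand-in for struct console:

/* Illustration only (Linux headers assumed): console_stub is a made-up
 * stand-in for struct console, showing which fields must be saved. */
#include <stdio.h>
#include <asm/termbits.h>	/* BOTHER and struct termios2 on Linux */

struct console_stub {
	unsigned int cflag;
	unsigned int ispeed;
	unsigned int ospeed;
};

int main(void)
{
	struct termios2 t = {
		.c_cflag  = BOTHER,	/* rate has no Bnnn constant */
		.c_ispeed = 86400,
		.c_ospeed = 86400,
	};
	struct console_stub saved = { 0 };

	/* Old behaviour: only cflag survives uart_shutdown(). */
	saved.cflag = t.c_cflag;

	/* Fixed behaviour: the numeric rates are saved as well. */
	saved.ispeed = t.c_ispeed;
	saved.ospeed = t.c_ospeed;

	/* With BOTHER, cflag alone carries no rate, so without the two lines
	 * above the restored rate reads as 0 and the serial core falls back
	 * to 9600. */
	printf("restored ospeed: %u\n", saved.ospeed);
	return 0;
}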
Fixes: edc6afc54968 ("[PATCH] tty: switch to ktermios and new framework") Cc: stable@vger.kernel.org Signed-off-by: Pali Rohár pali@kernel.org Link: https://lore.kernel.org/r/20211002130900.9518-1-pali@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/serial_core.c | 16 ++++++++++++++-- include/linux/console.h | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 80fa06b16d9d7..5ed1327d284f9 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -219,7 +219,11 @@ static int uart_port_startup(struct tty_struct *tty, struct uart_state *state, if (retval == 0) { if (uart_console(uport) && uport->cons->cflag) { tty->termios.c_cflag = uport->cons->cflag; + tty->termios.c_ispeed = uport->cons->ispeed; + tty->termios.c_ospeed = uport->cons->ospeed; uport->cons->cflag = 0; + uport->cons->ispeed = 0; + uport->cons->ospeed = 0; } /* * Initialise the hardware port settings. @@ -287,8 +291,11 @@ static void uart_shutdown(struct tty_struct *tty, struct uart_state *state) /* * Turn off DTR and RTS early. */ - if (uport && uart_console(uport) && tty) + if (uport && uart_console(uport) && tty) { uport->cons->cflag = tty->termios.c_cflag; + uport->cons->ispeed = tty->termios.c_ispeed; + uport->cons->ospeed = tty->termios.c_ospeed; + }
if (!tty || C_HUPCL(tty)) uart_port_dtr_rts(uport, 0); @@ -2062,8 +2069,11 @@ uart_set_options(struct uart_port *port, struct console *co, * Allow the setting of the UART parameters with a NULL console * too: */ - if (co) + if (co) { co->cflag = termios.c_cflag; + co->ispeed = termios.c_ispeed; + co->ospeed = termios.c_ospeed; + }
return 0; } @@ -2197,6 +2207,8 @@ int uart_resume_port(struct uart_driver *drv, struct uart_port *uport) */ memset(&termios, 0, sizeof(struct ktermios)); termios.c_cflag = uport->cons->cflag; + termios.c_ispeed = uport->cons->ispeed; + termios.c_ospeed = uport->cons->ospeed;
/* * If that's unset, use the tty termios setting. diff --git a/include/linux/console.h b/include/linux/console.h index ec9bdb3d7bab6..35e3cc4fb4a68 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,6 +153,8 @@ struct console { short flags; short index; int cflag; + uint ispeed; + uint ospeed; void *data; struct console *next; };
From: Pawan Gupta pawan.kumar.gupta@linux.intel.com
stable inclusion from linux-4.19.218 commit 28fab448b2a5fb46727771a8a2dce607cd363649
--------------------------------
[ Upstream commit 0817534ff9ea809fac1322c5c8c574be8483ea57 ]
Syzkaller reported a use-after-free bug as described in [1]. The bug is triggered when smk_set_cipso() tries to free stale category bitmaps while there are concurrent reader(s) using the same bitmaps.

Wait for the RCU grace period to finish before freeing the category bitmaps in smk_set_cipso(). This makes sure that there are no more readers using the stale bitmaps and that freeing them is safe.
[1] https://lore.kernel.org/netdev/000000000000a814c505ca657a4e@google.com/
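As a generic illustration of the pattern (hypothetical types, not the Smack structures), the writer side looks roughly like this in kernel style:

/* Generic kernel-style sketch of the pattern, with hypothetical types
 * (not the Smack structures): publish the new object, wait a grace period,
 * then free the old one. */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct bitmap_obj { unsigned long bits[4]; };
struct holder { struct bitmap_obj *map; };

/* Writer side; assumed to run under whatever lock serialises writers. */
static void install_new_bitmap(struct holder *h, struct bitmap_obj *new_map)
{
	struct bitmap_obj *old_map = h->map;

	h->map = new_map;	/* readers now see either old or new, never freed */
	synchronize_rcu();	/* wait until readers that may hold old_map finish */
	kfree(old_map);		/* now safe: no reader can still be using it */
}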
Reported-by: syzbot+3f91de0b813cc3d19a80@syzkaller.appspotmail.com Signed-off-by: Pawan Gupta pawan.kumar.gupta@linux.intel.com Signed-off-by: Casey Schaufler casey@schaufler-ca.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- security/smack/smackfs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index f6482e53d55a8..115f790c3b129 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -859,6 +859,7 @@ static int smk_open_cipso(struct inode *inode, struct file *file) static ssize_t smk_set_cipso(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int format) { + struct netlbl_lsm_catmap *old_cat; struct smack_known *skp; struct netlbl_lsm_secattr ncats; char mapcatset[SMK_CIPSOLEN]; @@ -932,9 +933,11 @@ static ssize_t smk_set_cipso(struct file *file, const char __user *buf,
rc = smk_netlbl_mls(maplevel, mapcatset, &ncats, SMK_CIPSOLEN); if (rc >= 0) { - netlbl_catmap_free(skp->smk_netlabel.attr.mls.cat); + old_cat = skp->smk_netlabel.attr.mls.cat; skp->smk_netlabel.attr.mls.cat = ncats.attr.mls.cat; skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl; + synchronize_rcu(); + netlbl_catmap_free(old_cat); rc = count; }
From: Stephen Suryaputra ssuryaextr@gmail.com
stable inclusion from linux-4.19.218 commit 5cc5784a9a0330e599311f77278e129f352f57cd
--------------------------------
[ Upstream commit 61e18ce7348bfefb5688a8bcd4b4d6b37c0f9b2a ]
When addr_gen_mode is set to IN6_ADDR_GEN_MODE_NONE, the link-local addr should not be generated. But it isn't the case for GRE (as well as GRE6) and SIT tunnels. Make it so that tunnels consider the addr_gen_mode, especially for IN6_ADDR_GEN_MODE_NONE.
Do this in add_v4_addrs() to cover both GRE and SIT only if the addr scope is link.
Signed-off-by: Stephen Suryaputra ssuryaextr@gmail.com Acked-by: Antonio Quartulli a@unstable.cc Link: https://lore.kernel.org/r/20211020200618.467342-1-ssuryaextr@gmail.com Signed-off-by: Jakub Kicinski kuba@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv6/addrconf.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 76c097552ea74..9d8b791f63efc 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -3054,6 +3054,9 @@ static void sit_add_v4_addrs(struct inet6_dev *idev)
 	memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4);

 	if (idev->dev->flags&IFF_POINTOPOINT) {
+		if (idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_NONE)
+			return;
+
 		addr.s6_addr32[0] = htonl(0xfe800000);
 		scope = IFA_LINK;
 		plen = 64;
From: Helge Deller deller@gmx.de
stable inclusion from linux-4.19.218 commit ca6b1b3e032cf6ae2a61303eba04848b10bd6efc
--------------------------------
[ Upstream commit 9cc2fa4f4a92ccc6760d764e7341be46ee8aaaa1 ]
The function end_of_stack() returns a pointer to the last entry of a stack. For architectures like parisc, where the stack grows upwards, return the pointer to the highest address of the stack instead.
Without this change I faced a crash on parisc, because the stackleak functionality wrote STACKLEAK_POISON to the lowest address and thus overwrote the first 4 bytes of the task_struct which included the TIF_FLAGS.
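A standalone C model of the two growth directions (THREAD_SIZE and the helper are stand-ins, not the kernel header):

/* Illustration only: a model of end_of_stack() for both growth directions;
 * THREAD_SIZE and the helper are stand-ins, not the kernel header. */
#include <stdio.h>

#define THREAD_SIZE 16384UL

static unsigned long *end_of_stack_model(void *stack_base, int grows_up)
{
	if (grows_up)	/* e.g. parisc: last entry sits just below base + THREAD_SIZE */
		return (unsigned long *)((unsigned long)stack_base + THREAD_SIZE) - 1;

	/* Default: the stack grows down, so the lowest address is the end. */
	return stack_base;
}

int main(void)
{
	static unsigned long stack[THREAD_SIZE / sizeof(unsigned long)];

	printf("grows down: end at word offset %ld\n",
	       (long)(end_of_stack_model(stack, 0) - stack));
	printf("grows up:   end at word offset %ld\n",
	       (long)(end_of_stack_model(stack, 1) - stack));
	return 0;
}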
Signed-off-by: Helge Deller deller@gmx.de Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/linux/sched/task_stack.h | 4 ++++ 1 file changed, 4 insertions(+)
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index 6a841929073f9..4f099d3fed3a9 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -25,7 +25,11 @@ static inline void *task_stack_page(const struct task_struct *task)

 static inline unsigned long *end_of_stack(const struct task_struct *task)
 {
+#ifdef CONFIG_STACK_GROWSUP
+	return (unsigned long *)((unsigned long)task->stack + THREAD_SIZE) - 1;
+#else
 	return task->stack;
+#endif
 }
#elif !defined(__HAVE_THREAD_FUNCTIONS)
From: Waiman Long longman@redhat.com
stable inclusion from linux-4.19.218 commit 7b170c73cc9604508bffbb03e5cc22fa3f815eb3
--------------------------------
[ Upstream commit 7ee285395b211cad474b2b989db52666e0430daf ]
It was found that the following warning was displayed when remounting controllers from cgroup v2 to v1:
[ 8042.997778] WARNING: CPU: 88 PID: 80682 at kernel/cgroup/cgroup.c:3130 cgroup_apply_control_disable+0x158/0x190
  :
[ 8043.091109] RIP: 0010:cgroup_apply_control_disable+0x158/0x190
[ 8043.096946] Code: ff f6 45 54 01 74 39 48 8d 7d 10 48 c7 c6 e0 46 5a a4 e8 7b 67 33 00 e9 41 ff ff ff 49 8b 84 24 e8 01 00 00 0f b7 40 08 eb 95 <0f> 0b e9 5f ff ff ff 48 83 c4 08 5b 5d 41 5c 41 5d 41 5e 41 5f c3
[ 8043.115692] RSP: 0018:ffffba8a47c23d28 EFLAGS: 00010202
[ 8043.120916] RAX: 0000000000000036 RBX: ffffffffa624ce40 RCX: 000000000000181a
[ 8043.128047] RDX: ffffffffa63c43e0 RSI: ffffffffa63c43e0 RDI: ffff9d7284ee1000
[ 8043.135180] RBP: ffff9d72874c5800 R08: ffffffffa624b090 R09: 0000000000000004
[ 8043.142314] R10: ffffffffa624b080 R11: 0000000000002000 R12: ffff9d7284ee1000
[ 8043.149447] R13: ffff9d7284ee1000 R14: ffffffffa624ce70 R15: ffffffffa6269e20
[ 8043.156576] FS:  00007f7747cff740(0000) GS:ffff9d7a5fc00000(0000) knlGS:0000000000000000
[ 8043.164663] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 8043.170409] CR2: 00007f7747e96680 CR3: 0000000887d60001 CR4: 00000000007706e0
[ 8043.177539] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 8043.184673] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 8043.191804] PKRU: 55555554
[ 8043.194517] Call Trace:
[ 8043.196970]  rebind_subsystems+0x18c/0x470
[ 8043.201070]  cgroup_setup_root+0x16c/0x2f0
[ 8043.205177]  cgroup1_root_to_use+0x204/0x2a0
[ 8043.209456]  cgroup1_get_tree+0x3e/0x120
[ 8043.213384]  vfs_get_tree+0x22/0xb0
[ 8043.216883]  do_new_mount+0x176/0x2d0
[ 8043.220550]  __x64_sys_mount+0x103/0x140
[ 8043.224474]  do_syscall_64+0x38/0x90
[ 8043.228063]  entry_SYSCALL_64_after_hwframe+0x44/0xae
It was caused by the fact that rebind_subsystems() disables the controllers to be rebound one by one. If more than one of the disabled controllers originally come from the default hierarchy, cgroup_apply_control_disable() will be called multiple times for the same default hierarchy. A controller may be killed by css_kill() in the first round; in the second round, the killed controller may not be completely dead yet, leading to the warning.
To avoid this problem, collect the ssids of all controllers that need to be disabled from the default hierarchy and then disable them in one go instead of one by one.
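A standalone C sketch of the "collect, then disable once" idea; the masks and apply_control_disable() are stand-ins, not the cgroup internals:

/* Illustration only: stand-in masks and a stand-in disable function, showing
 * "collect first, then disable once" instead of a disable pass per controller. */
#include <stdint.h>
#include <stdio.h>

static void apply_control_disable(uint16_t *subsys_mask, uint16_t clear)
{
	*subsys_mask &= ~clear;
	printf("disable pass ran once, remaining mask 0x%x\n", *subsys_mask);
}

int main(void)
{
	uint16_t dfl_subsys_mask = 0x2f;	/* controllers on the default root */
	uint16_t rebind_mask = 0x0b;		/* controllers being moved to a v1 root */
	uint16_t dfl_disable_mask = 0;

	/* First loop: only collect which controllers come from the default root. */
	for (int ssid = 0; ssid < 16; ssid++)
		if ((rebind_mask & (1 << ssid)) && (dfl_subsys_mask & (1 << ssid)))
			dfl_disable_mask |= 1 << ssid;

	/* Then disable them in one go, so the disable path runs a single time. */
	if (dfl_disable_mask)
		apply_control_disable(&dfl_subsys_mask, dfl_disable_mask);

	return 0;
}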
Fixes: 334c3679ec4b ("cgroup: reimplement rebind_subsystems() using cgroup_apply_control() and friends") Signed-off-by: Waiman Long longman@redhat.com Signed-off-by: Tejun Heo tj@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/cgroup/cgroup.c | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6f4dcdd6f77b2..f98321d18672c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1656,6 +1656,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; int ssid, i, ret; + u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1672,8 +1673,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; + + /* + * Collect ssid's that need to be disabled from default + * hierarchy. + */ + if (ss->root == &cgrp_dfl_root) + dfl_disable_ss_mask |= 1 << ssid; + } while_each_subsys_mask();
+ if (dfl_disable_ss_mask) { + struct cgroup *scgrp = &cgrp_dfl_root.cgrp; + + /* + * Controllers from default hierarchy that need to be rebound + * are all disabled together in one go. + */ + cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask; + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + } + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; @@ -1682,10 +1703,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */ - src_root->subsys_mask &= ~(1 << ssid); - WARN_ON(cgroup_apply_control(scgrp)); - cgroup_finalize_control(scgrp, 0); + if (src_root != &cgrp_dfl_root) { + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + }
/* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
From: Jon Maxwell jmaxwell37@gmail.com
stable inclusion from linux-4.19.218 commit b4d407a6467f854c1efb0713b68180192d93e3e8
--------------------------------
[ Upstream commit cf12e6f9124629b18a6182deefc0315f0a73a199 ]
v1: Implement a more general statement as recommended by Eric Dumazet. The sequence number will be advanced, so this check will fix the FIN case and other cases.
A customer reported sockets stuck in the CLOSING state. A Vmcore revealed that the write_queue was not empty as determined by tcp_write_queue_empty() but the sk_buff containing the FIN flag had been freed and the socket was zombied in that state. Corresponding pcaps show no FIN from the Linux kernel on the wire.
Some instrumentation was added to the kernel and it was found that there is a timing window where tcp_sendmsg() can run after tcp_send_fin().
tcp_sendmsg() will hit an error, for example:
1269		if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
1270			goto do_error;
tcp_remove_empty_skb() will then free the FIN sk_buff as "skb->len == 0". The TCP socket is now wedged in the FIN-WAIT-1 state because the FIN is never sent.
If the other side sends a FIN packet the socket will transition to CLOSING and remain that way until the system is rebooted.
Fix this by checking for the FIN flag in the sk_buff and don't free it if that is the case. Testing confirmed that fixed the issue.
Fixes: fdfc5c8594c2 ("tcp: remove empty skb from write queue in error cases") Signed-off-by: Jon Maxwell jmaxwell37@gmail.com Reported-by: Monir Zouaoui Monir.Zouaoui@mail.schwarz Reported-by: Simon Stier simon.stier@mail.schwarz Reviewed-by: Eric Dumazet edumazet@google.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/ipv4/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index a13c9cde258a8..5b410171a20ba 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -957,7 +957,7 @@ static int tcp_send_mss(struct sock *sk, int *size_goal, int flags)
  */
 static void tcp_remove_empty_skb(struct sock *sk, struct sk_buff *skb)
 {
-	if (skb && !skb->len) {
+	if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
 		tcp_unlink_write_queue(skb, sk);
 		if (tcp_write_queue_empty(sk))
 			tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
From: Daniel Jordan daniel.m.jordan@oracle.com
stable inclusion from linux-4.19.218 commit fca288ae3d058a4b795bdc5bb154b8ae5566636d
--------------------------------
[ Upstream commit 68b6dea802cea0dbdd8bd7ccc60716b5a32a5d8a ]
These three events can race when pcrypt is used multiple times in a template ("pcrypt(pcrypt(...))"):
1. [taskA] The caller makes the crypto request via crypto_aead_encrypt()
2. [kworkerB] padata serializes the inner pcrypt request
3. [kworkerC] padata serializes the outer pcrypt request
3 might finish before the call to crypto_aead_encrypt() returns in 1, resulting in two possible issues.
First, a use-after-free of the crypto request's memory when, for example, taskA writes to the outer pcrypt request's padata->info in pcrypt_aead_enc() after kworkerC completes the request.
Second, the outer pcrypt request overwrites the inner pcrypt request's return code with -EINPROGRESS, making a successful request appear to fail. For instance, kworkerB writes the outer pcrypt request's padata->info in pcrypt_aead_done() and then taskA overwrites it in pcrypt_aead_enc().
Avoid both situations by delaying the write of padata->info until after the inner crypto request's return code is checked. This prevents the use-after-free by not touching the crypto request's memory after the next-inner crypto request is made, and stops padata->info from being overwritten.
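A minimal standalone C model of that ordering rule (start_async_op() and the record type are invented for the example):

/* Illustration only: start_async_op() and the record type are invented;
 * the rule is "keep the result local, publish it only on the sync path". */
#include <stdio.h>

#define EINPROGRESS 115

struct completion_record { int info; };

/* Stand-in for crypto_aead_encrypt(): may complete inline or go async. */
static int start_async_op(int go_async)
{
	return go_async ? -EINPROGRESS : 0;
}

static void run_request(struct completion_record *rec, int go_async)
{
	int ret = start_async_op(go_async);

	if (ret == -EINPROGRESS)
		return;		/* someone else owns rec now; do not touch it */

	rec->info = ret;	/* safe: the operation completed synchronously */
	printf("completed inline with %d\n", rec->info);
}

int main(void)
{
	struct completion_record rec = { .info = -1 };

	run_request(&rec, 1);	/* async path: rec is left alone */
	run_request(&rec, 0);	/* sync path: rec->info is written */
	return 0;
}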
Fixes: 5068c7a883d16 ("crypto: pcrypt - Add pcrypt crypto parallelization wrapper") Reported-by: syzbot+b187b77c8474f9648fae@syzkaller.appspotmail.com Signed-off-by: Daniel Jordan daniel.m.jordan@oracle.com Signed-off-by: Herbert Xu herbert@gondor.apana.org.au Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- crypto/pcrypt.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/crypto/pcrypt.c b/crypto/pcrypt.c index a5718c0a3dc4e..00c7230acfed1 100644 --- a/crypto/pcrypt.c +++ b/crypto/pcrypt.c @@ -139,12 +139,14 @@ static void pcrypt_aead_enc(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); + int ret;
- padata->info = crypto_aead_encrypt(req); + ret = crypto_aead_encrypt(req);
- if (padata->info == -EINPROGRESS) + if (ret == -EINPROGRESS) return;
+ padata->info = ret; padata_do_serial(padata); }
@@ -181,12 +183,14 @@ static void pcrypt_aead_dec(struct padata_priv *padata) { struct pcrypt_request *preq = pcrypt_padata_request(padata); struct aead_request *req = pcrypt_request_ctx(preq); + int ret;
- padata->info = crypto_aead_decrypt(req); + ret = crypto_aead_decrypt(req);
- if (padata->info == -EINPROGRESS) + if (ret == -EINPROGRESS) return;
+ padata->info = ret; padata_do_serial(padata); }
From: Andy Shevchenko andriy.shevchenko@linux.intel.com
stable inclusion from linux-4.19.218 commit 7dcbeeda147dfabd02b272fc569e8646ff7d4ee1
--------------------------------
[ Upstream commit ebabb77a2a115b6c5e68f7364b598310b5f61fb2 ]
ACPI_PTR() is more harmful than helpful. For example, in this case if CONFIG_ACPI=n, the ID table is left unused, which is not what we want.
Instead of adding ifdeffery here and there, drop ACPI_PTR().
Fixes: 6a7320c4669f ("serial: 8250_dw: Add ACPI 5.0 support") Reported-by: Daniel Palmer daniel@0x0f.com Signed-off-by: Andy Shevchenko andriy.shevchenko@linux.intel.com Link: https://lore.kernel.org/r/20211005134516.23218-1-andriy.shevchenko@linux.int... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/serial/8250/8250_dw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/tty/serial/8250/8250_dw.c b/drivers/tty/serial/8250/8250_dw.c index 284e8d052fc3c..c73d0eddd9b8d 100644 --- a/drivers/tty/serial/8250/8250_dw.c +++ b/drivers/tty/serial/8250/8250_dw.c @@ -769,7 +769,7 @@ static struct platform_driver dw8250_platform_driver = { .name = "dw-apb-uart", .pm = &dw8250_pm_ops, .of_match_table = dw8250_of_match, - .acpi_match_table = ACPI_PTR(dw8250_acpi_match), + .acpi_match_table = dw8250_acpi_match, }, .probe = dw8250_probe, .remove = dw8250_remove,
From: Tom Rix trix@redhat.com
stable inclusion from linux-4.19.218 commit 9f3eb0a8a5c401f2c29aa2b3b979bbaae5401190
--------------------------------
[ Upstream commit d108370c644b153382632b3e5511ade575c91c86 ]
clang static analysis reports this representative problem:
label.c:1463:16: warning: Assigned value is garbage or undefined
        label->hname = name;
                     ^ ~~~~
In aa_update_label_name(), this is the problem block of code:
	if (aa_label_acntsxprint(&name, ...) == -1)
		return res;
On failure, aa_label_acntsxprint() can return more than just -1, so check for a negative return.
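A toy standalone C example of why "== -1" is the wrong failure test here (build_name() is invented for the illustration):

/* Illustration only: build_name() is invented; it fails with a negative
 * errno that is not -1, so the "== -1" test misses the failure. */
#include <stdio.h>

#define ENOMEM 12

static int build_name(char **out, int simulate_oom)
{
	static char name[] = "label";

	if (simulate_oom)
		return -ENOMEM;		/* negative, but not -1 */
	*out = name;
	return (int)(sizeof(name) - 1);
}

int main(void)
{
	char *name = NULL;

	/* Broken check: -ENOMEM slips through and name stays uninitialised. */
	if (build_name(&name, 1) == -1)
		printf("caught failure (never reached)\n");

	/* Correct check: any negative return is a failure. */
	if (build_name(&name, 1) < 0)
		printf("caught failure with the '< 0' check\n");

	return 0;
}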
It was also noted that the aa_label_acntsxprint() main comment refers to a nonexistent parameter, so clean up the comment.
Fixes: f1bd904175e8 ("apparmor: add the base fns() for domain labels") Signed-off-by: Tom Rix trix@redhat.com Reviewed-by: Nick Desaulniers ndesaulniers@google.com Signed-off-by: John Johansen john.johansen@canonical.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- security/apparmor/label.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/security/apparmor/label.c b/security/apparmor/label.c index 2469549842d24..d4691f3e81083 100644 --- a/security/apparmor/label.c +++ b/security/apparmor/label.c @@ -1430,7 +1430,7 @@ bool aa_update_label_name(struct aa_ns *ns, struct aa_label *label, gfp_t gfp) if (label->hname || labels_ns(label) != ns) return res;
- if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) == -1) + if (aa_label_acntsxprint(&name, ns, label, FLAGS_NONE, gfp) < 0) return res;
ls = labels_set(label); @@ -1680,7 +1680,7 @@ int aa_label_asxprint(char **strp, struct aa_ns *ns, struct aa_label *label,
/** * aa_label_acntsxprint - allocate a __counted string buffer and print label - * @strp: buffer to write to. (MAY BE NULL if @size == 0) + * @strp: buffer to write to. * @ns: namespace profile is being viewed from * @label: label to view (NOT NULL) * @flags: flags controlling what label info is printed
From: Trond Myklebust trond.myklebust@hammerspace.com
stable inclusion from linux-4.19.218 commit cc806af48ba4aa0e8641268cac95557f85a64c90
--------------------------------
[ Upstream commit 64a93dbf25d3a1368bb58ddf0f61d0a92d7479e3 ]
Partially revert commit 2ce209c42c01 ("NFS: Wait for requests that are locked on the commit list"), since it can lead to deadlocks between commit requests and nfs_join_page_group(). For now we should assume that any locked requests on the commit list are either about to be removed and committed by another task, or the writes they describe are about to be retransmitted. In either case, we should not need to worry.
Fixes: 2ce209c42c01 ("NFS: Wait for requests that are locked on the commit list") Signed-off-by: Trond Myklebust trond.myklebust@hammerspace.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/nfs/write.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-)
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 5ae98c0cde04c..e383d265e65b1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1050,25 +1050,11 @@ nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_page *req, *tmp; int ret = 0;
-restart: list_for_each_entry_safe(req, tmp, src, wb_list) { kref_get(&req->wb_kref); if (!nfs_lock_request(req)) { - int status; - - /* Prevent deadlock with nfs_lock_and_join_requests */ - if (!list_empty(dst)) { - nfs_release_request(req); - continue; - } - /* Ensure we make progress to prevent livelock */ - mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex); - status = nfs_wait_on_request(req); nfs_release_request(req); - mutex_lock(&NFS_I(cinfo->inode)->commit_mutex); - if (status < 0) - break; - goto restart; + continue; } nfs_request_remove_commit_list(req, cinfo); clear_bit(PG_COMMIT_TO_DS, &req->wb_flags); @@ -1916,6 +1902,7 @@ static int __nfs_commit_inode(struct inode *inode, int how, int may_wait = how & FLUSH_SYNC; int ret, nscan;
+ how &= ~FLUSH_SYNC; nfs_init_cinfo_from_inode(&cinfo, inode); nfs_commit_begin(cinfo.mds); for (;;) {
From: Florian Westphal fw@strlen.de
stable inclusion from linux-4.19.218 commit 779e36a18ecca58ad1637680feb7ca431458506b
--------------------------------
[ Upstream commit 5648b5e1169ff1d6d6a46c35c0b5fbebd2a5cbb2 ]
On 64bit platforms the MAC header is set to 0xffff on allocation and also when a helper like skb_unset_mac_header() is called.
dev_parse_header may call skb_mac_header() which assumes valid mac offset:
BUG: KASAN: use-after-free in eth_header_parse+0x75/0x90
Read of size 6 at addr ffff8881075a5c05 by task nf-queue/1364

Call Trace:
 memcpy+0x20/0x60
 eth_header_parse+0x75/0x90
 __nfqnl_enqueue_packet+0x1a61/0x3380
 __nf_queue+0x597/0x1300
 nf_queue+0xf/0x40
 nf_hook_slow+0xed/0x190
 nf_hook+0x184/0x440
 ip_output+0x1c0/0x2a0
 nf_reinject+0x26f/0x700
 nfqnl_recv_verdict+0xa16/0x18b0
 nfnetlink_rcv_msg+0x506/0xe70
The existing code only works if the skb has a mac header.
Fixes: 2c38de4c1f8da7 ("netfilter: fix looped (broad|multi)cast's MAC handling") Signed-off-by: Florian Westphal fw@strlen.de Signed-off-by: Pablo Neira Ayuso pablo@netfilter.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- net/netfilter/nfnetlink_queue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index f81a3ce0fe48e..eb5a052d3b252 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -566,7 +566,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
 			goto nla_put_failure;

 	if (indev && entskb->dev &&
-	    entskb->mac_header != entskb->network_header) {
+	    skb_mac_header_was_set(entskb)) {
 		struct nfqnl_msg_packet_hw phw;
 		int len;
From: Lars-Peter Clausen lars@metafoo.de
stable inclusion from linux-4.19.218 commit aa4de56beeed5d28f647c981b284b1775d9a99d1
--------------------------------
[ Upstream commit e7e1e880b114ca640a2f280b0d5d38aed98f98c6 ]
Before the `callback_result` callback was introduced, drivers coded their invocation of the callback in a way similar to:
	if (cb->callback) {
		spin_unlock(&dma->lock);
		cb->callback(cb->callback_param);
		spin_lock(&dma->lock);
	}
With the introduction of `callback_result`, two helpers were introduced to transparently handle both types of callbacks, and drivers were updated to look like this:
	if (dmaengine_desc_callback_valid(cb)) {
		spin_unlock(&dma->lock);
		dmaengine_desc_callback_invoke(cb, ...);
		spin_lock(&dma->lock);
	}
dmaengine_desc_callback_invoke() correctly handles both `callback_result` and `callback`. But we forgot to update the dmaengine_desc_callback_valid() function to check for `callback_result`. As a result DMA descriptors that use the `callback_result` rather than `callback` don't have their callback invoked by drivers that follow the pattern above.
Fix this by checking for both `callback` and `callback_result` in dmaengine_desc_callback_valid().
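A standalone C model of the helper bug and the fix (struct and function names are illustrative, not the dmaengine API):

/* Illustration only: struct and function names mirror the shape of the
 * helper but are not the dmaengine API. */
#include <stdbool.h>
#include <stdio.h>

struct desc_callback {
	void (*callback)(void *param);
	void (*callback_result)(void *param, int result);
};

static bool callback_valid_old(const struct desc_callback *cb)
{
	return cb->callback ? true : false;	/* misses callback_result users */
}

static bool callback_valid_fixed(const struct desc_callback *cb)
{
	return cb->callback || cb->callback_result;
}

static void on_done(void *param, int result)
{
	(void)param;
	printf("done, result %d\n", result);
}

int main(void)
{
	struct desc_callback cb = { .callback_result = on_done };

	printf("old check:   %d\n", callback_valid_old(&cb));	/* 0: callback skipped */
	printf("fixed check: %d\n", callback_valid_fixed(&cb));	/* 1 */
	if (callback_valid_fixed(&cb))
		cb.callback_result(NULL, 0);
	return 0;
}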
Fixes: f067025bc676 ("dmaengine: add support to provide error result from a DMA transation") Signed-off-by: Lars-Peter Clausen lars@metafoo.de Acked-by: Dave Jiang dave.jiang@intel.com Link: https://lore.kernel.org/r/20211023134101.28042-1-lars@metafoo.de Signed-off-by: Vinod Koul vkoul@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/dma/dmaengine.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/dma/dmaengine.h b/drivers/dma/dmaengine.h
index 501c0b063f852..302f13efd35d9 100644
--- a/drivers/dma/dmaengine.h
+++ b/drivers/dma/dmaengine.h
@@ -168,7 +168,7 @@ dmaengine_desc_get_callback_invoke(struct dma_async_tx_descriptor *tx,
 static inline bool
 dmaengine_desc_callback_valid(struct dmaengine_desc_callback *cb)
 {
-	return (cb->callback) ? true : false;
+	return cb->callback || cb->callback_result;
 }
#endif
From: Miaohe Lin linmiaohe@huawei.com
stable inclusion from linux-4.19.218 commit b19675b6a046da6b4e160a78eedb3dd2584aa889
--------------------------------
[ Upstream commit afe8605ca45424629fdddfd85984b442c763dc47 ]
There is one possible race window between zs_pool_dec_isolated() and zs_unregister_migration() because wait_for_isolated_drain() checks the isolated count without holding class->lock and there is no order inside zs_pool_dec_isolated(). Thus the below race window could be possible:
zs_pool_dec_isolated                        zs_unregister_migration
  check pool->destroying != 0
                                            pool->destroying = true;
                                            smp_mb();
                                            wait_for_isolated_drain()
                                              wait for pool->isolated_pages == 0
  atomic_long_dec(&pool->isolated_pages);
  atomic_long_read(&pool->isolated_pages) == 0
Since pool->destroying is observed as false before the atomic_long_dec() of pool->isolated_pages, the wakeup on pool->migration_wait is missed.
Fix this by ensuring that the check of pool->destroying happens after atomic_long_dec(&pool->isolated_pages).
Link: https://lkml.kernel.org/r/20210708115027.7557-1-linmiaohe@huawei.com Fixes: 701d678599d0 ("mm/zsmalloc.c: fix race condition in zs_destroy_pool") Signed-off-by: Miaohe Lin linmiaohe@huawei.com Cc: Minchan Kim minchan@kernel.org Cc: Sergey Senozhatsky senozhatsky@chromium.org Cc: Henry Burns henryburns@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/zsmalloc.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 45a53ffd7c974..05d3d587209b3 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1906,10 +1906,11 @@ static inline void zs_pool_dec_isolated(struct zs_pool *pool) VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); atomic_long_dec(&pool->isolated_pages); /* - * There's no possibility of racing, since wait_for_isolated_drain() - * checks the isolated count under &class->lock after enqueuing - * on migration_wait. + * Checking pool->destroying must happen after atomic_long_dec() + * for pool->isolated_pages above. Paired with the smp_mb() in + * zs_unregister_migration(). */ + smp_mb__after_atomic(); if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) wake_up_all(&pool->migration_wait); }
From: Dan Carpenter dan.carpenter@oracle.com
stable inclusion from linux-4.19.218 commit 26df52b3e8168a736b6f463b2f6f46c93c4e0c1a
--------------------------------
[ Upstream commit a88e03cf3d190cf46bc4063a9b7efe87590de5f4 ]
snprintf() returns the number of bytes it would have printed if there were space. But it does not count the NUL terminator. So that means that if "count == copied" then this has already overflowed by one character.
This bug likely isn't super harmful in real life.
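A standalone C demonstration of the snprintf() return-value semantics behind the fix:

/* Illustration only: snprintf() reports the length it wanted to write,
 * excluding the NUL, so "needed == remaining" already means truncation. */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[8];
	/* Ask for exactly sizeof(buf) visible characters. */
	int copied = snprintf(buf, sizeof(buf), "%s", "12345678");

	printf("requested %d, buffer holds \"%s\" (%zu chars)\n",
	       copied, buf, strlen(buf));

	if (sizeof(buf) <= (size_t)copied)	/* the fixed comparison */
		printf("truncated: one byte was needed for the terminating NUL\n");

	return 0;
}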
Link: https://lkml.kernel.org/r/20210916130404.GA25094@kili Fixes: c0265342bff4 ("zram: introduce zram memory tracking") Signed-off-by: Dan Carpenter dan.carpenter@oracle.com Cc: Minchan Kim minchan@kernel.org Cc: Sergey Senozhatsky senozhatsky@chromium.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 52850c10165e1..dade3734a8cae 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -699,7 +699,7 @@ static ssize_t read_block_state(struct file *file, char __user *buf,
 			zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
 			zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.');

-		if (count < copied) {
+		if (count <= copied) {
 			zram_slot_unlock(zram, index);
 			break;
 		}
From: Eric Dumazet edumazet@google.com
stable inclusion from linux-4.19.218 commit 0c727425668ddc43bcf1a19c77bad215de966e65
--------------------------------
[ Upstream commit 8ac9dfd58b138f7e82098a4e0a0d46858b12215b ]
Both ifindex and LLC_SK_DEV_HASH_ENTRIES are signed.
This means that (ifindex % LLC_SK_DEV_HASH_ENTRIES) is negative if @ifindex is negative.
We could simply make LLC_SK_DEV_HASH_ENTRIES unsigned.
In this patch I chose to use hash_32() to get more entropy from @ifindex, like llc_sk_laddr_hashfn().
UBSAN: array-index-out-of-bounds in ./include/net/llc.h:75:26
index -43 is out of range for type 'hlist_head [64]'
CPU: 1 PID: 20999 Comm: syz-executor.3 Not tainted 5.15.0-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
 <TASK>
 __dump_stack lib/dump_stack.c:88 [inline]
 dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106
 ubsan_epilogue+0xb/0x5a lib/ubsan.c:151
 __ubsan_handle_out_of_bounds.cold+0x62/0x6c lib/ubsan.c:291
 llc_sk_dev_hash include/net/llc.h:75 [inline]
 llc_sap_add_socket+0x49c/0x520 net/llc/llc_conn.c:697
 llc_ui_bind+0x680/0xd70 net/llc/af_llc.c:404
 __sys_bind+0x1e9/0x250 net/socket.c:1693
 __do_sys_bind net/socket.c:1704 [inline]
 __se_sys_bind net/socket.c:1702 [inline]
 __x64_sys_bind+0x6f/0xb0 net/socket.c:1702
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x44/0xae
RIP: 0033:0x7fa503407ae9
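A standalone C illustration of the signed-modulo problem and the hash-based fix; hash32() below is only a stand-in for the kernel's hash_32():

/* Illustration only: hash32() is a stand-in for the kernel's hash_32(),
 * not the same implementation; the point is the signedness of %. */
#include <stdint.h>
#include <stdio.h>

#define HASH_BITS 6
#define TABLE_SIZE (1 << HASH_BITS)	/* 64 buckets, as in the LLC table */

static uint32_t hash32(uint32_t val, unsigned int bits)
{
	/* Multiplicative hash with a 32-bit golden-ratio constant. */
	return (val * 0x61C88647U) >> (32 - bits);
}

int main(void)
{
	int ifindex = -43;

	printf("ifindex %% TABLE_SIZE = %d (out of range for a 64-entry table)\n",
	       ifindex % TABLE_SIZE);
	printf("hash32(ifindex)      = %u (always 0..%d)\n",
	       hash32((uint32_t)ifindex, HASH_BITS), TABLE_SIZE - 1);
	return 0;
}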
Fixes: 6d2e3ea28446 ("llc: use a device based hash table to speed up multicast delivery") Signed-off-by: Eric Dumazet edumazet@google.com Reported-by: syzbot syzkaller@googlegroups.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/net/llc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/include/net/llc.h b/include/net/llc.h
index df282d9b40170..9c10b121b49b0 100644
--- a/include/net/llc.h
+++ b/include/net/llc.h
@@ -72,7 +72,9 @@ struct llc_sap {
 static inline
 struct hlist_head *llc_sk_dev_hash(struct llc_sap *sap, int ifindex)
 {
-	return &sap->sk_dev_hash[ifindex % LLC_SK_DEV_HASH_ENTRIES];
+	u32 bucket = hash_32(ifindex, LLC_SK_DEV_HASH_BITS);
+
+	return &sap->sk_dev_hash[bucket];
 }

 static inline
From: Vasily Averin vvs@virtuozzo.com
stable inclusion from linux-4.19.218 commit 213739de40a2c173505b3f2ebbcda3dfbee3a690
--------------------------------
commit 0b28179a6138a5edd9d82ad2687c05b3773c387b upstream.
Patch series "memcg: prohibit unconditional exceeding the limit of dying tasks", v3.
Memory cgroup charging allows killed or exiting tasks to exceed the hard limit. It can be misused and allowed to trigger a global OOM from inside a memcg-limited container. On the other hand, if memcg fails an allocation called from inside the #PF handler, it triggers a global OOM from inside pagefault_out_of_memory().
To prevent these problems this patchset: (a) removes execution of out_of_memory() from pagefault_out_of_memory(), because nobody can explain why it is necessary; (b) allows memcg to fail allocation of dying/killed tasks.
This patch (of 3):
Any allocation failure during the #PF path will return with VM_FAULT_OOM, which in turn results in pagefault_out_of_memory(), which in turn executes out_of_memory() and can kill a random task.
An allocation might fail when the current task is the oom victim and there are no memory reserves left. The OOM killer is already handled at the page allocator level for the global OOM and at the charging level for the memcg one. Both have much more information about the scope of allocation/charge request. This means that either the OOM killer has been invoked properly and didn't lead to the allocation success or it has been skipped because it couldn't have been invoked. In both cases triggering it from here is pointless and even harmful.
It makes much more sense to let the killed task die rather than to wake up an eternally hungry oom-killer and send him to choose a fatter victim for breakfast.
Link: https://lkml.kernel.org/r/0828a149-786e-7c06-b70a-52d086818ea3@virtuozzo.com Signed-off-by: Vasily Averin vvs@virtuozzo.com Suggested-by: Michal Hocko mhocko@suse.com Acked-by: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Mel Gorman mgorman@techsingularity.net Cc: Roman Gushchin guro@fb.com Cc: Shakeel Butt shakeelb@google.com Cc: Tetsuo Handa penguin-kernel@i-love.sakura.ne.jp Cc: Uladzislau Rezki urezki@gmail.com Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/oom_kill.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 2799a47105014..7aa7f797a4bf8 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1302,6 +1302,9 @@ void pagefault_out_of_memory(void)
 	if (mem_cgroup_oom_synchronize(true))
 		return;

+	if (fatal_signal_pending(current))
+		return;
+
 	if (!mutex_trylock(&oom_lock))
 		return;
 	out_of_memory(&oc);
From: Michal Hocko mhocko@suse.com
stable inclusion from linux-4.19.218 commit d508b70eaa8d6d994c289b757c0ca0355d4dbe29
--------------------------------
commit 60e2793d440a3ec95abb5d6d4fc034a4b480472d upstream.
Any allocation failure during the #PF path will return with VM_FAULT_OOM which in turn results in pagefault_out_of_memory. This can happen for 2 different reasons. a) Memcg is out of memory and we rely on mem_cgroup_oom_synchronize to perform the memcg OOM handling or b) normal allocation fails.
The latter is quite problematic because allocation paths already trigger out_of_memory and the page allocator tries really hard to not fail allocations. Anyway, if the OOM killer has been already invoked there is no reason to invoke it again from the #PF path. Especially when the OOM condition might be gone by that time and we have no way to find out other than allocate.
Moreover, if the allocation failed and the OOM killer hasn't been invoked, then we are unlikely to do the right thing from the #PF context, because we have already lost the allocation context and restrictions and therefore might oom kill a task from a different NUMA domain.
This all suggests that there is no legitimate reason to trigger out_of_memory from pagefault_out_of_memory so drop it. Just to be sure that no #PF path returns with VM_FAULT_OOM without allocation print a warning that this is happening before we restart the #PF.
[VvS: #PF allocation can hit into limit of cgroup v1 kmem controller. This is a local problem related to memcg, however, it causes unnecessary global OOM kills that are repeated over and over again and escalate into a real disaster. This has been broken since kmem accounting has been introduced for cgroup v1 (3.8). There was no kmem specific reclaim for the separate limit so the only way to handle kmem hard limit was to return with ENOMEM. In upstream the problem will be fixed by removing the outdated kmem limit, however stable and LTS kernels cannot do it and are still affected. This patch fixes the problem and should be backported into stable/LTS.]
Link: https://lkml.kernel.org/r/f5fd8dd8-0ad4-c524-5f65-920b01972a42@virtuozzo.com Signed-off-by: Michal Hocko mhocko@suse.com Signed-off-by: Vasily Averin vvs@virtuozzo.com Acked-by: Michal Hocko mhocko@suse.com Cc: Johannes Weiner hannes@cmpxchg.org Cc: Mel Gorman mgorman@techsingularity.net Cc: Roman Gushchin guro@fb.com Cc: Shakeel Butt shakeelb@google.com Cc: Tetsuo Handa penguin-kernel@i-love.sakura.ne.jp Cc: Uladzislau Rezki urezki@gmail.com Cc: Vladimir Davydov vdavydov.dev@gmail.com Cc: Vlastimil Babka vbabka@suse.cz Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/oom_kill.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7aa7f797a4bf8..020fb6ac24970 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1285,19 +1285,15 @@ bool out_of_memory(struct oom_control *oc) }
/* - * The pagefault handler calls here because it is out of memory, so kill a - * memory-hogging task. If oom_lock is held by somebody else, a parallel oom - * killing is already in progress so do nothing. + * The pagefault handler calls here because some allocation has failed. We have + * to take care of the memcg OOM here because this is the only safe context without + * any locks held but let the oom killer triggered from the allocation context care + * about the global OOM. */ void pagefault_out_of_memory(void) { - struct oom_control oc = { - .zonelist = NULL, - .nodemask = NULL, - .memcg = NULL, - .gfp_mask = 0, - .order = 0, - }; + static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST);
if (mem_cgroup_oom_synchronize(true)) return; @@ -1305,8 +1301,6 @@ void pagefault_out_of_memory(void) if (fatal_signal_pending(current)) return;
- if (!mutex_trylock(&oom_lock)) - return; - out_of_memory(&oc); - mutex_unlock(&oom_lock); + if (__ratelimit(&pfoom_rs)) + pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n"); }
From: Jane Malalane jane.malalane@citrix.com
stable inclusion from linux-4.19.218 commit a57977b0b8d963a92d482ea25ad85fe6369071ad
--------------------------------
commit 415de44076640483648d6c0f6d645a9ee61328ad upstream.
Currently, Linux probes for X86_BUG_NULL_SEL unconditionally which makes it unsafe to migrate in a virtualised environment as the properties across the migration pool might differ.
To be specific, the case which goes wrong is:
1. Zen1 (or earlier) and Zen2 (or later) in a migration pool
2. Linux boots on Zen2, probes and finds the absence of X86_BUG_NULL_SEL
3. Linux is then migrated to Zen1
Linux is now running on a X86_BUG_NULL_SEL-impacted CPU while believing that the bug is fixed.
The only way to address the problem is to fully trust the "no longer affected" CPUID bit when virtualised, because in the above case it would be cleared deliberately to indicate the fact "you might migrate to somewhere which has this behaviour".
Zen3 adds the NullSelectorClearsBase CPUID bit to indicate that loading a NULL segment selector zeroes the base and limit fields, as well as just attributes. Zen2 also has this behaviour but doesn't have the NSCB bit.
[ bp: Minor touchups. ]
Signed-off-by: Jane Malalane jane.malalane@citrix.com Signed-off-by: Borislav Petkov bp@suse.de CC: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20211021104744.24126-1-jane.malalane@citrix.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Conflicts: arch/x86/kernel/cpu/amd.c [yyl: adjust context] Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/kernel/cpu/amd.c | 2 ++ arch/x86/kernel/cpu/common.c | 44 ++++++++++++++++++++++++++++++------ arch/x86/kernel/cpu/cpu.h | 1 + 3 files changed, 40 insertions(+), 7 deletions(-)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f86f912ce2158..5d68964d968bb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -983,6 +983,8 @@ static void init_amd(struct cpuinfo_x86 *c) /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + check_null_seg_clears_base(c); }
#ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 988d9c6a5c898..6fe28a1b18671 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1271,9 +1271,8 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); }
-static void detect_null_seg_behavior(struct cpuinfo_x86 *c) +static bool detect_null_seg_behavior(void) { -#ifdef CONFIG_X86_64 /* * Empirically, writing zero to a segment selector on AMD does * not clear the base, whereas writing zero to a segment @@ -1294,10 +1293,43 @@ static void detect_null_seg_behavior(struct cpuinfo_x86 *c) wrmsrl(MSR_FS_BASE, 1); loadsegment(fs, 0); rdmsrl(MSR_FS_BASE, tmp); - if (tmp != 0) - set_cpu_bug(c, X86_BUG_NULL_SEG); wrmsrl(MSR_FS_BASE, old_base); -#endif + return tmp == 0; +} + +void check_null_seg_clears_base(struct cpuinfo_x86 *c) +{ + /* BUG_NULL_SEG is only relevant with 64bit userspace */ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + /* Zen3 CPUs advertise Null Selector Clears Base in CPUID. */ + if (c->extended_cpuid_level >= 0x80000021 && + cpuid_eax(0x80000021) & BIT(6)) + return; + + /* + * CPUID bit above wasn't set. If this kernel is still running + * as a HV guest, then the HV has decided not to advertize + * that CPUID bit for whatever reason. For example, one + * member of the migration pool might be vulnerable. Which + * means, the bug is present: set the BUG flag and return. + */ + if (cpu_has(c, X86_FEATURE_HYPERVISOR)) { + set_cpu_bug(c, X86_BUG_NULL_SEG); + return; + } + + /* + * Zen2 CPUs also have this behaviour, but no CPUID bit. + * 0x18 is the respective family for Hygon. + */ + if ((c->x86 == 0x17 || c->x86 == 0x18) && + detect_null_seg_behavior()) + return; + + /* All the remaining ones are affected */ + set_cpu_bug(c, X86_BUG_NULL_SEG); }
static void generic_identify(struct cpuinfo_x86 *c) @@ -1333,8 +1365,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- detect_null_seg_behavior(c); - /* * ESPFIX is a strange bug. All real CPUs have it. Paravirt * systems that run Linux at CPL > 0 may or may not have the diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3cddbb3db7464..e2e30de5e2751 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -77,6 +77,7 @@ extern int detect_extended_topology_early(struct cpuinfo_x86 *c); extern int detect_extended_topology(struct cpuinfo_x86 *c); extern int detect_ht_early(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); +extern void check_null_seg_clears_base(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
From: Shaoying Xu shaoyi@amazon.com
stable inclusion from linux-4.19.218 commit 0898c7a70aae87eeec6b00f03e0bb592fefd295f
--------------------------------
commit 39fec6889d15a658c3a3ebb06fd69d3584ddffd3 upstream.
Ext4 sets up lazy inode table initialization by default when the file system is mounted. However, there is an issue in computing the next schedule time: the timeout ends up as the same number of jiffies but a different real time in seconds depending on the HZ value. Fix this by measuring the elapsed time in the more granular unit of nanoseconds, making the next schedule time independent of the HZ value.
Fixes: bfff68738f1c ("ext4: add support for lazy inode table initialization") Signed-off-by: Shaoying Xu shaoyi@amazon.com Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o tytso@mit.edu Link: https://lore.kernel.org/r/20210902164412.9994-2-shaoyi@amazon.com Signed-off-by: Theodore Ts'o tytso@mit.edu Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- fs/ext4/super.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 5a58f72ac2090..587817ac60990 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3196,8 +3196,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) struct ext4_group_desc *gdp = NULL; ext4_group_t group, ngroups; struct super_block *sb; - unsigned long timeout = 0; int ret = 0; + u64 start_time;
sb = elr->lr_super; ngroups = EXT4_SB(sb)->s_groups_count; @@ -3217,13 +3217,12 @@ static int ext4_run_li_request(struct ext4_li_request *elr) ret = 1;
if (!ret) { - timeout = jiffies; + start_time = ktime_get_real_ns(); ret = ext4_init_inode_table(sb, group, elr->lr_timeout ? 0 : 1); if (elr->lr_timeout == 0) { - timeout = (jiffies - timeout) * - elr->lr_sbi->s_li_wait_mult; - elr->lr_timeout = timeout; + elr->lr_timeout = nsecs_to_jiffies((ktime_get_real_ns() - start_time) * + elr->lr_sbi->s_li_wait_mult); } elr->lr_next_sched = jiffies + elr->lr_timeout; elr->lr_next_group = group + 1;
From: Thomas Gleixner tglx@linutronix.de
stable inclusion from linux-4.19.218 commit d8ea896354e3177316a24e2fd379785082d1a2fb
--------------------------------
commit 3735459037114d31e5acd9894fad9aed104231a0 upstream.
free_msi_irqs() frees the MSI entries before destroying the sysfs entries which are exposing them. Nothing prevents a concurrent free while a sysfs file is read and accesses the possibly freed entry.
Move the sysfs release ahead of freeing the entries.
Fixes: 1c51b50c2995 ("PCI/MSI: Export MSI mode using attributes, not kobjects") Signed-off-by: Thomas Gleixner tglx@linutronix.de Reviewed-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Bjorn Helgaas helgaas@kernel.org Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87sfw5305m.ffs@tglx Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 219bad4c12641..43640155c66eb 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -372,18 +372,6 @@ static void free_msi_irqs(struct pci_dev *dev) for (i = 0; i < entry->nvec_used; i++) BUG_ON(irq_has_action(entry->irq + i));
- pci_msi_teardown_msi_irqs(dev); - - list_for_each_entry_safe(entry, tmp, msi_list, list) { - if (entry->msi_attrib.is_msix) { - if (list_is_last(&entry->list, msi_list)) - iounmap(entry->mask_base); - } - - list_del(&entry->list); - free_msi_entry(entry); - } - if (dev->msi_irq_groups) { sysfs_remove_groups(&dev->dev.kobj, dev->msi_irq_groups); msi_attrs = dev->msi_irq_groups[0]->attrs; @@ -399,6 +387,18 @@ static void free_msi_irqs(struct pci_dev *dev) kfree(dev->msi_irq_groups); dev->msi_irq_groups = NULL; } + + pci_msi_teardown_msi_irqs(dev); + + list_for_each_entry_safe(entry, tmp, msi_list, list) { + if (entry->msi_attrib.is_msix) { + if (list_is_last(&entry->list, msi_list)) + iounmap(entry->mask_base); + } + + list_del(&entry->list); + free_msi_entry(entry); + } }
static void pci_intx_for_msi(struct pci_dev *dev, int enable)
From: Marc Zyngier maz@kernel.org
stable inclusion from linux-4.19.218 commit a632cb0b4013ff9ba8ce5f062a45e5652975a68b
--------------------------------
commit 2226667a145db2e1f314d7f57fd644fe69863ab9 upstream.
It appears that some devices are lying about their mask capability, pretending that they don't have it, while they actually do. The net result is that we now don't enable MSIs on such endpoints.
Add a new per-device flag to deal with this. Further patches will make use of it, sadly.
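As a hedged sketch of how a later patch could consume the new flag (the vendor/device IDs and function name below are placeholders, not taken from this series), a PCI fixup would simply set the flag on an affected device:

#include <linux/pci.h>

/* Hypothetical quirk: mark a device as honouring MSI masking even though
 * its MSI capability claims otherwise, so msi_setup_entry() treats it as
 * if PCI_MSI_FLAGS_MASKBIT were set. */
static void example_msi_masking_fixup(struct pci_dev *pdev)
{
	pdev->dev_flags |= PCI_DEV_FLAGS_HAS_MSI_MASKING;
}
DECLARE_PCI_FIXUP_FINAL(0x1234, 0x5678, example_msi_masking_fixup);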
Signed-off-by: Marc Zyngier maz@kernel.org Signed-off-by: Thomas Gleixner tglx@linutronix.de Reviewed-by: Thomas Gleixner tglx@linutronix.de Link: https://lore.kernel.org/r/20211104180130.3825416-2-maz@kernel.org Cc: Bjorn Helgaas helgaas@kernel.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/pci/msi.c | 3 +++ include/linux/pci.h | 2 ++ 2 files changed, 5 insertions(+)
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index 43640155c66eb..9c1ee2ee6cc0d 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -569,6 +569,9 @@ msi_setup_entry(struct pci_dev *dev, int nvec, const struct irq_affinity *affd) goto out;
pci_read_config_word(dev, dev->msi_cap + PCI_MSI_FLAGS, &control); + /* Lies, damned lies, and MSIs */ + if (dev->dev_flags & PCI_DEV_FLAGS_HAS_MSI_MASKING) + control |= PCI_MSI_FLAGS_MASKBIT;
entry->msi_attrib.is_msix = 0; entry->msi_attrib.is_64 = !!(control & PCI_MSI_FLAGS_64BIT); diff --git a/include/linux/pci.h b/include/linux/pci.h index e4edd2d79d275..bc49349fcc53e 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -206,6 +206,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), /* Don't use Relaxed Ordering for TLPs directed at this device */ PCI_DEV_FLAGS_NO_RELAXED_ORDERING = (__force pci_dev_flags_t) (1 << 11), + /* Device does honor MSI masking despite saying otherwise */ + PCI_DEV_FLAGS_HAS_MSI_MASKING = (__force pci_dev_flags_t) (1 << 12), };
enum pci_irq_reroute_variant {
From: Guanghui Feng guanghuifeng@linux.alibaba.com
stable inclusion from linux-4.19.218 commit 4f300f47dbcf9c3d4b2ea76c8554c8f360400725
--------------------------------
[ Upstream commit 3968ddcf05fb4b9409cd1859feb06a5b0550a1c1 ]
When running the LTP testcase (ltp/testcases/kernel/pty/pty04.c) on arm64, there is a soft lockup which looks like this one:
Workqueue: events_unbound flush_to_ldisc
Call trace:
 dump_backtrace+0x0/0x1ec
 show_stack+0x24/0x30
 dump_stack+0xd0/0x128
 panic+0x15c/0x374
 watchdog_timer_fn+0x2b8/0x304
 __run_hrtimer+0x88/0x2c0
 __hrtimer_run_queues+0xa4/0x120
 hrtimer_interrupt+0xfc/0x270
 arch_timer_handler_phys+0x40/0x50
 handle_percpu_devid_irq+0x94/0x220
 __handle_domain_irq+0x88/0xf0
 gic_handle_irq+0x84/0xfc
 el1_irq+0xc8/0x180
 slip_unesc+0x80/0x214 [slip]
 tty_ldisc_receive_buf+0x64/0x80
 tty_port_default_receive_buf+0x50/0x90
 flush_to_ldisc+0xbc/0x110
 process_one_work+0x1d4/0x4b0
 worker_thread+0x180/0x430
 kthread+0x11c/0x120
In the pty04 testcase, the first process calls the write syscall to send data to the pty master. At the same time, the workqueue runs flush_to_ldisc() to pop data in a loop until there is none left. When the sender and the workqueue run on different cores, the sender keeps sending data fast enough that the workqueue loops for a long time, triggering the soft lockup in flush_to_ldisc() on a kernel configured without preemption. So add a need_resched() check and cond_resched() in the flush_to_ldisc() loop to avoid it.
Signed-off-by: Guanghui Feng guanghuifeng@linux.alibaba.com Link: https://lore.kernel.org/r/1633961304-24759-1-git-send-email-guanghuifeng@lin... Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/tty/tty_buffer.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 480688b71c758..b190470127700 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -531,6 +531,9 @@ static void flush_to_ldisc(struct work_struct *work) if (!count) break; head->read += count; + + if (need_resched()) + cond_resched(); }
mutex_unlock(&buf->lock);
From: Vincent Donnefort vincent.donnefort@arm.com
stable inclusion from linux-4.19.218 commit 71731e0d68f4dd352a958942d6d0e39cb0f0fa58
--------------------------------
[ Upstream commit 42dc938a590c96eeb429e1830123fef2366d9c80 ]
Nothing protects the access to the per_cpu variable sd_llc_id. When testing the same CPU (i.e. this_cpu == that_cpu), a race condition exists with update_top_cache_domain(). One scenario being:
              CPU1                            CPU2
  ==================================================================

  per_cpu(sd_llc_id, CPUX) => 0
                                  partition_sched_domains_locked()
                                    detach_destroy_domains()
  cpus_share_cache(CPUX, CPUX)        update_top_cache_domain(CPUX)
    per_cpu(sd_llc_id, CPUX) => 0
                                      per_cpu(sd_llc_id, CPUX) = CPUX
                                  per_cpu(sd_llc_id, CPUX) => CPUX
    return false
ttwu_queue_cond() wouldn't catch smp_processor_id() == cpu and the result is a warning triggered from ttwu_queue_wakelist().
Avoid such a race in cpus_share_cache() by always returning true when this_cpu == that_cpu.
Fixes: 518cd6234178 ("sched: Only queue remote wakeups when crossing cache boundaries") Reported-by: Jing-Ting Wu jing-ting.wu@mediatek.com Signed-off-by: Vincent Donnefort vincent.donnefort@arm.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Valentin Schneider valentin.schneider@arm.com Reviewed-by: Vincent Guittot vincent.guittot@linaro.org Link: https://lore.kernel.org/r/20211104175120.857087-1-vincent.donnefort@arm.com Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 487091389934e..b36a3b4c60e9c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1843,6 +1843,9 @@ void wake_up_if_idle(int cpu)
bool cpus_share_cache(int this_cpu, int that_cpu) { + if (this_cpu == that_cpu) + return true; + return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } #endif /* CONFIG_SMP */
From: Alexander Antonov alexander.antonov@linux.intel.com
stable inclusion from linux-4.19.218 commit 1b36736c257dc5468338e43d540490274329210c
--------------------------------
[ Upstream commit e324234e0aa881b7841c7c713306403e12b069ff ]
According to the Uncore Reference Manual, any of the CHA events may be filtered by Thread/Core-ID by using the tid modifier in the CHA Filter 0 Register. Update skx_cha_hw_config() to follow the Uncore Guide.
Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov alexander.antonov@linux.intel.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Kan Liang kan.liang@linux.intel.com Link: https://lore.kernel.org/r/20211115090334.3789-2-alexander.antonov@linux.inte... Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/events/intel/uncore_snbep.c | 3 +++ 1 file changed, 3 insertions(+)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 01f53d0d3af18..eaa0a493069b3 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3530,6 +3530,9 @@ static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *ev struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; struct extra_reg *er; int idx = 0; + /* Any of the CHA events may be filtered by Thread/Core-ID.*/ + if (event->hw.config & SNBEP_CBO_PMON_CTL_TID_EN) + idx = SKX_CHA_MSR_PMON_BOX_FILTER_TID;
for (er = skx_uncore_cha_extra_regs; er->msr; er++) { if (er->event != (event->hw.config & er->config_mask))
From: Alexander Antonov alexander.antonov@linux.intel.com
stable inclusion from linux-4.19.218 commit 4c56f86ced54b7cf1727f6cf538776040e742a8a
--------------------------------
[ Upstream commit 3866ae319c846a612109c008f43cba80b8c15e86 ]
According to the latest uncore document, the COMP_BUF_OCCUPANCY (0xd5) event can be collected on counters 2-3. Update the uncore IIO event constraints for Skylake Server accordingly.
Fixes: cd34cd97b7b4 ("perf/x86/intel/uncore: Add Skylake server uncore support") Signed-off-by: Alexander Antonov alexander.antonov@linux.intel.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Reviewed-by: Kan Liang kan.liang@linux.intel.com Link: https://lore.kernel.org/r/20211115090334.3789-3-alexander.antonov@linux.inte... Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/x86/events/intel/uncore_snbep.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index eaa0a493069b3..8326dd5b5068a 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -3600,6 +3600,7 @@ static struct event_constraint skx_uncore_iio_constraints[] = { UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd5, 0xc), EVENT_CONSTRAINT_END };
From: Nicolas Dichtel nicolas.dichtel@6wind.com
stable inclusion from linux-4.19.218 commit a57c5cbfb5c4dff95797a189b4d093b6343bc6d2
--------------------------------
commit a31d27fbed5d518734cb60956303eb15089a7634 upstream.
As stated in the bonding doc, trans_start must be set manually for drivers using NETIF_F_LLTX: Drivers that use NETIF_F_LLTX flag must also update netdev_queue->trans_start. If they do not, then the ARP monitor will immediately fail any slaves using that driver, and those slaves will stay down.
Link: https://www.kernel.org/doc/html/v5.15/networking/bonding.html#arp-monitor-op... Signed-off-by: Nicolas Dichtel nicolas.dichtel@6wind.com Signed-off-by: David S. Miller davem@davemloft.net Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/tun.c | 5 +++++ 1 file changed, 5 insertions(+)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 2ef5fcedf5a08..9e20cd1906498 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1077,6 +1077,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) { struct tun_struct *tun = netdev_priv(dev); int txq = skb->queue_mapping; + struct netdev_queue *queue; struct tun_file *tfile; int len = skb->len;
@@ -1123,6 +1124,10 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) if (ptr_ring_produce(&tfile->tx_ring, skb)) goto drop;
+ /* NETIF_F_LLTX requires to do our own update of trans_start */ + queue = netdev_get_tx_queue(dev, txq); + queue->trans_start = jiffies; + /* Notify and wake up reader process */ if (tfile->flags & TUN_FASYNC) kill_fasync(&tfile->fasync, SIGIO, POLL_IN);
From: Alexander Mikhalitsyn alexander.mikhalitsyn@virtuozzo.com
stable inclusion from linux-4.19.218 commit 766e08a7af4e61c8837ab1f41228ab5980868784
--------------------------------
commit 126e8bee943e9926238c891e2df5b5573aee76bc upstream.
Patch series "shm: shm_rmid_forced feature fixes".
Some time ago I hit a kernel crash after a CRIU restore procedure. Fortunately, since it was a CRIU restore, I had dump files and could repeat the restore many times, and the crash reproduced easily. After some investigation I constructed a minimal reproducer. It turned out to be a use-after-free that happens only if sysctl kernel.shm_rmid_forced = 1.
The crux of the problem is that the exit_shm() function does not handle destroying shp objects when task->sysvshm.shm_clist contains items from different IPC namespaces. In most cases this list will contain only items from one IPC namespace.
How can this list contain objects from different namespaces? The exit_shm() function is designed to always clean up this list when a process leaves its IPC namespace. But we made a mistake a long time ago and did not add an exit_shm() call to the setns() syscall procedures.
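A minimal user-space sketch of how such a mixed list can arise, assuming the caller has the privileges required for setns() and an fd referring to a different IPC namespace (the /proc path below is a placeholder; error handling is omitted):

#include <fcntl.h>
#include <sched.h>
#include <sys/ipc.h>
#include <sys/shm.h>
#include <unistd.h>

int main(void)
{
	/* Segment #1 is created in (and tracked by) the original IPC namespace. */
	int id1 = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	/* Switch to another IPC namespace; setns() never calls exit_shm(),
	 * so segment #1 stays on task->sysvshm.shm_clist. */
	int nsfd = open("/proc/some-other-task/ns/ipc", O_RDONLY); /* placeholder path */
	setns(nsfd, CLONE_NEWIPC);

	/* Segment #2 is created in the new namespace; the per-task list now
	 * mixes objects from two IPC namespaces. */
	int id2 = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	(void)id1; (void)id2;
	return 0;
}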
The first idea was just to add this call to the setns() syscall, but that obviously changes the semantics of setns() and is a userspace-visible change. So I gave up on this idea.
The first real attempt to address the issue was just to omit the forced destroy if we meet an shp object that is not from the current task's IPC namespace [1]. But that was not the best idea, because task->sysvshm.shm_clist was protected by an rwsem which belongs to the current task's IPC namespace. It means that list corruption may occur.
The second approach is to extend exit_shm() to properly handle shp objects from different IPC namespaces [2]. This is a really non-trivial thing; I put a lot of effort into it but did not believe it is possible to make it fully safe, clean and clear.
Thanks to the efforts of Manfred Spraul, an elegant solution was designed. Thanks a lot, Manfred!
Eric also suggested a way to address the issue in ("[RFC][PATCH] shm: In shm_exit destroy all created and never attached segments"). Eric's idea was to maintain a list of shm_clists, one per IPC namespace, using lock-less lists. But there are some extra memory-consumption concerns with that.
An alternative solution, which I suggested, was implemented in ("shm: reset shm_clist on setns but omit forced shm destroy"). The idea is pretty simple: we add an exit_shm() call to setns() but DO NOT destroy shm segments even if sysctl kernel.shm_rmid_forced = 1; we just clean up the task->sysvshm.shm_clist list.
This changes the semantics of the setns() syscall a little bit, but in comparison to the "naive" solution where we just add exit_shm() without any special exclusions, this looks like a safer option.
[1] https://lkml.org/lkml/2021/7/6/1108 [2] https://lkml.org/lkml/2021/7/14/736
This patch (of 2):
Let's produce a warning if we are trying to remove a non-existent IPC object from the IPC namespace kht/idr structures.
This allows us to catch possible bugs when the ipc_rmid() function was called with inconsistent struct ipc_ids*, struct kern_ipc_perm* arguments.
Link: https://lkml.kernel.org/r/20211027224348.611025-1-alexander.mikhalitsyn@virt... Link: https://lkml.kernel.org/r/20211027224348.611025-2-alexander.mikhalitsyn@virt... Co-developed-by: Manfred Spraul manfred@colorfullife.com Signed-off-by: Manfred Spraul manfred@colorfullife.com Signed-off-by: Alexander Mikhalitsyn alexander.mikhalitsyn@virtuozzo.com Cc: "Eric W. Biederman" ebiederm@xmission.com Cc: Davidlohr Bueso dave@stgolabs.net Cc: Greg KH gregkh@linuxfoundation.org Cc: Andrei Vagin avagin@gmail.com Cc: Pavel Tikhomirov ptikhomirov@virtuozzo.com Cc: Vasily Averin vvs@virtuozzo.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- ipc/util.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/ipc/util.c b/ipc/util.c index 53204b1d2e207..5e2674e3d56c7 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -417,8 +417,8 @@ static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids, static void ipc_kht_remove(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { if (ipcp->key != IPC_PRIVATE) - rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, - ipc_kht_params); + WARN_ON_ONCE(rhashtable_remove_fast(&ids->key_ht, &ipcp->khtnode, + ipc_kht_params)); }
/** @@ -433,7 +433,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp) { int idx = ipcid_to_idx(ipcp->id);
- idr_remove(&ids->ipcs_idr, idx); + WARN_ON_ONCE(idr_remove(&ids->ipcs_idr, idx) != ipcp); ipc_kht_remove(ids, ipcp); ids->in_use--; ipcp->deleted = true;
From: Rustam Kovhaev rkovhaev@gmail.com
stable inclusion from linux-4.19.218 commit c52cef42f912ad74cea9e643edef9aec952b23cf
--------------------------------
commit 34dbc3aaf5d9e89ba6cc5e24add9458c21ab1950 upstream.
When kmemleak is enabled for SLOB, the system does not boot and does not print anything to the console. At a very early stage in the boot process we hit infinite recursion from kmemleak_init() and eventually the kernel crashes.
kmemleak_init() specifies SLAB_NOLEAKTRACE for KMEM_CACHE(), but kmem_cache_create_usercopy() removes it because CACHE_CREATE_MASK is not valid for SLOB.
Let's fix CACHE_CREATE_MASK and make kmemleak work with SLOB.
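For illustration only, here is a small stand-alone sketch (with made-up flag values, not the kernel's definitions) of why the flag went missing: the cache-creation path masks the caller's flags with CACHE_CREATE_MASK, and with SLOB's SLAB_CACHE_FLAGS defined as 0 the SLAB_NOLEAKTRACE bit requested by kmemleak_init() was silently dropped.

#include <stdio.h>

/* Illustrative bit values only. */
#define SLAB_NOLEAKTRACE 0x00800000u
#define SLAB_CORE_FLAGS  0x0000f000u
#define SLAB_DEBUG_FLAGS 0x000000f0u

/* CACHE_CREATE_MASK = core flags | debug flags | per-allocator cache flags. */
#define CREATE_MASK(slab_cache_flags) \
	(SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | (slab_cache_flags))

int main(void)
{
	unsigned int requested = SLAB_NOLEAKTRACE;
	unsigned int old_slob = 0u;               /* SLAB_CACHE_FLAGS before the fix */
	unsigned int new_slob = SLAB_NOLEAKTRACE; /* SLAB_CACHE_FLAGS after the fix */

	printf("old SLOB mask keeps NOLEAKTRACE: %s\n",
	       (requested & CREATE_MASK(old_slob)) ? "yes" : "no");
	printf("new SLOB mask keeps NOLEAKTRACE: %s\n",
	       (requested & CREATE_MASK(new_slob)) ? "yes" : "no");
	return 0;
}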
Link: https://lkml.kernel.org/r/20211115020850.3154366-1-rkovhaev@gmail.com Fixes: d8843922fba4 ("slab: Ignore internal flags in cache creation") Signed-off-by: Rustam Kovhaev rkovhaev@gmail.com Acked-by: Vlastimil Babka vbabka@suse.cz Reviewed-by: Muchun Song songmuchun@bytedance.com Cc: Christoph Lameter cl@linux.com Cc: Pekka Enberg penberg@kernel.org Cc: David Rientjes rientjes@google.com Cc: Joonsoo Kim iamjoonsoo.kim@lge.com Cc: Catalin Marinas catalin.marinas@arm.com Cc: Greg Kroah-Hartman gregkh@linuxfoundation.org Cc: Glauber Costa glommer@parallels.com Cc: stable@vger.kernel.org Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/slab.h b/mm/slab.h index 68ad9d348771e..c683b07ff5797 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -148,7 +148,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | SLAB_ACCOUNT) #else -#define SLAB_CACHE_FLAGS (0) +#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif
/* Common flags available with current configuration */
From: Jiri Olsa jolsa@kernel.org
stable inclusion from linux-4.19.119 commit 0cb5f1e1a019d6fd753137e8bc35e3e9b25cf30d
--------------------------------
[ Upstream commit d3296fb372bf7497b0e5d0478c4e7a677ec6f6e9 ]
We hit the following warning when running tests on a kernel compiled with CONFIG_DEBUG_ATOMIC_SLEEP=y:
WARNING: CPU: 19 PID: 4472 at mm/gup.c:2381 __get_user_pages_fast+0x1a4/0x200
CPU: 19 PID: 4472 Comm: dummy Not tainted 5.6.0-rc6+ #3
RIP: 0010:__get_user_pages_fast+0x1a4/0x200
...
Call Trace:
 perf_prepare_sample+0xff1/0x1d90
 perf_event_output_forward+0xe8/0x210
 __perf_event_overflow+0x11a/0x310
 __intel_pmu_pebs_event+0x657/0x850
 intel_pmu_drain_pebs_nhm+0x7de/0x11d0
 handle_pmi_common+0x1b2/0x650
 intel_pmu_handle_irq+0x17b/0x370
 perf_event_nmi_handler+0x40/0x60
 nmi_handle+0x192/0x590
 default_do_nmi+0x6d/0x150
 do_nmi+0x2f9/0x3c0
 nmi+0x8e/0xd7
While __get_user_pages_fast() is IRQ-safe, it calls access_ok(), which warns on:
WARN_ON_ONCE(!in_task() && !pagefault_disabled())
Peter suggested disabling page faults around __get_user_pages_fast(), which gets rid of the warning in the access_ok() call.
Suggested-by: Peter Zijlstra peterz@infradead.org Signed-off-by: Jiri Olsa jolsa@kernel.org Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Signed-off-by: Ingo Molnar mingo@kernel.org Link: https://lkml.kernel.org/r/20200407141427.3184722-1-jolsa@kernel.org Signed-off-by: Sasha Levin sashal@kernel.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/events/core.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index d6293a2f01e9b..84e67e3162d98 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6472,9 +6472,12 @@ static u64 perf_virt_to_phys(u64 virt) * Try IRQ-safe __get_user_pages_fast first. * If failed, leave phys_addr as 0. */ - if ((current->mm != NULL) && - (__get_user_pages_fast(virt, 1, 0, &p) == 1)) - phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + if (current->mm != NULL) { + pagefault_disable(); + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + pagefault_enable(); + }
if (p) put_page(p);
From: Greg Thelen gthelen@google.com
stable inclusion from linux-4.19.218 commit 2786340f4fc97d938ac775d3b6291849090f2056
--------------------------------
commit 4716023a8f6a0f4a28047f14dd7ebdc319606b84 upstream.
PEBS PERF_SAMPLE_PHYS_ADDR events use perf_virt_to_phys() to convert PMU sampled virtual addresses to physical using get_user_page_fast_only() and page_to_phys().
Some get_user_page_fast_only() error cases return false, indicating no page reference, but still initialize the output page pointer with an unreferenced page. In these error cases perf_virt_to_phys() calls put_page(). This causes page reference count underflow, which can lead to unintentional page sharing.
Fix perf_virt_to_phys() to only put_page() if get_user_page_fast_only() returns a referenced page.
Fixes: fc7ce9c74c3ad ("perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR") Signed-off-by: Greg Thelen gthelen@google.com Signed-off-by: Peter Zijlstra (Intel) peterz@infradead.org Link: https://lkml.kernel.org/r/20211111021814.757086-1-gthelen@google.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- kernel/events/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c index 84e67e3162d98..00b22c820d1f3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6454,7 +6454,6 @@ void perf_output_sample(struct perf_output_handle *handle, static u64 perf_virt_to_phys(u64 virt) { u64 phys_addr = 0; - struct page *p = NULL;
if (!virt) return 0; @@ -6473,14 +6472,15 @@ static u64 perf_virt_to_phys(u64 virt) * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { + struct page *p; + pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (__get_user_pages_fast(virt, 1, 0, &p) == 1) { phys_addr = page_to_phys(p) + virt % PAGE_SIZE; + put_page(p); + } pagefault_enable(); } - - if (p) - put_page(p); }
return phys_addr;