Boris Ostrovsky (1):
      x86/kvm: Be careful not to clear KVM_VCPU_FLUSH_TLB bit

Li Bin (1):
      iommu/iova: avoid softlockup in fq_flush_timeout

Xiongfeng Wang (1):
      sysrq: avoid concurrently info printing by 'sysrq-trigger'

Yang Yingliang (1):
      timer_list: avoid other cpu soft lockup when printing timer list

Yufen Yu (2):
      bdi: fix memleak in bdi_register_va()
      bdi: get device name under rcu protect

Zhang Xiaoxu (1):
      files_cgroup: fix error pointer when kvm_vm_worker_thread
 arch/x86/kvm/x86.c               |  3 +++
 block/bfq-iosched.c              |  7 +++++--
 block/blk-core.c                 |  4 +++-
 drivers/iommu/iova.c             | 31 +++++++++++++++++++++----------
 drivers/tty/sysrq.c              |  6 ++++++
 fs/filescontrol.c                |  8 +++++++-
 fs/fs-writeback.c                |  4 +++-
 include/linux/backing-dev.h      | 15 +++++++++++++++
 include/linux/iova.h             |  1 +
 include/trace/events/wbt.h       | 12 ++++--------
 include/trace/events/writeback.h | 18 ++++++++----------
 kernel/time/timer_list.c         |  8 ++++++--
 mm/backing-dev.c                 |  4 ++--
 13 files changed, 84 insertions(+), 37 deletions(-)
From: Li Bin <huawei.libin@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 30859
CVE: NA
---------------------------
There is a softlockup under fio pressure test with smmu enabled:

watchdog: BUG: soft lockup - CPU#81 stuck for 22s! [swapper/81:0]
...
Call trace:
 fq_flush_timeout+0xc0/0x110
 call_timer_fn+0x34/0x178
 expire_timers+0xec/0x158
 run_timer_softirq+0xc0/0x1f8
 __do_softirq+0x120/0x324
 irq_exit+0x11c/0x140
 __handle_domain_irq+0x6c/0xc0
 gic_handle_irq+0x6c/0x170
 el1_irq+0xb8/0x140
 arch_cpu_idle+0x38/0x1c0
 default_idle_call+0x24/0x44
 do_idle+0x1f4/0x2d8
 cpu_startup_entry+0x2c/0x30
 secondary_start_kernel+0x17c/0x1c8
This is because the timer callback fq_flush_timeout() may run for more than 10ms, and timers may be processed back to back in softirq context, which triggers the soft lockup. Move the per-CPU fq_ring_free() processing, which may take a long time, into a work item so that it no longer runs in timer/softirq context and the soft lockup is avoided.
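For illustration only, here is a minimal user-space sketch of the same deferral pattern (hypothetical names, pthreads instead of the kernel workqueue API): the "timer" path merely signals a worker and returns quickly, and the slow cleanup runs in the worker's own context, the way schedule_work() moves fq_ring_free() out of softirq context in the diff below.

/* Build with: cc -pthread deferral.c */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool work_pending;
static bool stop;

static void slow_cleanup(void)
{
	/* Stand-in for walking every per-CPU queue under its spinlock. */
	usleep(50 * 1000);
	printf("cleanup done in worker context\n");
}

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!stop) {
		while (!work_pending && !stop)
			pthread_cond_wait(&cond, &lock);
		if (work_pending) {
			work_pending = false;
			pthread_mutex_unlock(&lock);
			slow_cleanup();	/* long work, but not in "timer" context */
			pthread_mutex_lock(&lock);
		}
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void timer_fired(void)
{
	/* Analogue of fq_flush_timeout(): only kick the worker and return. */
	pthread_mutex_lock(&lock);
	work_pending = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);
	timer_fired();
	usleep(200 * 1000);

	pthread_mutex_lock(&lock);
	stop = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(t, NULL);
	return 0;
}

The actual patch uses INIT_WORK()/schedule_work() for the deferral and flush_work() on teardown, as shown below.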
Signed-off-by: Li Bin <huawei.libin@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/iommu/iova.c | 31 +++++++++++++++++++++----------
 include/linux/iova.h |  1 +
 2 files changed, 22 insertions(+), 10 deletions(-)
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 24e3f0c..fd9ab92 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -78,6 +78,7 @@ static void free_iova_flush_queue(struct iova_domain *iovad)

 	del_timer_sync(&iovad->fq_timer);

+	flush_work(&iovad->free_iova_work);
 	fq_destroy_all_entries(iovad);

 	free_percpu(iovad->fq);
@@ -87,6 +88,24 @@ static void free_iova_flush_queue(struct iova_domain *iovad)
 	iovad->entry_dtor = NULL;
 }

+static void fq_ring_free(struct iova_domain *iovad, struct iova_fq *fq);
+static void free_iova_work_func(struct work_struct *work)
+{
+	struct iova_domain *iovad;
+	int cpu;
+
+	iovad = container_of(work, struct iova_domain, free_iova_work);
+	for_each_possible_cpu(cpu) {
+		unsigned long flags;
+		struct iova_fq *fq;
+
+		fq = per_cpu_ptr(iovad->fq, cpu);
+		spin_lock_irqsave(&fq->lock, flags);
+		fq_ring_free(iovad, fq);
+		spin_unlock_irqrestore(&fq->lock, flags);
+	}
+}
+
 int init_iova_flush_queue(struct iova_domain *iovad,
 			  iova_flush_cb flush_cb, iova_entry_dtor entry_dtor)
 {
@@ -117,6 +136,7 @@ int init_iova_flush_queue(struct iova_domain *iovad,

 	iovad->fq = queue;

+	INIT_WORK(&iovad->free_iova_work, free_iova_work_func);
 	timer_setup(&iovad->fq_timer, fq_flush_timeout, 0);
 	atomic_set(&iovad->fq_timer_on, 0);

@@ -541,20 +561,11 @@ static void fq_destroy_all_entries(struct iova_domain *iovad)
 static void fq_flush_timeout(struct timer_list *t)
 {
 	struct iova_domain *iovad = from_timer(iovad, t, fq_timer);
-	int cpu;

 	atomic_set(&iovad->fq_timer_on, 0);
 	iova_domain_flush(iovad);

-	for_each_possible_cpu(cpu) {
-		unsigned long flags;
-		struct iova_fq *fq;
-
-		fq = per_cpu_ptr(iovad->fq, cpu);
-		spin_lock_irqsave(&fq->lock, flags);
-		fq_ring_free(iovad, fq);
-		spin_unlock_irqrestore(&fq->lock, flags);
-	}
+	schedule_work(&iovad->free_iova_work);
 }

 void queue_iova(struct iova_domain *iovad,
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 35c4530..983872f 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -97,6 +97,7 @@ struct iova_domain {
 						   flush-queues */
 	atomic_t fq_timer_on;			/* 1 when timer is active, 0
 						   when not */
+	struct work_struct free_iova_work;
 };

 static inline unsigned long iova_size(struct iova *iova)
From: Yufen Yu <yuyufen@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 30109
CVE: NA
---------------------------
When device_add() fails, we currently just kfree() rcu_dev and leak kobj->name. Use put_device() instead, so that both rcu_dev and kobj->name are freed.
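As an aside, here is a minimal user-space sketch (hypothetical fake_device type, not the driver-core API) of why dropping the last reference is the right cleanup: the release path frees both the object and the name it owns, whereas a bare free of the object leaks the name.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct fake_device {
	int refcount;
	char *name;		/* analogous to kobj->name, owned by the device */
};

static struct fake_device *fake_device_create(const char *name)
{
	struct fake_device *dev = calloc(1, sizeof(*dev));

	if (!dev)
		return NULL;
	dev->refcount = 1;
	dev->name = strdup(name);
	return dev;
}

static void fake_put_device(struct fake_device *dev)
{
	if (--dev->refcount == 0) {
		free(dev->name);	/* released together with the object */
		free(dev);
	}
}

int main(void)
{
	struct fake_device *dev = fake_device_create("bdi-0");

	if (!dev)
		return 1;
	/* Pretend registration failed: drop the reference instead of
	 * free(dev), so the name buffer is not leaked. */
	fake_put_device(dev);
	return 0;
}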
Fixes: 5ca4579ae59b ("bdi: fix use-after-free for the bdi device")
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 mm/backing-dev.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 040d778..75a6117 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -921,7 +921,7 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
 	return 0;

 error:
-	kfree(rcu_dev);
+	put_device(&rcu_dev->dev);
 	return retval;
 }
 EXPORT_SYMBOL(bdi_register_va);
@@ -974,12 +974,12 @@ static void bdi_put_device_rcu(struct rcu_head *rcu)
 void bdi_unregister(struct backing_dev_info *bdi)
 {
 	/* make sure nobody finds us on the bdi_list anymore */
-	struct rcu_device *rcu_dev = bdi->rcu_dev;
 	bdi_remove_from_list(bdi);
 	wb_shutdown(&bdi->wb);
 	cgwb_bdi_unregister(bdi);

 	if (bdi->dev) {
+		struct rcu_device *rcu_dev = bdi->rcu_dev;
 		bdi_debug_unregister(bdi);
 		get_device(bdi->dev);
 		device_unregister(bdi->dev);
From: Xiongfeng Wang <wangxiongfeng2@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 30212
CVE: NA
---------------------------
When we print system information by echoing 't' into 'sysrq-trigger' on several cores at the same time, we get the following call trace.
[ 1352.854632] NMI watchdog: Watchdog detected hard LOCKUP on cpu 6
[ 1352.854633] Modules linked in: nf_log_arp nf_log_ipv6 nf_log_ipv4 nf_log_common binfmt_misc salsa20_generic camellia_generic cast6_generic cast_common rfkill serpent_generic twofish_generic twofish_common xts lrw tgr192 wp512 rmd320 rmd256 rmd160 rmd128 md4 sha512_generic loop jprob(OE) ip6t_rpfilter ip6t_REJECT nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_conntrack ebtable_nat ip6table_nat nf_nat_ipv6 ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat_ipv4 nf_nat iptable_mangle iptable_raw iptable_security nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c ip_set nfnetlink ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter vfat fat hns_roce_hw_v2 hns_roce ib_core aes_ce_blk crypto_simd cryptd aes_ce_cipher ghash_ce sha2_ce ipmi_ssif ofpart sha256_arm64 sha1_ce cmdlinepart
[ 1352.854649]  hi_sfc ses enclosure mtd sg sbsa_gwdt ipmi_si ipmi_devintf ipmi_msghandler spi_dw_mmio sch_fq_codel ip_tables ext4 mbcache jbd2 sr_mod cdrom sd_mod realtek hclge hisi_sas_v3_hw hisi_sas_main ahci libsas libahci hns3 hinic libata usb_storage hnae3 megaraid_sas scsi_transport_sas i2c_designware_platform i2c_designware_core dm_multipath dm_mirror dm_region_hash dm_log dm_mod [last unloaded: ip_vs]
[ 1352.854658] CPU: 6 PID: 220569 Comm: sh Kdump: loaded Tainted: G OEL 4.19.90-vhulk2001.1.0.0026.aarch64 #1
[ 1352.854659] Hardware name: Huawei TaiShan 200 (Model 2280)/BC82AMDDA, BIOS 1.06 10/29/2019
[ 1352.854659] pstate: 80400089 (Nzcv daIf +PAN -UAO)
[ 1352.854660] pc : queued_spin_lock_slowpath+0x1d8/0x2e0
[ 1352.854660] lr : print_cpu+0x414/0x690
[ 1352.854660] sp : ffff0001743afb80
[ 1352.854661] x29: ffff0001743afb80 x28: ffff805fcef6e880
[ 1352.854662] x27: 0000000000000000 x26: 0000000000000000
[ 1352.854662] x25: ffff000008cab000 x24: ffff000008cab000
[ 1352.854663] x23: 0000000000000000 x22: 0000000000000000
[ 1352.854664] x21: ffff000009478000 x20: 0000000000900001
[ 1352.854664] x19: ffff000009478d20 x18: ffffffffffffffff
[ 1352.854665] x17: 0000000000000000 x16: 0000000000000000
[ 1352.854666] x15: ffff000009273708 x14: ffff00000947af60
[ 1352.854667] x13: ffff00000947abab x12: ffff00000929d000
[ 1352.854668] x11: 0000000000006fc8 x10: ffff00000947a1c0
[ 1352.854668] x9 : 0000000000000001 x8 : 0000000000000000
[ 1352.854669] x7 : ffff0000092737c8 x6 : ffff803fffc9e1c0
[ 1352.854670] x5 : 0000000000000000 x4 : ffff803fffc9e1c0
[ 1352.854671] x3 : ffff000008f5e000 x2 : 00000000001c0000
[ 1352.854671] x1 : 0000000000000000 x0 : ffff803fffc9e1c8
[ 1352.854672] Call trace:
[ 1352.854673]  queued_spin_lock_slowpath+0x1d8/0x2e0
[ 1352.854673]  print_cpu+0x414/0x690
[ 1352.854673]  sysrq_sched_debug_show+0x50/0x80
[ 1352.854674]  show_state_filter+0xc0/0xd0
[ 1352.854674]  sysrq_handle_showstate+0x18/0x28
[ 1352.854674]  __handle_sysrq+0xa0/0x190
[ 1352.854675]  write_sysrq_trigger+0x70/0x88
[ 1352.854675]  proc_reg_write+0x80/0xd8
[ 1352.854675]  __vfs_write+0x60/0x190
[ 1352.854676]  vfs_write+0xac/0x1c0
[ 1352.854676]  ksys_write+0x74/0xf0
[ 1352.854676]  __arm64_sys_write+0x24/0x30
[ 1352.854677]  el0_svc_common+0x78/0x130
[ 1352.854677]  el0_svc_handler+0x38/0x78
[ 1352.854677]  el0_svc+0x8/0xc
[ 1352.854678] Kernel panic - not syncing: Hard LOCKUP
[ 1352.854679] CPU: 6 PID: 220569 Comm: sh Kdump: loaded Tainted: G OEL 4.19.90-vhulk2001.1.0.0026.aarch64 #1
[ 1352.854679] Hardware name: Huawei TaiShan 200 (Model 2280)/BC82AMDDA, BIOS 1.06 10/29/2019
[ 1352.854679] Call trace:
[ 1352.854680]  dump_backtrace+0x0/0x198
[ 1352.854680]  show_stack+0x24/0x30
[ 1352.854681]  dump_stack+0xa4/0xc4
[ 1352.854681]  panic+0x130/0x304
[ 1352.854681]  __stack_chk_fail+0x0/0x28
[ 1352.854682]  watchdog_hardlockup_check+0x138/0x140
[ 1352.854682]  sdei_watchdog_callback+0x20/0x30
[ 1352.854682]  sdei_event_handler+0x50/0xf0
[ 1352.854683]  __sdei_handler+0xd8/0x228
[ 1352.854683]  __sdei_asm_handler+0xbc/0x134
[ 1352.854683]  queued_spin_lock_slowpath+0x1d8/0x2e0
[ 1352.854684]  print_cpu+0x414/0x690
[ 1352.854684]  sysrq_sched_debug_show+0x50/0x80
[ 1352.854684]  show_state_filter+0xc0/0xd0
[ 1352.854685]  sysrq_handle_showstate+0x18/0x28
[ 1352.854685]  __handle_sysrq+0xa0/0x190
[ 1352.854685]  write_sysrq_trigger+0x70/0x88
[ 1352.854686]  proc_reg_write+0x80/0xd8
[ 1352.854686]  __vfs_write+0x60/0x190
[ 1352.854686]  vfs_write+0xac/0x1c0
[ 1352.854687]  ksys_write+0x74/0xf0
[ 1352.854687]  __arm64_sys_write+0x24/0x30
[ 1352.854687]  el0_svc_common+0x78/0x130
[ 1352.854688]  el0_svc_handler+0x38/0x78
[ 1352.854688]  el0_svc+0x8/0xc
It is because there are many processes in the system. print_cpu() acquires 'sched_debug_lock', prints some information, and then releases 'sched_debug_lock'. This procedure takes about 4 seconds in our test case. When four cores concurrently print system info via sysrq, the last core can wait 12 seconds to get the spinlock, which causes a hardlockup.
Signed-off-by: Kai Shen <shenkai8@huawei.com>
Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 drivers/tty/sysrq.c | 6 ++++++
 1 file changed, 6 insertions(+)
diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c
index aa2e394..72a8c70 100644
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -1091,6 +1091,9 @@ int unregister_sysrq_key(int key, struct sysrq_key_op *op_p)
 EXPORT_SYMBOL(unregister_sysrq_key);

 #ifdef CONFIG_PROC_FS
+
+static DEFINE_MUTEX(sysrq_mutex);
+
 /*
  * writing 'C' to /proc/sysrq-trigger is like sysrq-C
  */
@@ -1102,7 +1105,10 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,

 		if (get_user(c, buf))
 			return -EFAULT;
+
+		mutex_lock(&sysrq_mutex);
 		__handle_sysrq(c, false);
+		mutex_unlock(&sysrq_mutex);
 	}

 	return count;
hulk inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------
If the system has many CPUs (e.g. 128), it takes a long time to print the messages to the console when executing 'echo q > /proc/sysrq-trigger'. When /proc/sys/kernel/numa_balancing is enabled and a migration thread is woken up, that thread cannot make progress until the printing finishes, which triggers a soft lockup.
PID: 619    TASK: ffffa02fdd8bec80  CPU: 121  COMMAND: "migration/121"
 #0 [ffff00000a103b10] __crash_kexec at ffff0000081bf200
 #1 [ffff00000a103ca0] panic at ffff0000080ec93c
 #2 [ffff00000a103d80] watchdog_timer_fn at ffff0000081f8a14
 #3 [ffff00000a103e00] __run_hrtimer at ffff00000819701c
 #4 [ffff00000a103e40] __hrtimer_run_queues at ffff000008197420
 #5 [ffff00000a103ea0] hrtimer_interrupt at ffff00000819831c
 #6 [ffff00000a103f10] arch_timer_dying_cpu at ffff000008b53144
 #7 [ffff00000a103f30] handle_percpu_devid_irq at ffff000008174e34
 #8 [ffff00000a103f70] generic_handle_irq at ffff00000816c5e8
 #9 [ffff00000a103f90] __handle_domain_irq at ffff00000816d1f4
#10 [ffff00000a103fd0] gic_handle_irq at ffff000008081860
--- <IRQ stack> ---
#11 [ffff00000d6e3d50] el1_irq at ffff0000080834c8
#12 [ffff00000d6e3d60] multi_cpu_stop at ffff0000081d9964
#13 [ffff00000d6e3db0] cpu_stopper_thread at ffff0000081d9cfc
#14 [ffff00000d6e3e10] smpboot_thread_fn at ffff00000811e0a8
#15 [ffff00000d6e3e70] kthread at ffff000008118988
To avoid this soft lockup, call touch_all_softlockup_watchdogs() in sysrq_timer_list_show().
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Reviewed-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 kernel/time/timer_list.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 07afcfe..3e74918 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -294,13 +294,17 @@ void sysrq_timer_list_show(void)

 	timer_list_header(NULL, now);

-	for_each_online_cpu(cpu)
+	for_each_online_cpu(cpu) {
+		touch_all_softlockup_watchdogs();
 		print_cpu(NULL, cpu, now);
+	}

 #ifdef CONFIG_GENERIC_CLOCKEVENTS
 	timer_list_show_tickdevices_header(NULL);
-	for_each_online_cpu(cpu)
+	for_each_online_cpu(cpu) {
+		touch_all_softlockup_watchdogs();
 		print_tickdevice(NULL, tick_get_device(cpu), cpu);
+	}
 #endif
 	return;
 }
From: Boris Ostrovsky <boris.ostrovsky@oracle.com>
commit 8c6de56a42e0c657955e12b882a81ef07d1d073e upstream.
kvm_steal_time_set_preempted() may accidentally clear KVM_VCPU_FLUSH_TLB bit if it is called more than once while VCPU is preempted.
This is part of CVE-2019-3016.
(This bug was also independently discovered by Jim Mattson <jmattson@google.com>)
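For illustration, a small stand-alone program (not kernel code; the flag values mirror the x86 kvm_para.h definitions) showing how an unconditional store clobbers a pending KVM_VCPU_FLUSH_TLB request, while the early return added below preserves it:

#include <stdint.h>
#include <stdio.h>

#define KVM_VCPU_PREEMPTED	(1 << 0)
#define KVM_VCPU_FLUSH_TLB	(1 << 1)

static void set_preempted_buggy(uint8_t *preempted)
{
	*preempted = KVM_VCPU_PREEMPTED;	/* clobbers FLUSH_TLB */
}

static void set_preempted_fixed(uint8_t *preempted)
{
	if (*preempted)
		return;				/* already marked, keep other bits */
	*preempted = KVM_VCPU_PREEMPTED;
}

int main(void)
{
	/* vCPU already preempted, with a TLB flush requested meanwhile. */
	uint8_t a = KVM_VCPU_PREEMPTED | KVM_VCPU_FLUSH_TLB;
	uint8_t b = KVM_VCPU_PREEMPTED | KVM_VCPU_FLUSH_TLB;

	set_preempted_buggy(&a);	/* second call while still preempted */
	set_preempted_fixed(&b);

	printf("buggy: FLUSH_TLB %s\n", (a & KVM_VCPU_FLUSH_TLB) ? "kept" : "lost");
	printf("fixed: FLUSH_TLB %s\n", (b & KVM_VCPU_FLUSH_TLB) ? "kept" : "lost");
	return 0;
}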
Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Joao Martins <joao.m.martins@oracle.com>
Cc: stable@vger.kernel.org
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 arch/x86/kvm/x86.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b60867c..c768bf1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3234,6 +3234,9 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;

+	if (vcpu->arch.st.steal.preempted)
+		return;
+
 	vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;

 	kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
From: Yufen Yu <yuyufen@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 30109
CVE: NA
---------------------------
bdi->dev may be set to NULL or freed by bdi_unregister(). To avoid a NULL pointer dereference or use-after-free in its users, add a common helper bdi_get_dev_name(), in which the device is protected by the RCU read lock, so that callers can get the device name safely.
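For illustration, a user-space analogue (hypothetical names, a rwlock standing in for RCU) of the helper's contract: the name is copied into a caller-supplied buffer while the read-side lock is held, so callers never hold on to a pointer that unregistration could free underneath them.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEV_NAME_LEN 32

static pthread_rwlock_t dev_lock = PTHREAD_RWLOCK_INITIALIZER;
static char *dev_name;		/* NULL once the device is unregistered */

static void get_dev_name(char *buf, size_t len)
{
	pthread_rwlock_rdlock(&dev_lock);
	snprintf(buf, len, "%s", dev_name ? dev_name : "(unknown)");
	pthread_rwlock_unlock(&dev_lock);
}

static void unregister_dev(void)
{
	pthread_rwlock_wrlock(&dev_lock);
	free(dev_name);
	dev_name = NULL;
	pthread_rwlock_unlock(&dev_lock);
}

int main(void)
{
	char buf[DEV_NAME_LEN];

	dev_name = strdup("8:0");
	get_dev_name(buf, sizeof(buf));
	printf("before unregister: %s\n", buf);

	unregister_dev();
	get_dev_name(buf, sizeof(buf));
	printf("after unregister: %s\n", buf);
	return 0;
}

The real helper below uses rcu_read_lock()/rcu_dereference() and strlcpy() into a BDI_DEV_NAME_LEN buffer.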
Fixes: 5ca4579ae59b ("bdi: fix use-after-free for the bdi device")
Signed-off-by: Yufen Yu <yuyufen@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 block/bfq-iosched.c              |  7 +++++--
 block/blk-core.c                 |  4 +++-
 fs/fs-writeback.c                |  4 +++-
 include/linux/backing-dev.h      | 15 +++++++++++++++
 include/trace/events/wbt.h       | 12 ++++--------
 include/trace/events/writeback.h | 18 ++++++++----------
 6 files changed, 38 insertions(+), 22 deletions(-)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 66b1ebc..73ef266 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -132,6 +132,7 @@
 #include <linux/ioprio.h>
 #include <linux/sbitmap.h>
 #include <linux/delay.h>
+#include <linux/backing-dev.h>

 #include "blk.h"
 #include "blk-mq.h"
@@ -4204,6 +4205,7 @@ static void bfq_exit_icq(struct io_cq *icq)
 	struct task_struct *tsk = current;
 	int ioprio_class;
 	struct bfq_data *bfqd = bfqq->bfqd;
+	char dname[BDI_DEV_NAME_LEN];

 	if (!bfqd)
 		return;
@@ -4211,8 +4213,9 @@ static void bfq_exit_icq(struct io_cq *icq)
 	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
 	switch (ioprio_class) {
 	default:
-		dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
-			"bfq: bad prio class %d\n", ioprio_class);
+		bdi_get_dev_name(bfqq->bfqd->queue->backing_dev_info,
+				 dname, BDI_DEV_NAME_LEN);
+		pr_err("%s bfq: bad prio class %d\n", dname, ioprio_class);
 		/* fall through */
 	case IOPRIO_CLASS_NONE:
 		/*
diff --git a/block/blk-core.c b/block/blk-core.c
index a52a2f5..b64dec2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1353,6 +1353,7 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	const bool is_sync = op_is_sync(op);
 	int may_queue;
 	req_flags_t rq_flags = RQF_ALLOCED;
+	char dname[BDI_DEV_NAME_LEN];

 	lockdep_assert_held(q->queue_lock);

@@ -1474,8 +1475,9 @@ static struct request *__get_request(struct request_list *rl, unsigned int op,
 	 * shouldn't stall IO. Treat this request as !elvpriv. This will
 	 * disturb iosched and blkcg but weird is bettern than dead.
 	 */
+	bdi_get_dev_name(q->backing_dev_info, dname, BDI_DEV_NAME_LEN);
 	printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n",
-			   __func__, dev_name(q->backing_dev_info->dev));
+			   __func__, dname);

 	rq->rq_flags &= ~RQF_ELVPRIV;
 	rq->elv.icq = NULL;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a89e273..027e504 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1980,8 +1980,10 @@ void wb_workfn(struct work_struct *work)
 	struct bdi_writeback *wb = container_of(to_delayed_work(work),
 						struct bdi_writeback, dwork);
 	long pages_written;
+	char dname[BDI_DEV_NAME_LEN];

-	set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
+	bdi_get_dev_name(wb->bdi, dname, BDI_DEV_NAME_LEN);
+	set_worker_desc("flush-%s", dname);
 	current->flags |= PF_SWAPWRITE;

 	if (likely(!current_is_workqueue_rescuer() ||
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c28a47c..947b046 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -18,6 +18,8 @@
 #include <linux/backing-dev-defs.h>
 #include <linux/slab.h>

+#define BDI_DEV_NAME_LEN	32
+
 static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi)
 {
 	kref_get(&bdi->refcnt);
@@ -498,4 +500,17 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << WB_async_congested));
 }

+static inline void bdi_get_dev_name(struct backing_dev_info *bdi, char *dname,
+				    int len)
+{
+	struct rcu_device *rcu_dev;
+
+	rcu_read_lock();
+
+	rcu_dev = rcu_dereference(bdi->rcu_dev);
+	strlcpy(dname, rcu_dev ? dev_name(&rcu_dev->dev) : "(unknown)", len);
+
+	rcu_read_unlock();
+}
+
 #endif	/* _LINUX_BACKING_DEV_H */
diff --git a/include/trace/events/wbt.h b/include/trace/events/wbt.h
index 37342a1..5dc52ef 100644
--- a/include/trace/events/wbt.h
+++ b/include/trace/events/wbt.h
@@ -33,8 +33,7 @@
 	),

 	TP_fast_assign(
-		strlcpy(__entry->name, dev_name(bdi->dev),
-			ARRAY_SIZE(__entry->name));
+		bdi_get_dev_name(bdi, __entry->name, ARRAY_SIZE(__entry->name));
 		__entry->rmean	= stat[0].mean;
 		__entry->rmin	= stat[0].min;
 		__entry->rmax	= stat[0].max;
@@ -68,8 +67,7 @@
 	),

 	TP_fast_assign(
-		strlcpy(__entry->name, dev_name(bdi->dev),
-			ARRAY_SIZE(__entry->name));
+		bdi_get_dev_name(bdi, __entry->name, ARRAY_SIZE(__entry->name));
 		__entry->lat = div_u64(lat, 1000);
 	),

@@ -105,8 +103,7 @@
 	),

 	TP_fast_assign(
-		strlcpy(__entry->name, dev_name(bdi->dev),
-			ARRAY_SIZE(__entry->name));
+		bdi_get_dev_name(bdi, __entry->name, ARRAY_SIZE(__entry->name));
 		__entry->msg	= msg;
 		__entry->step	= step;
 		__entry->window	= div_u64(window, 1000);
@@ -141,8 +138,7 @@
 	),

 	TP_fast_assign(
-		strlcpy(__entry->name, dev_name(bdi->dev),
-			ARRAY_SIZE(__entry->name));
+		bdi_get_dev_name(bdi, __entry->name, ARRAY_SIZE(__entry->name));
 		__entry->status		= status;
 		__entry->step		= step;
 		__entry->inflight	= inflight;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 32db72c..fbba9d2 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -96,8 +96,7 @@
 		struct backing_dev_info *bdi = inode_to_bdi(inode);

 		/* may be called for files on pseudo FSes w/ unregistered bdi */
-		strncpy(__entry->name,
-			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+		bdi_get_dev_name(bdi, __entry->name, 32);
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->flags		= flags;
@@ -220,8 +219,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name,
-			wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
+		bdi_get_dev_name(wb->bdi, __entry->name, 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
 		__entry->sync_mode = work->sync_mode;
@@ -274,7 +272,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 		__field(unsigned int, cgroup_ino)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		bdi_get_dev_name(wb->bdi, __entry->name, 32);
 		__entry->cgroup_ino = __trace_wb_assign_cgroup(wb);
 	),
 	TP_printk("bdi %s: cgroup_ino=%u",
@@ -296,7 +294,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 		__array(char, name, 32)
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		bdi_get_dev_name(bdi, __entry->name, 32);
 	),
 	TP_printk("bdi %s",
 		__entry->name
@@ -321,7 +319,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 	),

 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		bdi_get_dev_name(bdi, __entry->name, 32);
 		__entry->nr_to_write	= wbc->nr_to_write;
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->sync_mode	= wbc->sync_mode;
@@ -372,7 +370,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
-		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		bdi_get_dev_name(wb->bdi, __entry->name, 32);
 		__entry->older	= older_than_this ? *older_than_this : 0;
 		__entry->age	= older_than_this ?
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
@@ -458,7 +456,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w
 	),

 	TP_fast_assign(
-		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		bdi_get_dev_name(wb->bdi, __entry->bdi, 32);
 		__entry->write_bw	= KBps(wb->write_bandwidth);
 		__entry->avg_write_bw	= KBps(wb->avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
@@ -523,7 +521,7 @@ static inline unsigned int __trace_wbc_assign_cgroup(struct writeback_control *w

 	TP_fast_assign(
 		unsigned long freerun = (thresh + bg_thresh) / 2;
-		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		bdi_get_dev_name(wb->bdi, __entry->bdi, 32);

 		__entry->limit		= global_wb_domain.dirty_limit;
 		__entry->setpoint	= (global_wb_domain.dirty_limit +
From: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
hulk inclusion
category: bugfix
bugzilla: NA
CVE: NA
---------------------------
When fixing CVE-2018-12207, kvm_vm_worker_thread() is attached to all cgroup subsystems, but the files cgroup does not support kernel threads.
Because init_files does not initialize the files cgroup, when the kernel thread 'kvm_vm_worker_thread' is attached to the files cgroup, the files_cgroup obtained from 'init_files' is an error pointer. This leads to the kernel panic below:

[  724.842302]  page_counter_uncharge+0x1d/0x30
[  724.842431]  files_cgroup_attach+0x7c/0x130
[  724.842564]  ? css_set_move_task+0x12e/0x230
[  724.842694]  cgroup_migrate_execute+0x2f9/0x3b0
[  724.842833]  cgroup_attach_task+0x156/0x200
[  724.843010]  ? kvm_mmu_pte_write+0x490/0x490 [kvm]
[  724.843153]  cgroup_attach_task_all+0x81/0xd0
[  724.843289]  ? __schedule+0x294/0x910
[  724.843419]  kvm_vm_worker_thread+0x4a/0xc0 [kvm]
[  724.843579]  ? kvm_exit+0x80/0x80 [kvm]
[  724.843690]  kthread+0x112/0x130
[  724.843792]  ? kthread_create_worker_on_cpu+0x70/0x70
[  724.843948]  ret_from_fork+0x35/0x40
So add a check: if the task is a kernel thread (i.e. its files is 'init_files'), skip the files cgroup operations.
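For illustration, a minimal user-space sketch (hypothetical names) of the guard added below: resources used by kernel threads all point at one static default object, so the accounting path simply skips that sentinel.

#include <stdio.h>

struct files_struct {
	const char *owner;
};

/* Shared default table used by kernel threads; never charged to a cgroup. */
static struct files_struct init_files = { .owner = "kernel" };

static void files_cgroup_charge(struct files_struct *files)
{
	if (files == &init_files)
		return;			/* kernel thread: no files cgroup state */

	printf("charging files of %s\n", files->owner);
}

int main(void)
{
	struct files_struct user_files = { .owner = "user task" };

	files_cgroup_charge(&init_files);	/* silently skipped */
	files_cgroup_charge(&user_files);	/* charged */
	return 0;
}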
Fixes: baa10bc24e1e ("kvm: Add helper function for creating VM ...")
Signed-off-by: Zhang Xiaoxu <zhangxiaoxu5@huawei.com>
Reviewed-by: Jason Yan <yanaijie@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
---
 fs/filescontrol.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/fs/filescontrol.c b/fs/filescontrol.c
index fbaeacb..5ec5096 100644
--- a/fs/filescontrol.c
+++ b/fs/filescontrol.c
@@ -171,7 +171,7 @@ static void files_cgroup_attach(struct cgroup_taskset *tset)

 	task_lock(task);
 	files = task->files;
-	if (!files) {
+	if (!files || files == &init_files) {
 		task_unlock(task);
 		return;
 	}
@@ -311,6 +311,9 @@ void files_cgroup_assign(struct files_struct *files)
 {
 	struct cgroup_subsys_state *css;

+	if (files == &init_files)
+		return;
+
 	css = task_get_css(current, files_cgrp_id);
 	files->files_cgroup = container_of(css, struct files_cgroup, css);
 }
@@ -320,6 +323,9 @@ void files_cgroup_remove(struct files_struct *files)
 	struct task_struct *tsk = current;
 	struct files_cgroup *fcg;

+	if (files == &init_files)
+		return;
+
 	task_lock(tsk);
 	spin_lock(&files->file_lock);
 	fcg = files_cgroup_from_files(files);