From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
share_k2u_to_spg was designed to translate a kernel virtual address to a user-level address and share it with the whole group, but the function used the wrong file to mmap the memory region, so the mapping did not really cover the whole group and consumed more memory. Fix this by using the correct hugepage file and sharing the mapping with every task in the group.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 117 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 30 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index f8dfbe80968b..a83254200e33 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -267,7 +267,10 @@ static int spa_dec_usage(enum spa_type type, unsigned long size, bool is_dvpp) }
static unsigned long sp_mmap(struct mm_struct *mm, struct file *file, - struct sp_area *spa, unsigned long *populate); + struct sp_area *spa, unsigned long *populate); +static void sp_munmap(struct mm_struct *mm, unsigned long addr, unsigned long size); +static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, + struct mm_struct *mm);
static void free_sp_group_id(unsigned int spg_id) { @@ -596,6 +599,15 @@ int sp_group_add_task(int pid, int spg_id) atomic_inc(&spa->use_count); spin_unlock(&sp_area_lock);
+ if (spa->type == SPA_TYPE_K2SPG && spa->kva) { + addr = sp_remap_kva_to_vma(spa->kva, spa, mm); + if (IS_ERR_VALUE(addr)) + pr_warn("share pool: task add group remap k2u failed, ret %ld\n", addr); + + spin_lock(&sp_area_lock); + continue; + } + down_write(&mm->mmap_sem); addr = sp_mmap(mm, file, spa, &populate); if (IS_ERR_VALUE(addr)) { @@ -611,9 +623,11 @@ int sp_group_add_task(int pid, int spg_id) if (populate) { ret = do_mm_populate(mm, spa->va_start, populate, 0); if (ret) { - if (printk_ratelimit()) + if (printk_ratelimit()) { pr_warn("share pool: task add group failed when mm populate " - "failed (potential no enough memory): %d\n", ret); + "failed (potential no enough memory): %d " + "spa flag is %d\n", ret, spa->type); + } sp_munmap_task_areas(mm, spa->link.next); spin_lock(&sp_area_lock); break; @@ -1480,12 +1494,16 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, int hsize_log = MAP_HUGE_2MB >> MAP_HUGE_SHIFT; unsigned long addr, buf, offset;
- if (spa->is_hugepage) { - file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, hsize_log); - if (IS_ERR(file)) { - pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); - return PTR_ERR(file); + if (spg_valid(spa->spg)) { + file = spa_file(spa); + } else { + if (spa->is_hugepage) { + file = hugetlb_file_setup(HUGETLB_ANON_FILE, spa_size(spa), VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, hsize_log); + if (IS_ERR(file)) { + pr_err("share pool: file setup for k2u hugepage failed %ld\n", PTR_ERR(file)); + return PTR_ERR(file); + } } }
@@ -1510,7 +1528,8 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, ret = remap_vmalloc_hugepage_range(vma, (void *)kva, 0); if (ret) { do_munmap(mm, ret_addr, spa_size(spa), NULL); - pr_err("share pool: remap vmalloc hugepage failed, ret %d\n", ret); + pr_err("share pool: remap vmalloc hugepage failed, " + "ret %d, kva is %lx\n", ret, kva); ret_addr = ret; goto put_mm; } @@ -1538,7 +1557,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, up_write(&mm->mmap_sem); mmput(mm); put_file: - if (file) + if (!spa->spg && file) fput(file);
return ret_addr; @@ -1615,6 +1634,35 @@ static void *sp_make_share_kva_to_spg(unsigned long kva, struct sp_area *spa, return p; }
+static bool vmalloc_area_set_flag(struct sp_area *spa, unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + area = find_vm_area((void *)kva); + if (area) { + area->flags |= flags; + spa->kva = kva; + return true; + } + + return false; +} + +static bool vmalloc_area_clr_flag(struct sp_area *spa, unsigned long kva, unsigned long flags) +{ + struct vm_struct *area; + + spa->kva = 0; + + area = find_vm_area((void *)kva); + if (area) { + area->flags &= ~flags; + return true; + } + + return false; +} + /** * Share kernel memory to a specified process or sp_group * @kva: the VA of shared kernel memory @@ -1638,7 +1686,6 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, unsigned long size_aligned; unsigned int page_size = PAGE_SIZE; int ret; - struct vm_struct *area;
if (sp_flags & ~SP_DVPP) { if (printk_ratelimit()) @@ -1679,8 +1726,13 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, PTR_ERR(spa)); return spa; } + + if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + goto out; + } + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); - mutex_unlock(&sp_mutex); } else if (spg_valid(spg)) { /* k2u to group */ if (spg_id != SPG_ID_DEFAULT && spg_id != spg->id) { @@ -1699,26 +1751,31 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, return spa; }
+ if (!vmalloc_area_set_flag(spa, kva_aligned, VM_SHAREPOOL)) { + pr_err("%s: the kva %ld is not valid\n", __func__, kva_aligned); + goto out; + } + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); - mutex_unlock(&sp_mutex); } else { mutex_unlock(&sp_mutex); pr_err("share pool: failed to make k2u\n"); return NULL; }
- if (!IS_ERR(uva)) + if (!IS_ERR(uva)) { uva = uva + (kva - kva_aligned); + } else { + /* associate vma and spa */ + if (!vmalloc_area_clr_flag(spa, kva_aligned, VM_SHAREPOOL)) + pr_warn("share pool: %s: the kva %ld is not valid \n", + __func__, kva_aligned); + }
+out: __sp_area_drop(spa); + mutex_unlock(&sp_mutex);
- if (!IS_ERR(uva)) { - /* associate vma and spa */ - area = find_vm_area((void *)kva); - if (area) - area->flags |= VM_SHAREPOOL; - spa->kva = kva; - } sp_dump_stack();
return uva; @@ -1990,7 +2047,6 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp unsigned long uva_aligned; unsigned long size_aligned; unsigned int page_size; - struct vm_struct *area;
mutex_lock(&sp_mutex); /* @@ -2061,7 +2117,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to task), " "target process mm is exiting\n"); - goto out_drop_area; + goto out_clr_flag; }
if (spa->mm != mm) { @@ -2095,7 +2151,7 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp if (printk_ratelimit()) pr_info("share pool: no need to unshare uva(to group), " "spa doesn't belong to a sp group or group is dead\n"); - goto out_drop_area; + goto out_clr_flag; }
/* alway allow kthread and dvpp channel destroy procedure */ @@ -2112,11 +2168,12 @@ static int sp_unshare_uva(unsigned long uva, unsigned long size, int pid, int sp
sp_dump_stack();
-out_drop_area: +out_clr_flag: /* deassociate vma and spa */ - area = find_vm_area((void *)spa->kva); - if (area) - area->flags &= ~VM_SHAREPOOL; + if (!vmalloc_area_clr_flag(spa, spa->kva, VM_SHAREPOOL)) + pr_warn("share pool: %s: the spa->kva %ld is not valid\n", __func__, spa->kva); + +out_drop_area: __sp_area_drop(spa); out_unlock: mutex_unlock(&sp_mutex); @@ -2162,7 +2219,7 @@ static int sp_unshare_kva(unsigned long kva, unsigned long size) if (page) put_page(page); else - pr_err("share pool: vmalloc %pK to page/hugepage failed\n", + pr_warn("share pool: vmalloc %pK to page/hugepage failed\n", (void *)addr); }
From: Ding Tianhong dingtianhong@huawei.com
ascend inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
The uva returned by k2u is currently shared with the whole group, which is useless for the application and wastes cycles for the API caller. Disable group-wide sharing by default for performance, and only enable it when the user really needs it in the future.
Signed-off-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- mm/share_pool.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index a83254200e33..72e46686566a 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -59,6 +59,9 @@ int enable_mdc_default_group; static const int mdc_default_group_id = 1;
+/* share the uva to the whole group */ +int enable_share_k2u_spg; + /* access control mode */ int sysctl_ac_mode = AC_NONE; /* debug mode */ @@ -1741,7 +1744,12 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, pr_err("share pool: k2spg invalid spg id %d\n", spg_id); return ERR_PTR(-EINVAL); } - spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + + if (enable_share_k2u_spg) + spa = sp_alloc_area(size_aligned, sp_flags, spg, SPA_TYPE_K2SPG); + else + spa = sp_alloc_area(size_aligned, sp_flags, NULL, SPA_TYPE_K2TASK); + if (IS_ERR(spa)) { mutex_unlock(&sp_mutex); if (printk_ratelimit()) @@ -1756,7 +1764,10 @@ void *sp_make_share_k2u(unsigned long kva, unsigned long size, goto out; }
- uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + if (spa->spg) + uva = sp_make_share_kva_to_spg(kva_aligned, spa, spg); + else + uva = sp_make_share_kva_to_task(kva_aligned, spa, pid); } else { mutex_unlock(&sp_mutex); pr_err("share pool: failed to make k2u\n"); @@ -2375,6 +2386,13 @@ static int __init mdc_default_group(char *s) } __setup("enable_mdc_default_group", mdc_default_group);
+static int __init enable_share_k2u_to_group(char *s) +{ + enable_share_k2u_spg = 1; + return 1; +} +__setup("enable_sp_share_k2u_spg", enable_share_k2u_to_group); + int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) {
From: chenjiajun chenjiajun8@huawei.com
euleros inclusion category: bugfix bugzilla: NA CVE: NA
Commit bbb5023b977a ("kvm: debugfs: Export vcpu stat via debugfs") added a vcpu_stat debugfs entry for arm64 and x86, but it introduced some of that code in architecture-independent files, which causes compile errors on other arches.
So, adjust the vcpu_stat arch support to avoid compile errors on arches such as mips, ppc, s390, etc.
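The fix relies on the weak-symbol mechanism: the empty, NULL-terminated dfx_debugfs_entries[] added below is only linked in when an architecture does not provide its own table. A minimal, generic illustration of that pattern (plain userspace C, names hypothetical, not part of the patch):

#include <stdio.h>

struct item { const char *name; };

/* Weak default: an empty, NULL-terminated table. A strong definition of
 * table[] in another object file overrides it at link time.
 */
__attribute__((weak)) struct item table[] = { { NULL } };

int main(void)
{
	struct item *it;

	for (it = table; it->name; it++)
		printf("%s\n", it->name);
	/* Prints nothing unless a strong table[] is linked in. */
	return 0;
}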
Fixes: bbb5023b977 ("kvm: debugfs: Export vcpu stat via debugfs") Signed-off-by: chenjiajun chenjiajun8@huawei.com Reviewed-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/kvm_main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9768ab63ef30..38fbd4c3aba8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -141,6 +141,11 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms;
+/* debugfs entries of Detail For vcpu stat EXtension */ +__weak struct dfx_kvm_stats_debugfs_item dfx_debugfs_entries[] = { + { NULL } +}; + __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, unsigned long start, unsigned long end) { @@ -2908,8 +2913,9 @@ static long kvm_vcpu_ioctl(struct file *filp, if (oldpid) synchronize_rcu(); put_pid(oldpid); - /* NOTE: only work on aarch64 */ +#if defined(CONFIG_X86) || defined(CONFIG_ARM64) vcpu->stat.pid = current->pid; +#endif /* defined(CONFIG_X86) || defined (CONFIG_ARM64) */ } r = kvm_arch_vcpu_ioctl_run(vcpu, vcpu->run); trace_kvm_userspace_exit(vcpu->run->exit_reason, r);
From: Weilong Chen chenweilong@huawei.com
ascend inclusion category: feature bugzilla: NA CVE: NA
-------------------------------------------------
When the CPU is hung up by hardware errors, the board can't restart via a software reset. It needs to be reset by an external reset (for example: the safety island). The BIOS detects this kind of reset, enters the crash kernel and dumps the vmcore.
Usage: 1. Add a node named kexecmailbox to the dts config. 2. After kexec has been run, set sysctl -w kernel.kexec_bios_start=1.
Reviewed-by: Ding Tianhong dingtianhong@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Weilong Chen chenweilong@huawei.com Acked-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/Kconfig | 12 +++++ arch/arm64/configs/hulk_defconfig | 1 + arch/arm64/kernel/Makefile | 1 + arch/arm64/kernel/kexec_mailbox.c | 83 +++++++++++++++++++++++++++++++ include/linux/kexec.h | 8 +++ kernel/kexec_core.c | 32 ++++++++++++ kernel/sysctl.c | 9 ++++ 7 files changed, 146 insertions(+) create mode 100644 arch/arm64/kernel/kexec_mailbox.c
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index cfde721a5961..d93dae75f1a6 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1560,6 +1560,18 @@ config ASCEND_SHARE_POOL help This feature allows multiple processes to share virtual memory both in kernel and user level, which is only enabled for ascend platform. + +config ASCEND_BOOT_CRASH_KERNEL + bool "Support of boot crash kernel from bios" + default y + help + When the CPU is hunged up by hardware errors, board can't restart + by software reset. It needs to be reset by an external reset + (for example: saftyisland). Bios check this kind reset, enter + the crash kernel and dump the vmcore. + Usage: + 1. add a node name:kexecmailbox to dts config. + 2. after kexec run, set sysctl -w kernel.kexec_bios_start=1. endif
endmenu diff --git a/arch/arm64/configs/hulk_defconfig b/arch/arm64/configs/hulk_defconfig index 3b2b957f9ae0..e763311a3f8f 100644 --- a/arch/arm64/configs/hulk_defconfig +++ b/arch/arm64/configs/hulk_defconfig @@ -482,6 +482,7 @@ CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES=y CONFIG_ASCEND_WATCHDOG_SYSFS_CONFIGURE=y CONFIG_ASCEND_AUTO_TUNING_HUGEPAGE=y CONFIG_ASCEND_SHARE_POOL=y +CONFIG_ASCEND_BOOT_CRASH_KERNEL=y CONFIG_ARM64_CNP=y
# diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index d9237117da97..8a15d44fff67 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -54,6 +54,7 @@ arm64-obj-$(CONFIG_RANDOMIZE_BASE) += kaslr.o arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ cpu-reset.o +arm64-obj-$(CONFIG_ASCEND_BOOT_CRASH_KERNEL) += kexec_mailbox.o arm64-obj-$(CONFIG_ARM64_RELOC_TEST) += arm64-reloc-test.o arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o diff --git a/arch/arm64/kernel/kexec_mailbox.c b/arch/arm64/kernel/kexec_mailbox.c new file mode 100644 index 000000000000..1f16fb29c9e4 --- /dev/null +++ b/arch/arm64/kernel/kexec_mailbox.c @@ -0,0 +1,83 @@ +/* + * Huawei Ascend Kexec Mailbox + * + * Copyright (C) 2020 Huawei Limited + * Author: Huawei OS Kernel Lab + * + * This code is based on the hisilicon ascend platform. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kexec.h> +#include <linux/of.h> +#include <linux/of_address.h> + +#include <asm/cacheflush.h> + +#define MAGIC_NO 0x42494F53UL +#define MAILBOX_ADDR 0x880000000UL + +struct kexec_mailbox { + unsigned long magic; + phys_addr_t reboot_code_phys; + unsigned long kimage_head; + unsigned long kimage_start; + unsigned long kimage_pad; +}; + +/* Global variables for the arm64_relocate_new_kernel routine. */ +extern const unsigned char arm64_relocate_new_kernel[]; +extern const unsigned long arm64_relocate_new_kernel_size; + +unsigned long mailbox_addr = MAILBOX_ADDR; + +int bios_setup_kimage(struct kimage *kimage) +{ + struct kexec_mailbox *bios_addr; + phys_addr_t reboot_code_buffer_phys; + void *reboot_code_buffer; + struct device_node *np; + + /* setup mailbox addr */ + np = of_find_node_by_name(NULL, "kexecmailbox"); + if (np) { + struct resource res; + + of_address_to_resource(np, 0, &res); + mailbox_addr = res.start; + of_node_put(np); + pr_info("kexec_mailbox: use dtb config addr %lx\n", mailbox_addr); + } else + pr_info("kexec_mailbox: use default addr %lx\n", mailbox_addr); + + bios_addr = ioremap_cache(mailbox_addr, sizeof(struct kexec_mailbox)); + if (!bios_addr) + return -EINVAL; + + reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); + reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); + memcpy(reboot_code_buffer, arm64_relocate_new_kernel, + arm64_relocate_new_kernel_size); + __flush_dcache_area(reboot_code_buffer, arm64_relocate_new_kernel_size); + __flush_icache_range((uintptr_t)reboot_code_buffer, + arm64_relocate_new_kernel_size); + + bios_addr->magic = MAGIC_NO; + bios_addr->reboot_code_phys = reboot_code_buffer_phys; + bios_addr->kimage_head = kimage->head; + bios_addr->kimage_start = kimage->start; + bios_addr->kimage_pad = 0; + pr_info("kexec_mailbox: magic %lx, reboot_code_phys %llx kimage_head %lx kimage_start %lx kimage_pad %lx\n", + bios_addr->magic, + bios_addr->reboot_code_phys, bios_addr->kimage_head, + bios_addr->kimage_start, bios_addr->kimage_pad); + __flush_dcache_area(bios_addr, sizeof(struct kexec_mailbox)); + __flush_icache_range((uintptr_t)bios_addr, sizeof(struct kexec_mailbox)); + + iounmap((void __iomem *)mailbox_addr); + return 0; +} diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 
d6b8d0a69720..f22c6d882af5 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -285,6 +285,14 @@ extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; extern int kexec_load_disabled;
+#ifdef CONFIG_ASCEND_BOOT_CRASH_KERNEL +extern int bios_setup_kimage(struct kimage *kimage); +extern int kexec_bios_start; +extern int kexec_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + #ifndef kexec_flush_icache_page #define kexec_flush_icache_page(page) #endif diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b36c9c46cd2c..38f930887207 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -987,6 +987,34 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled;
+#ifdef CONFIG_ASCEND_BOOT_CRASH_KERNEL +int kexec_bios_start; +int kexec_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (err) + return err; + + if (write && kexec_bios_start) { + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) + err = bios_setup_kimage(kexec_crash_image); + else + err = -EINVAL; + mutex_unlock(&kexec_mutex); + } + } + if (err) + kexec_bios_start = 0; + + return err; +} +#endif + /* * No panic_cpu check version of crash_kexec(). This function is called * only when panic_cpu holds the current CPU number; this is the only CPU @@ -994,6 +1022,10 @@ int kexec_load_disabled; */ void __noclone __crash_kexec(struct pt_regs *regs) { +#ifdef CONFIG_ASCEND_BOOT_CRASH_KERNEL + if (kexec_bios_start) + return; +#endif /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 26c215fb37dc..dd01fd0e121f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -668,6 +668,15 @@ static struct ctl_table kern_table[] = { .extra1 = &one, .extra2 = &one, }, +#ifdef CONFIG_ASCEND_BOOT_CRASH_KERNEL + { + .procname = "kexec_bios_start", + .data = &kexec_bios_start, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = kexec_sysctl_handler, + }, +#endif #endif #ifdef CONFIG_MODULES {
From: Tang Yizhou tangyizhou@huawei.com
ascend inclusion category: feature bugzilla: 46819 CVE: NA
-------------------------------------------------
The reason for exporting buff_vzalloc_user() is that the gfp_mask __GFP_ACCOUNT can be used to limit memory usage with the memory cgroup.
The same reason applies to buff_vzalloc_hugepage_user(), its hugepage version.
By selecting HAVE_ARCH_HUGE_VMALLOC and enabling boot arg enable_share_pool, buff_vzalloc_user() and vmalloc_hugepage_user() can allocate hugepage memory. Also, vmalloc() will allocate hugepage memory if possible. Reference: https://lwn.net/Articles/839107/
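A minimal sketch of how a driver could consume the exported helper, assuming a hypothetical character device with an mmap handler (demo_mmap and demo_buf are illustrative, not part of this patch): the allocation is charged to the caller's memory cgroup via __GFP_ACCOUNT and, because VM_USERMAP is set on the area, it can be handed to userspace with remap_vmalloc_range():

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/share_pool.h>
#include <linux/vmalloc.h>

static void *demo_buf;

static int demo_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	demo_buf = buff_vzalloc_user(size);	/* zeroed, __GFP_ACCOUNT */
	if (!demo_buf)
		return -ENOMEM;

	/* Allowed because buff_vzalloc_user() sets VM_USERMAP on the area. */
	return remap_vmalloc_range(vma, demo_buf, 0);
}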
Signed-off-by: Tang Yizhou tangyizhou@huawei.com Reviewed-by: Ding Tianhong dingtianhong@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/Kconfig | 4 ++ arch/arm64/Kconfig | 2 + include/linux/share_pool.h | 29 ++++++++++++ include/linux/vmalloc.h | 3 +- mm/share_pool.c | 91 ++++++++++++++++++++++++++++++++++++++ mm/vmalloc.c | 52 +--------------------- 6 files changed, 129 insertions(+), 52 deletions(-)
diff --git a/arch/Kconfig b/arch/Kconfig index bf32f02845c7..de33474c4381 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -544,6 +544,10 @@ config HAVE_ARCH_HUGE_VMAP config HAVE_ARCH_HUGE_VMALLOC depends on HAVE_ARCH_HUGE_VMAP bool + help + Archs that select this would be capable of PMD-sized vmaps (i.e., + arch_vmap_pmd_supported() returns true), and they must make no + assumptions that vmalloc memory is mapped with PAGE_SIZE ptes.
config HAVE_ARCH_SOFT_DIRTY bool diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index d93dae75f1a6..478205b50196 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -106,6 +106,7 @@ config ARM64 select HAVE_ARCH_BITREVERSE select HAVE_ARCH_COMPILER_H select HAVE_ARCH_HUGE_VMAP + select HAVE_ARCH_HUGE_VMALLOC select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB @@ -1557,6 +1558,7 @@ config ASCEND_SHARE_POOL select ARCH_USES_HIGH_VMA_FLAGS select MM_OWNER depends on HUGETLBFS + depends on HAVE_ARCH_HUGE_VMALLOC help This feature allows multiple processes to share virtual memory both in kernel and user level, which is only enabled for ascend platform. diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 933b77be8ff8..3c5a41ae5bd1 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -38,6 +38,10 @@ extern int sysctl_sp_debug_mode;
extern int enable_ascend_share_pool;
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC +extern bool vmap_allow_huge; +#endif + /* Processes in the same sp_group can share memory. * Memory layout for share pool: * @@ -223,6 +227,11 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, struct address_space *mapping, pgoff_t idx, unsigned long address, pte_t *ptep, unsigned int flags);
+extern void *vmalloc_hugepage(unsigned long size); +extern void *vmalloc_hugepage_user(unsigned long size); +extern void *buff_vzalloc_user(unsigned long size); +extern void *buff_vzalloc_hugepage_user(unsigned long size); + #else
static inline int sp_group_add_task(int pid, int spg_id) @@ -365,6 +374,26 @@ static inline bool sp_mmap_check(unsigned long flags) static inline void sp_dump_stack(void) { } + +static inline void *vmalloc_hugepage(unsigned long size) +{ + return NULL; +} + +static inline void *vmalloc_hugepage_user(unsigned long size) +{ + return NULL; +} + +static inline void *buff_vzalloc_user(unsigned long size) +{ + return NULL; +} + +static inline void *buff_vzalloc_hugepage_user(unsigned long size) +{ + return NULL; +} #endif
#endif /* LINUX_SHARE_POOL_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6383d6989c0f..bb814f6418fd 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -99,8 +99,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); -extern void *vmalloc_hugepage(unsigned long size); -extern void *vmalloc_hugepage_user(unsigned long size); + #ifndef CONFIG_MMU extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, diff --git a/mm/share_pool.c b/mm/share_pool.c index 72e46686566a..4fa539e452ef 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2674,11 +2674,102 @@ struct page *sp_alloc_pages(struct vm_struct *area, gfp_t mask, return alloc_pages_node(node, mask, page_order); }
+/** + * vmalloc_hugepage - allocate virtually contiguous hugetlb memory + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL, + VM_HUGE_PAGES, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage); + +/** + * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + */ +void *vmalloc_hugepage_user(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, + VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_hugepage_user); + +/** + * buff_vzalloc_user - allocate zeroed virtually contiguous memory + * for userspace + * @size: allocation size + * + * The resulting memory area is zeroed so it can be mapped to userspace + * without leaking data. + * + * Compare to vmalloc_user(), this is a customized function because + * __GFP_ACCOUNT is used to limit memory usage. + */ +void *buff_vzalloc_user(unsigned long size) +{ + return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, + VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(buff_vzalloc_user); + +/** + * buff_vzalloc_hugepage_user - allocate virtually contiguous hugetlb memory + * for userspace + * @size: allocation size + * + * Allocate enough huge pages to cover @size and map them into + * contiguous kernel virtual space. The resulting memory area + * is zeroed so it can be mapped to userspace without leaking data. + * + * The allocation size is aligned to PMD_SIZE automatically + * + * Compare to vmalloc_hugepage_user(), this is a customized function because + * __GFP_ACCOUNT is used to limit memory usage. + */ +void *buff_vzalloc_hugepage_user(unsigned long size) +{ + /* PMD hugepage aligned */ + size = PMD_ALIGN(size); + + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL | __GFP_ZERO | __GFP_ACCOUNT, PAGE_KERNEL, + VM_HUGE_PAGES | VM_USERMAP, NUMA_NO_NODE, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(buff_vzalloc_hugepage_user); + int enable_ascend_share_pool;
static int __init enable_share_pool(char *s) { enable_ascend_share_pool = 1; + vmap_allow_huge = true;
pr_info("Ascend enable share pool features\n");
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aa2415741d13..6bebb7b52448 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -43,7 +43,7 @@ #include "internal.h"
#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC -static bool __ro_after_init vmap_allow_huge = true; +bool __ro_after_init vmap_allow_huge;
static int __init set_nohugevmalloc(char *str) { @@ -52,7 +52,7 @@ static int __init set_nohugevmalloc(char *str) } early_param("nohugevmalloc", set_nohugevmalloc); #else /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */ -static const bool vmap_allow_huge = false; +static const bool vmap_allow_huge; #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
struct vfree_deferred { @@ -2931,54 +2931,6 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user);
-/** - * vmalloc_hugepage - allocate virtually contiguous hugetlb memory - * @size: allocation size - * - * Allocate enough huge pages to cover @size and map them into - * contiguous kernel virtual space. - * - * The allocation size is aligned to PMD_SIZE automatically - */ -void *vmalloc_hugepage(unsigned long size) -{ - /* PMD hugepage aligned */ - size = PMD_ALIGN(size); - - return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); -} -EXPORT_SYMBOL(vmalloc_hugepage); - -/** - * vmalloc_hugepage_user - allocate virtually contiguous hugetlb memory - * for userspace - * @size: allocation size - * - * Allocate enough huge pages to cover @size and map them into - * contiguous kernel virtual space. The resulting memory area - * is zeroed so it can be mapped to userspace without leaking data. - * - * The allocation size is aligned to PMD_SIZE automatically - */ -void *vmalloc_hugepage_user(unsigned long size) -{ - struct vm_struct *area; - void *ret; - - /* 2M hugepa aligned */ - size = PMD_ALIGN(size); - - ret = __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, - NUMA_NO_NODE, __builtin_return_address(0)); - if (ret) { - area = find_vm_area(ret); - area->flags |= VM_USERMAP; - } - return ret; -} -EXPORT_SYMBOL(vmalloc_hugepage_user); -
/* * small helper routine , copy contents to buf from addr.
From: Pawel Wieczorkiewicz wipawel@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit 1c728719a4da6e654afb9cc047164755072ed7c9 category: bugfix bugzilla: NA CVE: CVE-2020-29569
--------------------------------
When xen_blkif_disconnect() is called, the kernel thread behind the block interface is stopped by calling kthread_stop(ring->xenblkd). The ring->xenblkd thread pointer being non-NULL determines if the thread has been already stopped. Normally, the thread's function xen_blkif_schedule() sets the ring->xenblkd to NULL, when the thread's main loop ends.
However, when the thread has not been started yet (i.e. wake_up_process() has not been called on it), the xen_blkif_schedule() function would not be called yet.
In such a case, the kthread_stop() call returns -EINTR and ring->xenblkd remains dangling. When this happens, any consecutive call to xen_blkif_disconnect() (for example in the frontend_changed() callback) leads to a kernel crash in kthread_stop() (e.g. a NULL pointer dereference in exit_creds()).
This is XSA-350.
Cc: stable@vger.kernel.org # 4.12 Fixes: a24fa22ce22a ("xen/blkback: don't use xen_blkif_get() in xen-blkback kthread") Reported-by: Olivier Benjamin oliben@amazon.com Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Signed-off-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Julien Grall jgrall@amazon.com Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/block/xen-blkback/xenbus.c | 1 + 1 file changed, 1 insertion(+)
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 93896c992245..60594768057e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -264,6 +264,7 @@ static int xen_blkif_disconnect(struct xen_blkif *blkif)
if (ring->xenblkd) { kthread_stop(ring->xenblkd); + ring->xenblkd = NULL; wake_up(&ring->shutdown_wq); }
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit fed1755b118147721f2c87b37b9d66e62c39b668 category: bugfix bugzilla: NA CVE: CVE-2020-29568
--------------------------------
If the handling logic for watch events is slower than the enqueue logic and the events can be created by the guests, the guests could trigger memory pressure by intensively inducing events, because this creates a huge number of pending events that exhaust memory.
Fortunately, some watch events can be ignored, depending on the handler callback. For example, if the callback is interested in only one single path, the watch doesn't want multiple pending events. Or, some watches could ignore events for the same path.
To let such watches voluntarily help avoid the memory pressure situation, this commit introduces a new watch callback, 'will_handle'. If it is not NULL, it is called for each new event just before enqueuing it; if the callback returns false, the event is discarded. No watch is using the callback for now, though.
This is part of XSA-349
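As a hedged illustration only (no in-tree watch uses the callback in this patch, and the function name here is hypothetical), a watch that only cares about its exact node could filter out events for sub-paths like this:

#include <linux/string.h>
#include <xen/xenbus.h>

/* Illustrative only: discard events for children of the watched node,
 * keeping just the events for the exact path this watch registered.
 */
static bool single_path_will_handle(struct xenbus_watch *watch,
				    const char *path, const char *token)
{
	return strcmp(path, watch->node) == 0;
}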
Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park sjpark@amazon.de Reported-by: Michael Kurth mku@amazon.de Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/net/xen-netback/xenbus.c | 4 ++++ drivers/xen/xenbus/xenbus_client.c | 1 + drivers/xen/xenbus/xenbus_xs.c | 5 ++++- include/xen/xenbus.h | 7 +++++++ 4 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index cd51492ae6c2..14273a431d99 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -777,12 +777,14 @@ static int xen_register_credit_watch(struct xenbus_device *dev, return -ENOMEM; snprintf(node, maxlen, "%s/rate", dev->nodename); vif->credit_watch.node = node; + vif->credit_watch.will_handle = NULL; vif->credit_watch.callback = xen_net_rate_changed; err = register_xenbus_watch(&vif->credit_watch); if (err) { pr_err("Failed to set watcher %s\n", vif->credit_watch.node); kfree(node); vif->credit_watch.node = NULL; + vif->credit_watch.will_handle = NULL; vif->credit_watch.callback = NULL; } return err; @@ -829,6 +831,7 @@ static int xen_register_mcast_ctrl_watch(struct xenbus_device *dev, snprintf(node, maxlen, "%s/request-multicast-control", dev->otherend); vif->mcast_ctrl_watch.node = node; + vif->mcast_ctrl_watch.will_handle = NULL; vif->mcast_ctrl_watch.callback = xen_mcast_ctrl_changed; err = register_xenbus_watch(&vif->mcast_ctrl_watch); if (err) { @@ -836,6 +839,7 @@ static int xen_register_mcast_ctrl_watch(struct xenbus_device *dev, vif->mcast_ctrl_watch.node); kfree(node); vif->mcast_ctrl_watch.node = NULL; + vif->mcast_ctrl_watch.will_handle = NULL; vif->mcast_ctrl_watch.callback = NULL; } return err; diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index f7b553faadb1..5a8bd3baa6e5 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -120,6 +120,7 @@ int xenbus_watch_path(struct xenbus_device *dev, const char *path, int err;
watch->node = path; + watch->will_handle = NULL; watch->callback = callback;
err = register_xenbus_watch(watch); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 3a06eb699f33..e8bdbd0a1e26 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -705,7 +705,10 @@ int xs_watch_msg(struct xs_watch_event *event)
spin_lock(&watches_lock); event->handle = find_watch(event->token); - if (event->handle != NULL) { + if (event->handle != NULL && + (!event->handle->will_handle || + event->handle->will_handle(event->handle, + event->path, event->token))) { spin_lock(&watch_events_lock); list_add_tail(&event->list, &watch_events); wake_up(&watch_events_waitq); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 869c816d5f8c..55f543fe0bd8 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -59,6 +59,13 @@ struct xenbus_watch /* Path being watched. */ const char *node;
+ /* + * Called just before enqueing new event while a spinlock is held. + * The event will be discarded if this callback returns false. + */ + bool (*will_handle)(struct xenbus_watch *, + const char *path, const char *token); + /* Callback (executed in a process context with no locks held). */ void (*callback)(struct xenbus_watch *, const char *path, const char *token);
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit 2e85d32b1c865bec703ce0c962221a5e955c52c2 category: bugfix bugzilla: NA CVE: CVE-2020-29568
--------------------------------
Some code does not directly create a 'xenbus_watch' object and call 'register_xenbus_watch()' but uses 'xenbus_watch_path()' instead. This commit adds support for the 'will_handle' callback in 'xenbus_watch_path()' and its wrapper, 'xenbus_watch_pathfmt()'.
This is part of XSA-349
Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park sjpark@amazon.de Reported-by: Michael Kurth mku@amazon.de Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/block/xen-blkback/xenbus.c | 3 ++- drivers/net/xen-netback/xenbus.c | 2 +- drivers/xen/xen-pciback/xenbus.c | 2 +- drivers/xen/xenbus/xenbus_client.c | 9 +++++++-- drivers/xen/xenbus/xenbus_probe.c | 2 +- include/xen/xenbus.h | 6 +++++- 6 files changed, 17 insertions(+), 7 deletions(-)
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 60594768057e..42af2f37ba4e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -652,7 +652,8 @@ static int xen_blkbk_probe(struct xenbus_device *dev, /* setup back pointer */ be->blkif->be = be;
- err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, + err = xenbus_watch_pathfmt(dev, &be->backend_watch, NULL, + backend_changed, "%s/%s", dev->nodename, "physical-device"); if (err) goto fail; diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c index 14273a431d99..107bbd4ae825 100644 --- a/drivers/net/xen-netback/xenbus.c +++ b/drivers/net/xen-netback/xenbus.c @@ -1043,7 +1043,7 @@ static void connect(struct backend_info *be) xenvif_carrier_on(be->vif);
unregister_hotplug_status_watch(be); - err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, + err = xenbus_watch_pathfmt(dev, &be->hotplug_status_watch, NULL, hotplug_status_changed, "%s/%s", dev->nodename, "hotplug-status"); if (!err) diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 3bbed47da3fa..1e2a996c7515 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -688,7 +688,7 @@ static int xen_pcibk_xenbus_probe(struct xenbus_device *dev,
/* watch the backend node for backend configuration information */ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, - xen_pcibk_be_watch); + NULL, xen_pcibk_be_watch); if (err) goto out;
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index 5a8bd3baa6e5..e35bb6b87449 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -114,19 +114,22 @@ EXPORT_SYMBOL_GPL(xenbus_strstate); */ int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *)) { int err;
watch->node = path; - watch->will_handle = NULL; + watch->will_handle = will_handle; watch->callback = callback;
err = register_xenbus_watch(watch);
if (err) { watch->node = NULL; + watch->will_handle = NULL; watch->callback = NULL; xenbus_dev_fatal(dev, err, "adding watch on %s", path); } @@ -153,6 +156,8 @@ EXPORT_SYMBOL_GPL(xenbus_watch_path); */ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *), const char *pathfmt, ...) @@ -169,7 +174,7 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); return -ENOMEM; } - err = xenbus_watch_path(dev, path, watch, callback); + err = xenbus_watch_path(dev, path, watch, will_handle, callback);
if (err) kfree(path); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 5b471889d723..d7474ff2c277 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -136,7 +136,7 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus);
return xenbus_watch_pathfmt(dev, &dev->otherend_watch, - bus->otherend_changed, + NULL, bus->otherend_changed, "%s/%s", dev->otherend, "state"); }
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 55f543fe0bd8..de5ae6c6fa0f 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -199,10 +199,14 @@ void xenbus_probe(struct work_struct *);
int xenbus_watch_path(struct xenbus_device *dev, const char *path, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *)); -__printf(4, 5) +__printf(5, 6) int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch, + bool (*will_handle)(struct xenbus_watch *, + const char *, const char *), void (*callback)(struct xenbus_watch *, const char *, const char *), const char *pathfmt, ...);
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit be987200fbaceaef340872841d4f7af2c5ee8dc3 category: bugfix bugzilla: NA CVE: CVE-2020-29568
--------------------------------
This commit adds support for the 'will_handle' watch callback for 'xen_bus_type' users.
This is part of XSA-349
Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park sjpark@amazon.de Reported-by: Michael Kurth mku@amazon.de Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/xenbus/xenbus.h | 2 ++ drivers/xen/xenbus/xenbus_probe.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index d75a2385b37c..88516a8a9f93 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -44,6 +44,8 @@ struct xen_bus_type { int (*get_bus_id)(char bus_id[XEN_BUS_ID_SIZE], const char *nodename); int (*probe)(struct xen_bus_type *bus, const char *type, const char *dir); + bool (*otherend_will_handle)(struct xenbus_watch *watch, + const char *path, const char *token); void (*otherend_changed)(struct xenbus_watch *watch, const char *path, const char *token); struct bus_type bus; diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d7474ff2c277..e6d0903459e1 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -136,7 +136,8 @@ static int watch_otherend(struct xenbus_device *dev) container_of(dev->dev.bus, struct xen_bus_type, bus);
return xenbus_watch_pathfmt(dev, &dev->otherend_watch, - NULL, bus->otherend_changed, + bus->otherend_will_handle, + bus->otherend_changed, "%s/%s", dev->otherend, "state"); }
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit 3dc86ca6b4c8cfcba9da7996189d1b5a358a94fc category: bugfix bugzilla: NA CVE: CVE-2020-29568
--------------------------------
This commit adds a counter of pending messages for each watch to the struct. It is used to skip the unnecessary pending-message lookup in 'unregister_xenbus_watch()'. It could also be used in the 'will_handle' callback.
This is part of XSA-349
Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park sjpark@amazon.de Reported-by: Michael Kurth mku@amazon.de Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/xenbus/xenbus_xs.c | 29 ++++++++++++++++++----------- include/xen/xenbus.h | 2 ++ 2 files changed, 20 insertions(+), 11 deletions(-)
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e8bdbd0a1e26..12e02eb01f59 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -711,6 +711,7 @@ int xs_watch_msg(struct xs_watch_event *event) event->path, event->token))) { spin_lock(&watch_events_lock); list_add_tail(&event->list, &watch_events); + event->handle->nr_pending++; wake_up(&watch_events_waitq); spin_unlock(&watch_events_lock); } else @@ -768,6 +769,8 @@ int register_xenbus_watch(struct xenbus_watch *watch)
sprintf(token, "%lX", (long)watch);
+ watch->nr_pending = 0; + down_read(&xs_watch_rwsem);
spin_lock(&watches_lock); @@ -817,11 +820,14 @@ void unregister_xenbus_watch(struct xenbus_watch *watch)
/* Cancel pending watch events. */ spin_lock(&watch_events_lock); - list_for_each_entry_safe(event, tmp, &watch_events, list) { - if (event->handle != watch) - continue; - list_del(&event->list); - kfree(event); + if (watch->nr_pending) { + list_for_each_entry_safe(event, tmp, &watch_events, list) { + if (event->handle != watch) + continue; + list_del(&event->list); + kfree(event); + } + watch->nr_pending = 0; } spin_unlock(&watch_events_lock);
@@ -868,7 +874,6 @@ void xs_suspend_cancel(void)
static int xenwatch_thread(void *unused) { - struct list_head *ent; struct xs_watch_event *event;
xenwatch_pid = current->pid; @@ -883,13 +888,15 @@ static int xenwatch_thread(void *unused) mutex_lock(&xenwatch_mutex);
spin_lock(&watch_events_lock); - ent = watch_events.next; - if (ent != &watch_events) - list_del(ent); + event = list_first_entry_or_null(&watch_events, + struct xs_watch_event, list); + if (event) { + list_del(&event->list); + event->handle->nr_pending--; + } spin_unlock(&watch_events_lock);
- if (ent != &watch_events) { - event = list_entry(ent, struct xs_watch_event, list); + if (event) { event->handle->callback(event->handle, event->path, event->token); kfree(event); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index de5ae6c6fa0f..eba01ab5a55e 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -59,6 +59,8 @@ struct xenbus_watch /* Path being watched. */ const char *node;
+ unsigned int nr_pending; + /* * Called just before enqueing new event while a spinlock is held. * The event will be discarded if this callback returns false.
From: SeongJae Park sjpark@amazon.de
mainline inclusion from mainline-v5.11-rc1 commit 9996bd494794a2fe393e97e7a982388c6249aa76 category: bugfix bugzilla: NA CVE: CVE-2020-29568
--------------------------------
'xenbus_backend' watches the 'state' of devices, which is writable by guests. Hence, if guests intensively update it, dom0 will accumulate lots of pending events that exhaust its memory. In other words, guests can trigger dom0 memory pressure. This is known as XSA-349. However, its watch callback, 'frontend_changed()', reads only 'state', so it doesn't need the pending events.
To avoid the problem, this commit disallows pending watch messages for 'xenbus_backend' using the 'will_handle()' watch callback.
This is part of XSA-349
Cc: stable@vger.kernel.org Signed-off-by: SeongJae Park sjpark@amazon.de Reported-by: Michael Kurth mku@amazon.de Reported-by: Pawel Wieczorkiewicz wipawel@amazon.de Reviewed-by: Juergen Gross jgross@suse.com Signed-off-by: Juergen Gross jgross@suse.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com Reviewed-by: Jason Yan yanaijie@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/xen/xenbus/xenbus_probe_backend.c | 7 +++++++ 1 file changed, 7 insertions(+)
diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index b0bed4faf44c..4bb603051d5b 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -180,6 +180,12 @@ static int xenbus_probe_backend(struct xen_bus_type *bus, const char *type, return err; }
+static bool frontend_will_handle(struct xenbus_watch *watch, + const char *path, const char *token) +{ + return watch->nr_pending == 0; +} + static void frontend_changed(struct xenbus_watch *watch, const char *path, const char *token) { @@ -191,6 +197,7 @@ static struct xen_bus_type xenbus_backend = { .levels = 3, /* backend/type/<frontend>/<id> */ .get_bus_id = backend_bus_id, .probe = xenbus_probe_backend, + .otherend_will_handle = frontend_will_handle, .otherend_changed = frontend_changed, .bus = { .name = "xen-backend",
From: Yiwen Jiang jiangyiwen@huawei.com
euleros inclusion category: bugfix bugzilla: 46842 CVE: NA
-------------------------------------------------
On the ARM64 architecture, physical address zero can be used for system RAM, so a virtual machine can be given this segment of memory as VM memory. But in vfio_unmap_unpin(), if the physical address is found to be zero, it triggers a call trace. By contrast, x86 reserves 0~4K and so avoids this scenario.
So avoid the call trace on the ARM64 architecture.
Signed-off-by: Yiwen Jiang jiangyiwen@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- drivers/vfio/vfio_iommu_type1.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 5a106963dd08..8c5c99aad00d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -827,7 +827,9 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, phys_addr_t phys, next;
phys = iommu_iova_to_phys(domain->domain, iova); - if (WARN_ON(!phys)) { + if (!phys) { + pr_warn("%s: phys is 0, it's normal if arch is arm\n", + __func__); iova += PAGE_SIZE; continue; }
From: wanghaibin wanghaibin.wang@huawei.com
euleros inclusion category: bugfix bugzilla: 46842 CVE: NA
-------------------------------------------------
We clear the ICH_LRn registers when saving the ICH* registers. On the next VM entry, if the ICH_LRn registers are not enough (the ICH_HCR.UIE bit will be set to 1) and we set the ICH_HCR register before the ICH_LRn restore, lots of maintenance interrupts are generated. This patch fixes it by restoring ICH_HCR after the LRs.
Signed-off-by: wanghaibin wanghaibin.wang@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/hyp/vgic-v3-sr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index e20e797a1b69..f2f3665bc926 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -248,10 +248,10 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) int i;
if (used_lrs || cpu_if->its_vpe.its_vm) { - write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); - for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); + + write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); }
/*
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: bugfix bugzilla: 46842 CVE: NA
-------------------------------------------------
Doing a full D-cache clean requires walking the stage-2 page table and flushing the entries one by one. If a large amount of memory is mapped, we have to flush hundreds of MB of guest memory, which takes a very long time (Linux times out during CPU bring-up).
If we time out in kvm_toggle_cache(), trigger a WARN_ON() like:
[ 614.952893] WARNING: CPU: 61 PID: 23370 at arch/arm64/kvm/../../../virt/kvm/arm/mmu.c:2467 kvm_toggle_cache+0x188/0x218 [ 614.952902] Modules linked in: ... [ 614.952992] CPU: 61 PID: 23370 Comm: CPU 1/KVM Not tainted 5.1.0-rc4+ #83 [ 614.952997] Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.58 10/24/2018 [ 614.953003] pstate: 60000005 (nZCv daif -PAN -UAO) [ 614.953009] pc : kvm_toggle_cache+0x188/0x218 [ 614.953014] lr : kvm_toggle_cache+0x9c/0x218 [ 614.953018] sp : ffff00002f1f3910 [ 614.953023] x29: ffff00002f1f3910 x28: 0000000000000002 [ 614.953029] x27: ffff000011ab7bd8 x26: ffff000011ab5000 [ 614.953036] x25: ffff000011ab7bd8 x24: 0000000000000001 [ 614.953042] x23: 0000000000000000 x22: ffff80a77a3c0000 [ 614.953048] x21: 0000000000000005 x20: ffff000011aab000 [ 614.953054] x19: fffffffffffffe97 x18: ffffffffffffffff [ 614.953061] x17: 0000000000000000 x16: 0000000000000000 [ 614.953067] x15: ffff000011ab5b48 x14: ffff000012969e88 [ 614.953073] x13: ffff000012969adf x12: 0000000005f5e100 [ 614.953079] x11: 0000000005f5e0ff x10: 000000000000002a [ 614.953086] x9 : abcc77118461cefd x8 : 6d69742033303036 [ 614.953092] x7 : ffff000011ae45b0 x6 : 000000023c75873c [ 614.953099] x5 : 0000000000000000 x4 : 0000000000000001 [ 614.953105] x3 : fffffffffffffe80 x2 : 5631d047c2fe0900 [ 614.953111] x1 : 0000000000000000 x0 : 0000000100013373 [ 614.953118] Call trace: [ 614.953125] kvm_toggle_cache+0x188/0x218 [ 614.953131] access_vm_reg+0x88/0x110 [ 614.953136] perform_access+0x7c/0x1f0 [ 614.953142] kvm_handle_sys_reg+0x130/0x358 [ 614.953147] handle_exit+0x14c/0x1c8 [ 614.953153] kvm_arch_vcpu_ioctl_run+0x324/0xa40 [ 614.953159] kvm_vcpu_ioctl+0x3c8/0xa30 [ 614.953169] do_vfs_ioctl+0xc4/0x7f0 [ 614.953175] ksys_ioctl+0x8c/0xa0 [ 614.953180] __arm64_sys_ioctl+0x28/0x38 [ 614.953187] el0_svc_handler+0xd8/0x1a0 [ 614.953194] el0_svc+0x8/0xc [ 614.953232] ---[ end trace f036f6168107fdfd ]---
which will help in OM.
This commit introduces no functional changes.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/mmu.c | 8 ++++++++ 1 file changed, 8 insertions(+)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c index aef836a93086..3a74cf59f5c8 100644 --- a/virt/kvm/arm/mmu.c +++ b/virt/kvm/arm/mmu.c @@ -2511,6 +2511,7 @@ static bool kvm_need_flush_vm(struct kvm_vcpu *vcpu) void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) { bool now_enabled = vcpu_has_cache_enabled(vcpu); + unsigned long timeout = jiffies + HZ;
/* * If switching the MMU+caches on, need to invalidate the caches. @@ -2524,5 +2525,12 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool was_enabled) if (now_enabled) *vcpu_hcr(vcpu) &= ~HCR_TVM;
+ /* + * Guest's APs will fail to online after waiting for 1 second. + * Tell luser about this issue if already timeout here (mostly + * due to the bad cache maintenance performance). + */ + WARN_ON(time_after(jiffies, timeout)); + trace_kvm_toggle_cache(*vcpu_pc(vcpu), was_enabled, now_enabled); }
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
-------------------------------------------------
Port the "MMIO Access" event reporting from x86 to arm64; the two are exactly the same. Test below:
# ./tools/perf/perf kvm stat report --event=mmio
Analyze events for all VMs, all VCPUs:
MMIOAccess Samples Samples% Time% MinTime MaxTime Avgtime
0x80503000:W 404 100.00% 100.00% 10.86us 62.30us 19.99us(+-2.5%)
Total Samples:404, Total events handled time:8075.60us.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- tools/perf/arch/arm64/util/kvm-stat.c | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+)
diff --git a/tools/perf/arch/arm64/util/kvm-stat.c b/tools/perf/arch/arm64/util/kvm-stat.c index 7e70d93fc843..f2e5db271934 100644 --- a/tools/perf/arch/arm64/util/kvm-stat.c +++ b/tools/perf/arch/arm64/util/kvm-stat.c @@ -85,17 +85,88 @@ static struct kvm_events_ops trap_events = { .name = "TRAP-EVENT", };
+/* + * For the mmio events, we treat: + * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry + * the time of MMIO read: kvm_exit -> kvm_mmio(KVM_TRACE_MMIO_READ...). + */ +static void mmio_event_get_key(struct perf_evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + key->key = perf_evsel__intval(evsel, sample, "gpa"); + key->info = perf_evsel__intval(evsel, sample, "type"); +} + +#define KVM_TRACE_MMIO_READ_UNSATISFIED 0 +#define KVM_TRACE_MMIO_READ 1 +#define KVM_TRACE_MMIO_WRITE 2 + +static bool mmio_event_begin(struct perf_evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + /* MMIO read begin event in kernel. */ + if (kvm_exit_event(evsel)) + return true; + + /* MMIO write begin event in kernel. */ + if (!strcmp(evsel->name, "kvm:kvm_mmio") && + perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) { + mmio_event_get_key(evsel, sample, key); + return true; + } + + return false; +} + +static bool mmio_event_end(struct perf_evsel *evsel, + struct perf_sample *sample, + struct event_key *key) +{ + /* MMIO write end event in kernel. */ + if (kvm_entry_event(evsel)) + return true; + + /* MMIO read end event in kernel.*/ + if (!strcmp(evsel->name, "kvm:kvm_mmio") && + perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) { + mmio_event_get_key(evsel, sample, key); + return true; + } + + return false; +} + +static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused, + struct event_key *key, + char *decode) +{ + scnprintf(decode, decode_str_len, "%#lx:%s", + (unsigned long)key->key, + key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R"); +} + +static struct kvm_events_ops mmio_events = { + .is_begin_event = mmio_event_begin, + .is_end_event = mmio_event_end, + .decode_key = mmio_event_decode_key, + .name = "MMIO Access" +}; + const char *kvm_events_tp[] = { "kvm:kvm_entry", "kvm:kvm_exit", "kvm:kvm_trap_enter", "kvm:kvm_trap_exit", + "kvm:kvm_mmio", NULL, };
struct kvm_reg_events_ops kvm_reg_events_ops[] = { { .name = "vmexit", .ops = &exit_events }, { .name = "trap", .ops = &trap_events }, + { .name = "mmio", .ops = &mmio_events }, { NULL, NULL }, };
From: Xiangyou Xie xiexiangyou@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
-------------------------------------------------
There is a delay when injecting a vtimer interrupt. When a vcpu blocks after a WFx exit, KVM switches to the background timer (bgtimer), which makes the interrupt emulation path longer.
Provide a mechanism, controlled by the new bgtimer_advance_cycles module parameter, to make the bgtimer fire in advance, so that the interrupt can be injected earlier and the delay is reduced.
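The adjustment itself is simple arithmetic; a minimal user-space sketch (not kernel code) is shown below. It mirrors what kvm_timer_earliest_exp() does in the diff: the advance, given in counter cycles, is converted to nanoseconds and subtracted from the earliest guest timer expiry, clamped at zero. The 100 MHz counter frequency and the sample values are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>

/* cycles-to-nanoseconds conversion at an assumed fixed counter frequency */
static uint64_t cycles_to_ns(uint64_t cycles, uint64_t freq_hz)
{
	return cycles * 1000000000ULL / freq_hz;
}

int main(void)
{
	uint64_t min_expire_ns = 500000;	/* earliest guest timer expiry */
	uint64_t advance_cycles = 10000;	/* bgtimer_advance_cycles */
	uint64_t ns = cycles_to_ns(advance_cycles, 100000000ULL);

	/* arm the background hrtimer a bit earlier, never below zero */
	min_expire_ns = min_expire_ns > ns ? min_expire_ns - ns : 0;
	printf("bg timer armed %llu ns from now\n",
	       (unsigned long long)min_expire_ns);
	return 0;
}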
Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- include/kvm/arm_arch_timer.h | 3 +++ virt/kvm/arm/arch_timer.c | 44 ++++++++++++++++++++++++++++++++++-- virt/kvm/arm/arm.c | 4 ++++ 3 files changed, 49 insertions(+), 2 deletions(-)
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index d6e6a45d1d24..33771352dcd6 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -76,6 +76,9 @@ int kvm_arm_timer_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr);
bool kvm_timer_is_pending(struct kvm_vcpu *vcpu);
+void kvm_timer_schedule(struct kvm_vcpu *vcpu); +void kvm_timer_unschedule(struct kvm_vcpu *vcpu); + u64 kvm_phys_timer_read(void);
void kvm_timer_vcpu_load(struct kvm_vcpu *vcpu); diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 0f8cfc95a056..e1dac464e48a 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -36,6 +36,9 @@ static struct timecounter *timecounter; static unsigned int host_vtimer_irq; static u32 host_vtimer_irq_flags;
+static unsigned int bgtimer_advance_cycles; +module_param(bgtimer_advance_cycles, uint, 0644); + static DEFINE_STATIC_KEY_FALSE(has_gic_active_state);
static const struct kvm_irq_level default_ptimer_irq = { @@ -135,6 +138,7 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) u64 min_virt = ULLONG_MAX, min_phys = ULLONG_MAX; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 min_expire;
if (kvm_timer_irq_can_fire(vtimer)) min_virt = kvm_timer_compute_delta(vtimer); @@ -146,7 +150,17 @@ static u64 kvm_timer_earliest_exp(struct kvm_vcpu *vcpu) if ((min_virt == ULLONG_MAX) && (min_phys == ULLONG_MAX)) return 0;
- return min(min_virt, min_phys); + min_expire = min(min_virt, min_phys); + + if (bgtimer_advance_cycles) { + u64 ns = cyclecounter_cyc2ns(timecounter->cc, + bgtimer_advance_cycles, + timecounter->mask, + &timecounter->frac); + min_expire = min_expire > ns ? min_expire - ns : 0; + } + + return min_expire; }
static enum hrtimer_restart kvm_bg_timer_expire(struct hrtimer *hrt) @@ -354,6 +368,7 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + u64 expire;
/* * If both timers are not capable of raising interrupts (disabled or @@ -362,11 +377,30 @@ static void kvm_timer_blocking(struct kvm_vcpu *vcpu) if (!kvm_timer_irq_can_fire(vtimer) && !kvm_timer_irq_can_fire(ptimer)) return;
+ if (hrtimer_active(&timer->bg_timer)) + return; + /* * At least one guest time will expire. Schedule a background timer. * Set the earliest expiration time among the guest timers. */ - soft_timer_start(&timer->bg_timer, kvm_timer_earliest_exp(vcpu)); + expire = kvm_timer_earliest_exp(vcpu); + + if (expire && bgtimer_advance_cycles) { + if (vtimer->cnt_cval > bgtimer_advance_cycles) + vtimer->cnt_cval -= bgtimer_advance_cycles; + } + + soft_timer_start(&timer->bg_timer, expire); +} + +void kvm_timer_schedule(struct kvm_vcpu *vcpu) +{ + if (!bgtimer_advance_cycles || kvm_timer_is_pending(vcpu)) + return; + + vtimer_save_state(vcpu); + kvm_timer_blocking(vcpu); }
static void kvm_timer_unblocking(struct kvm_vcpu *vcpu) @@ -398,6 +432,12 @@ static void vtimer_restore_state(struct kvm_vcpu *vcpu) local_irq_restore(flags); }
+void kvm_timer_unschedule(struct kvm_vcpu *vcpu) +{ + vtimer_restore_state(vcpu); + kvm_timer_unblocking(vcpu); +} + static void set_cntvoff(u64 cntvoff) { u32 low = lower_32_bits(cntvoff); diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index 8e04e7ee7d08..f2eec858458b 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -378,11 +378,15 @@ void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) preempt_enable();
kvm_vgic_v4_enable_doorbell(vcpu); + + kvm_timer_schedule(vcpu); }
void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { kvm_vgic_v4_disable_doorbell(vcpu); + + kvm_timer_unschedule(vcpu); }
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
From: Zengruan Ye yezengruan@huawei.com
euleros inclusion category: bugfix bugzilla: 46842 CVE: NA
-------------------------------------------------
In addition to the Kunpeng processors, other CPUs may be supported, so an unknown Hisi CPU type is not necessarily an error; the WARN_ON() call trace here may cause confusion. Downgrade it to a pr_warn().
Signed-off-by: Zengruan Ye yezengruan@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/hisi_cpu_model.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/virt/kvm/arm/hisi_cpu_model.c b/virt/kvm/arm/hisi_cpu_model.c index b0f7e0d7ef1c..52eecf1ba1cf 100644 --- a/virt/kvm/arm/hisi_cpu_model.c +++ b/virt/kvm/arm/hisi_cpu_model.c @@ -78,7 +78,8 @@ void probe_hisi_cpu_type(void) else of_get_hw_cpu_type();
- WARN_ON(hi_cpu_type == UNKNOWN_HI_TYPE); + if (hi_cpu_type == UNKNOWN_HI_TYPE) + pr_warn("UNKNOWN Hisi cpu type.\n"); }
#define NCSNP_MMIO_BASE 0x20107E238
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
--------------------------------
This is very useful for debugging the vgic.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/vgic/trace.h | 43 +++++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.c | 4 ++++ 2 files changed, 47 insertions(+)
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index 55fed77a9f73..f959ffa54156 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -27,6 +27,49 @@ TRACE_EVENT(vgic_update_irq_pending, __entry->vcpu_id, __entry->irq, __entry->level) );
+TRACE_EVENT(compute_ap_list_depth, + TP_PROTO(unsigned long vcpu_id, __u32 irq, __u32 hwirq, __u8 source, + __u8 priority, bool level, bool pending_latch, bool active, + bool enabled, bool hw, bool config), + TP_ARGS(vcpu_id, irq, hwirq, source, priority, level, pending_latch, + active, enabled, hw, config), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(__u32, irq) + __field(__u32, hwirq) + __field(__u8, source) + __field(__u8, priority) + __field(bool, level) + __field(bool, pending_latch) + __field(bool, active) + __field(bool, enabled) + __field(bool, hw) + __field(bool, config) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->hwirq = hwirq; + __entry->source = source; + __entry->priority = priority; + __entry->level = level; + __entry->pending_latch = pending_latch; + __entry->active = active; + __entry->enabled = enabled; + __entry->hw = hw; + __entry->config = config; + ), + + TP_printk("VCPU: %ld, IRQ %d, HWIRQ: %d, SOURCE: %d, PRIORITY: %d, level: %d, pending_latch: %d, active: %d, enabled: %d, hw: %d, config: %d", + __entry->vcpu_id, __entry->irq, __entry->hwirq, + __entry->source, __entry->priority, __entry->level, + __entry->pending_latch, __entry->active, + __entry->enabled, __entry->hw, __entry->config) +); + + #endif /* _TRACE_VGIC_H */
#undef TRACE_INCLUDE_PATH diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index c9c062a39a95..38868b09ba94 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -779,6 +779,10 @@ static int compute_ap_list_depth(struct kvm_vcpu *vcpu, int w;
raw_spin_lock(&irq->irq_lock); + trace_compute_ap_list_depth(vcpu->vcpu_id, irq->intid, + irq->hwintid, irq->source, irq->priority, + irq->line_level, irq->pending_latch, irq->active, + irq->enabled, irq->hw, irq->config); /* GICv2 SGIs can count for more than one... */ w = vgic_irq_get_lr_count(irq); raw_spin_unlock(&irq->irq_lock);
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
--------------------------------
Add trace events to print used_lrs, multi_sgi and vcpu_id.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/vgic/trace.h | 35 +++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic.c | 4 ++++ 2 files changed, 39 insertions(+)
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index f959ffa54156..a7b23721b94f 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -69,6 +69,41 @@ TRACE_EVENT(compute_ap_list_depth, __entry->enabled, __entry->hw, __entry->config) );
+TRACE_EVENT(vgic_set_underflow, + TP_PROTO(unsigned long vcpu_id), + TP_ARGS(vcpu_id), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + ), + + TP_printk("VCPU: %ld", __entry->vcpu_id) +); + +TRACE_EVENT(vgic_flush_lr_state, + TP_PROTO(unsigned long vcpu_id, unsigned int used_lrs, bool multi_sgi), + TP_ARGS(vcpu_id, used_lrs, multi_sgi), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(unsigned int, used_lrs) + __field(bool, multi_sgi) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->used_lrs = used_lrs; + __entry->multi_sgi = multi_sgi; + ), + + TP_printk("VCPU: %ld, used_lrs: %d, multi_sgi: %d", + __entry->vcpu_id, __entry->used_lrs, __entry->multi_sgi) +); +
#endif /* _TRACE_VGIC_H */
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 38868b09ba94..11e471222e21 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -757,6 +757,8 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
static inline void vgic_set_underflow(struct kvm_vcpu *vcpu) { + trace_vgic_set_underflow(vcpu->vcpu_id); + if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_set_underflow(vcpu); else @@ -844,6 +846,8 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
vcpu->arch.vgic_cpu.used_lrs = count;
+ trace_vgic_flush_lr_state(vcpu->vcpu_id, vcpu->arch.vgic_cpu.used_lrs, + multi_sgi); /* Nuke remaining LRs */ for ( ; count < kvm_vgic_global_state.nr_lr; count++) vgic_clear_lr(vcpu, count);
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
--------------------------------
Add trace events to print the LR values.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/vgic/trace.h | 43 +++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-v3.c | 4 ++++ 2 files changed, 47 insertions(+)
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index a7b23721b94f..d9abaac6b4d5 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -104,6 +104,49 @@ TRACE_EVENT(vgic_flush_lr_state, __entry->vcpu_id, __entry->used_lrs, __entry->multi_sgi) );
+TRACE_EVENT(vgic_v3_populate_lr, + TP_PROTO(unsigned long vcpu_id, __u32 irq, __u64 val, int lr), + TP_ARGS(vcpu_id, irq, val, lr), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(__u32, irq) + __field(__u64, val) + __field(int, lr) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->val = val; + __entry->lr = lr; + ), + + TP_printk("VCPU: %ld, IRQ: %d, LR: 0x%llx, Index: %d", + __entry->vcpu_id, __entry->irq, __entry->val, __entry->lr) +); + +TRACE_EVENT(vgic_v3_fold_lr_state, + TP_PROTO(unsigned long vcpu_id, __u32 irq, __u64 val, int lr), + TP_ARGS(vcpu_id, irq, val, lr), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(__u32, irq) + __field(__u64, val) + __field(int, lr) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->irq = irq; + __entry->val = val; + __entry->lr = lr; + ), + + TP_printk("VCPU: %ld, IRQ: %d, LR: 0x%llx, Index: %d", + __entry->vcpu_id, __entry->irq, __entry->val, __entry->lr) +);
#endif /* _TRACE_VGIC_H */
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index 30b3b8284db9..a9c6afec8480 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -21,6 +21,7 @@ #include <asm/kvm_asm.h>
#include "vgic.h" +#include "trace.h"
static bool group0_trap; static bool group1_trap; @@ -67,6 +68,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) is_v2_sgi = vgic_irq_is_sgi(intid); }
+ trace_vgic_v3_fold_lr_state(vcpu->vcpu_id, intid, val, lr); + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) kvm_notify_acked_irq(vcpu->kvm, 0, @@ -207,6 +210,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr) val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; + trace_vgic_v3_populate_lr(vcpu->vcpu_id, irq->intid, val, lr); }
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
--------------------------------
Add more trace logging on guest entry/exit.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/arm.c | 10 +++++++ virt/kvm/arm/trace.h | 65 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+)
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c index f2eec858458b..44fcec4b47b7 100644 --- a/virt/kvm/arm/arm.c +++ b/virt/kvm/arm/arm.c @@ -850,6 +850,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * Enter the guest */ trace_kvm_entry(vcpu->vcpu_id, *vcpu_pc(vcpu)); + trace_kvm_entry_more(*vcpu_pc(vcpu), vcpu_read_elr_el1(vcpu), + *vcpu_cpsr(vcpu), + *vcpu_hcr(vcpu), kvm_vcpu_get_hsr(vcpu), + kvm_vcpu_get_hfar(vcpu), + vcpu->arch.fault.hpfar_el2); guest_enter_irqoff();
if (has_vhe()) { @@ -914,6 +919,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) */ guest_exit(); trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu)); + trace_kvm_exit_more(*vcpu_pc(vcpu), vcpu_read_elr_el1(vcpu), + *vcpu_cpsr(vcpu), + *vcpu_hcr(vcpu), kvm_vcpu_get_hsr(vcpu), + kvm_vcpu_get_hfar(vcpu), + vcpu->arch.fault.hpfar_el2);
/* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, run, ret); diff --git a/virt/kvm/arm/trace.h b/virt/kvm/arm/trace.h index 4aea348252f9..5b7ba18f3b67 100644 --- a/virt/kvm/arm/trace.h +++ b/virt/kvm/arm/trace.h @@ -281,6 +281,71 @@ TRACE_EVENT(kvm_pvsched_kick_vcpu, __entry->vcpu_id, __entry->target_vcpu_id) );
+TRACE_EVENT(kvm_entry_more, + TP_PROTO(unsigned long vcpu_pc, unsigned long elr_el1, + unsigned long cpsr, unsigned long hcr, + unsigned long hsr, unsigned long hxfar, + unsigned long long hpfar), + TP_ARGS(vcpu_pc, elr_el1, cpsr, hcr, hsr, hxfar, hpfar), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, elr_el1) + __field(unsigned long, cpsr) + __field(unsigned long, hcr) + __field(unsigned long, hsr) + __field(unsigned long, hxfar) + __field(unsigned long long, hpfar) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->elr_el1 = elr_el1; + __entry->cpsr = cpsr; + __entry->hcr = hcr; + __entry->hsr = hsr; + __entry->hxfar = hxfar; + __entry->hpfar = hpfar; + ), + + TP_printk("PC: 0x%08lx, ELR_EL1: 0x%08lx, CPSR: 0x%08lx, HCR: 0x%08lx, HSR: 0x%08lx, HXFAR: 0x%08lx, HPFAR: 0x%llx", + __entry->vcpu_pc, __entry->elr_el1, __entry->cpsr, + __entry->hcr, __entry->hsr, __entry->hxfar, __entry->hpfar) +); + +TRACE_EVENT(kvm_exit_more, + TP_PROTO(unsigned long vcpu_pc, unsigned long elr_el1, + unsigned long cpsr, unsigned long hcr, + unsigned long hsr, unsigned long hxfar, + unsigned long long hpfar), + TP_ARGS(vcpu_pc, elr_el1, cpsr, hcr, hsr, hxfar, hpfar), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_pc) + __field(unsigned long, elr_el1) + __field(unsigned long, cpsr) + __field(unsigned long, hcr) + __field(unsigned long, hsr) + __field(unsigned long, hxfar) + __field(unsigned long long, hpfar) + ), + + TP_fast_assign( + __entry->vcpu_pc = vcpu_pc; + __entry->elr_el1 = elr_el1; + __entry->cpsr = cpsr; + __entry->hcr = hcr; + __entry->hsr = hsr; + __entry->hxfar = hxfar; + __entry->hpfar = hpfar; + ), + + TP_printk("PC: 0x%08lx, ELR_EL1: 0x%08lx, CPSR: 0x%08lx, HCR: 0x%08lx, HSR: 0x%08lx, HXFAR: 0x%08lx, HPFAR: 0x%llx", + __entry->vcpu_pc, __entry->elr_el1, __entry->cpsr, + __entry->hcr, __entry->hsr, __entry->hxfar, __entry->hpfar) +); + + #endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
From: Zenghui Yu yuzenghui@huawei.com
euleros inclusion category: feature bugzilla: 46842 CVE: NA
--------------------------------
Add trace events to print the vgic CPU interface state.
Signed-off-by: Zenghui Yu yuzenghui@huawei.com Signed-off-by: Xiangyou Xie xiexiangyou@huawei.com Reviewed-by: Ying Fang fangying1@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- virt/kvm/arm/vgic/trace.h | 93 +++++++++++++++++++++++++++++++++++++ virt/kvm/arm/vgic/vgic-v3.c | 19 ++++++++ 2 files changed, 112 insertions(+)
diff --git a/virt/kvm/arm/vgic/trace.h b/virt/kvm/arm/vgic/trace.h index d9abaac6b4d5..5285048afb25 100644 --- a/virt/kvm/arm/vgic/trace.h +++ b/virt/kvm/arm/vgic/trace.h @@ -148,6 +148,99 @@ TRACE_EVENT(vgic_v3_fold_lr_state, __entry->vcpu_id, __entry->irq, __entry->val, __entry->lr) );
+TRACE_EVENT(vgic_v3_populate_lr_vgic_if, + TP_PROTO(unsigned long vcpu_id, unsigned long hcr, unsigned long vmcr, + unsigned long sre, unsigned long ap0r0, + unsigned long ap0r1, unsigned long ap0r2, unsigned long ap0r3, + unsigned long ap1r0, unsigned long ap1r1, unsigned long ap1r2, + unsigned long ap1r3), + TP_ARGS(vcpu_id, hcr, vmcr, sre, ap0r0, ap0r1, ap0r2, ap0r3, + ap1r0, ap1r1, ap1r2, ap1r3), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(unsigned long, hcr) + __field(unsigned long, vmcr) + __field(unsigned long, sre) + __field(unsigned long, ap0r0) + __field(unsigned long, ap0r1) + __field(unsigned long, ap0r2) + __field(unsigned long, ap0r3) + __field(unsigned long, ap1r0) + __field(unsigned long, ap1r1) + __field(unsigned long, ap1r2) + __field(unsigned long, ap1r3) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hcr = hcr; + __entry->vmcr = vmcr; + __entry->sre = sre; + __entry->ap0r0 = ap0r0; + __entry->ap0r1 = ap0r1; + __entry->ap0r2 = ap0r2; + __entry->ap0r3 = ap0r3; + __entry->ap1r0 = ap1r0; + __entry->ap1r1 = ap1r1; + __entry->ap1r2 = ap1r2; + __entry->ap1r3 = ap1r3; + ), + + TP_printk("VCPU: %ld, HCR: 0x%lx, VMCR: 0x%lx, SRE: 0x%lx, ap0r0: 0x%lx, ap0r1: 0x%lx, ap0r2: 0x%lx, ap0r3: 0x%lx, ap1r0: 0x%lx, ap1r1: 0x%lx, ap1r2: 0x%lx, ap1r3: 0x%lx,", + __entry->vcpu_id, __entry->hcr, __entry->vmcr, + __entry->sre, __entry->ap0r0, __entry->ap0r1, + __entry->ap0r2, __entry->ap0r3, __entry->ap1r0, + __entry->ap1r1, __entry->ap1r2, __entry->ap1r3) +); + +TRACE_EVENT(vgic_v3_fold_lr_state_vgic_if, + TP_PROTO(unsigned long vcpu_id, unsigned long hcr, unsigned long vmcr, + unsigned long sre, unsigned long ap0r0, + unsigned long ap0r1, unsigned long ap0r2, unsigned long ap0r3, + unsigned long ap1r0, unsigned long ap1r1, unsigned long ap1r2, + unsigned long ap1r3), + TP_ARGS(vcpu_id, hcr, vmcr, sre, ap0r0, ap0r1, ap0r2, ap0r3, + ap1r0, ap1r1, ap1r2, ap1r3), + + TP_STRUCT__entry( + __field(unsigned long, vcpu_id) + __field(unsigned long, hcr) + __field(unsigned long, vmcr) + __field(unsigned long, sre) + __field(unsigned long, ap0r0) + __field(unsigned long, ap0r1) + __field(unsigned long, ap0r2) + __field(unsigned long, ap0r3) + __field(unsigned long, ap1r0) + __field(unsigned long, ap1r1) + __field(unsigned long, ap1r2) + __field(unsigned long, ap1r3) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu_id; + __entry->hcr = hcr; + __entry->vmcr = vmcr; + __entry->sre = sre; + __entry->ap0r0 = ap0r0; + __entry->ap0r1 = ap0r1; + __entry->ap0r2 = ap0r2; + __entry->ap0r3 = ap0r3; + __entry->ap1r0 = ap1r0; + __entry->ap1r1 = ap1r1; + __entry->ap1r2 = ap1r2; + __entry->ap1r3 = ap1r3; + ), + + TP_printk("VCPU: %ld, HCR: 0x%lx, VMCR: 0x%lx, SRE: 0x%lx, ap0r0: 0x%lx, ap0r1: 0x%lx, ap0r2: 0x%lx, ap0r3: 0x%lx, ap1r0: 0x%lx, ap1r1: 0x%lx, ap1r2: 0x%lx, ap1r3: 0x%lx,", + __entry->vcpu_id, __entry->hcr, __entry->vmcr, + __entry->sre, __entry->ap0r0, __entry->ap0r1, + __entry->ap0r2, __entry->ap0r3, __entry->ap1r0, + __entry->ap1r1, __entry->ap1r2, __entry->ap1r3) +); + + #endif /* _TRACE_VGIC_H */
#undef TRACE_INCLUDE_PATH diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index a9c6afec8480..4d1f7afe26fe 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -127,6 +127,13 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) }
vgic_cpu->used_lrs = 0; + + trace_vgic_v3_fold_lr_state_vgic_if(vcpu->vcpu_id, cpuif->vgic_hcr, + cpuif->vgic_vmcr, cpuif->vgic_sre, + cpuif->vgic_ap0r[0], cpuif->vgic_ap0r[1], + cpuif->vgic_ap0r[2], cpuif->vgic_ap0r[3], + cpuif->vgic_ap1r[0], cpuif->vgic_ap1r[1], + cpuif->vgic_ap1r[2], cpuif->vgic_ap1r[3]); }
/* Requires the irq to be locked already */ @@ -211,6 +218,18 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val; trace_vgic_v3_populate_lr(vcpu->vcpu_id, irq->intid, val, lr); + trace_vgic_v3_populate_lr_vgic_if(vcpu->vcpu_id, + vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr, + vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr, + vcpu->arch.vgic_cpu.vgic_v3.vgic_sre, + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap0r[0], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap0r[1], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap0r[2], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap0r[3], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap1r[0], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap1r[1], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap1r[2], + vcpu->arch.vgic_cpu.vgic_v3.vgic_ap1r[3]); }
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
From: qinyu qinyu16@huawei.com
euleros inclusion category: feature bugzilla: 46854
--------------------------------
Cloud phone needs:
1. Thermal emulation (CONFIG_THERMAL_EMULATION=y): apps may try to detect thermal-related files/devices, so cloud phone requires this config to be turned on for detection and for future thermal-related adjustments.
2. Cloud phone needs to run 32-bit arm apps, and such compatibility requires:
   1) LSM_MMAP_MIN_ADDR no higher than 32768. The config help for LSM_MMAP_MIN_ADDR says: "On arm and other archs it should not be higher than 32768." 64-bit apps are compatible with the lower value; see the config help for more information (a small sketch of the effect follows this list).
   2) all configs under ARMV8_DEPRECATED, for deprecated-instruction emulation.
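For the LSM_MMAP_MIN_ADDR point above, here is a minimal sketch (not part of the patch) of what the limit means for an app: an unprivileged fixed mapping below the compiled-in minimum is refused by the LSM mmap_min_addr check. Whether the request is actually denied also depends on which LSMs are active and on the vm.mmap_min_addr sysctl; the address and the outcome comments are assumptions for illustration only.

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* 0x9000 = 36864: at or above 32768, but below 65535 */
	void *want = (void *)0x9000;
	void *p = mmap(want, 4096, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0);

	if (p == MAP_FAILED)
		perror("mmap");			/* expected with the previous 65535 minimum */
	else
		printf("mapped at %p\n", p);	/* possible with the new 32768 minimum */
	return 0;
}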
Signed-off-by: liuzixian liuzixian4@huawei.com Signed-off-by: qinyu qinyu16@huawei.com Reviewed-by: Hanjun Guo guohanjun@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- arch/arm64/configs/openeuler_defconfig | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 74a9de3f8540..2251d4d2ffc3 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -453,7 +453,10 @@ CONFIG_UNMAP_KERNEL_AT_EL0=y CONFIG_HARDEN_BRANCH_PREDICTOR=y CONFIG_HARDEN_EL2_VECTORS=y CONFIG_ARM64_SSBD=y -# CONFIG_ARMV8_DEPRECATED is not set +CONFIG_ARMV8_DEPRECATED=y +CONFIG_SWP_EMULATION=y +CONFIG_CP15_BARRIER_EMULATION=y +CONFIG_SETEND_EMULATION=y # CONFIG_ARM64_SW_TTBR0_PAN is not set
# @@ -3549,7 +3552,7 @@ CONFIG_THERMAL_GOV_STEP_WISE=y CONFIG_THERMAL_GOV_USER_SPACE=y # CONFIG_THERMAL_GOV_POWER_ALLOCATOR is not set CONFIG_CPU_THERMAL=y -# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_EMULATION=y CONFIG_HISI_THERMAL=y # CONFIG_QORIQ_THERMAL is not set
@@ -5441,7 +5444,7 @@ CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_INFINIBAND=y CONFIG_SECURITY_NETWORK_XFRM=y CONFIG_SECURITY_PATH=y -CONFIG_LSM_MMAP_MIN_ADDR=65535 +CONFIG_LSM_MMAP_MIN_ADDR=32768 CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y CONFIG_HARDENED_USERCOPY=y CONFIG_HARDENED_USERCOPY_FALLBACK=y
From: Roberto Sassu roberto.sassu@huawei.com
hulk inclusion category: feature feature: digest-lists
---------------------------
The EVM ignore mode works similarly to the metadata modification mode. They both allow an operation to be performed even if the operation causes metadata to become invalid.
Currently, evm_reset_status() notifies IMA that an operation modified metadata only when the metadata modification mode was chosen. This patch sends the notification also when the ignore mode is selected.
Signed-off-by: Roberto Sassu roberto.sassu@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- security/integrity/evm/evm_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/security/integrity/evm/evm_main.c b/security/integrity/evm/evm_main.c index 5155ff4c4ef2..2d3c1670d8d3 100644 --- a/security/integrity/evm/evm_main.c +++ b/security/integrity/evm/evm_main.c @@ -570,7 +570,8 @@ static void evm_reset_status(struct inode *inode, int bit)
iint = integrity_iint_find(inode); if (iint) { - if (evm_initialized & EVM_ALLOW_METADATA_WRITES) + if ((evm_initialized & EVM_ALLOW_METADATA_WRITES) || + evm_ignoremode) set_bit(bit, &iint->atomic_flags);
iint->evm_status = INTEGRITY_UNKNOWN;
From: Roberto Sassu roberto.sassu@huawei.com
hulk inclusion category: feature feature: digest-lists
---------------------------
IMA supports direct uploading of digest lists without specifying the path of the digest list. This feature allows software vendors to provide a digest list in an alternative format (e.g. RPM). A user space parser converts it to the native format and directly uploads it to the kernel.
This feature requires additional protection for the kernel, to ensure that digest lists are converted only by legitimate parsers and to ensure that any file accessed by the parser itself is measured and appraised.
The first protection is provided by ima_check_current_is_parser(), ima_set/unset_parser() and ima_current_is_parser(). The first function checks the digest of the executable of the process accessing the securityfs interface; the second and third set/unset the current process as the parser process; the last tells whether the current process is the parser process.
The second protection is provided by ima_check_measured_appraised(). If ima_current_is_parser() returns true, i.e. the current process is the parser process, it checks whether IMA evaluated the files accessed by that process and, if not, disables digest list lookup.
Unfortunately, ima_set_parser() is called too late, during a write operation, when some files could already have been accessed by the parser. This patch moves ima_set_parser() to ima_open_data_upload() so that more files can be evaluated. A parser should be written so that the securityfs interface is opened before any other file operation.
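A hypothetical user-space parser skeleton is sketched below (it is not part of the patch). It assumes securityfs is mounted at /sys/kernel/security and that the interface added by this series is exposed as ima/digest_list_data; the point is only the ordering, i.e. the upload interface is opened before any other file is touched, so that ima_open_data_upload() can mark the process as the parser as early as possible.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* open the upload interface first, before any other file operation */
	int fd = open("/sys/kernel/security/ima/digest_list_data", O_WRONLY);
	if (fd < 0) {
		perror("open digest_list_data");
		return 1;
	}

	/*
	 * Only now open and convert the vendor-format digest list (e.g. an
	 * RPM header) into the compact native format, then write it here.
	 */
	static const char compact[] = "";	/* converted data would go here */
	if (write(fd, compact, sizeof(compact) - 1) < 0)
		perror("write digest_list_data");

	close(fd);
	return 0;
}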
Signed-off-by: Roberto Sassu roberto.sassu@huawei.com Signed-off-by: Yang Yingliang yangyingliang@huawei.com --- security/integrity/ima/ima_fs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index a5f87fcdf731..2866a0967a1d 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -393,18 +393,16 @@ static ssize_t ima_write_data(struct file *file, const char __user *buf, result = ima_parse_add_rule(data); } } else if (dentry == digest_list_data) { - if (!ima_check_current_is_parser()) { + if (!ima_current_is_parser()) { result = -EACCES; } else { - ima_set_parser(); result = ima_parse_compact_list(datalen, data, DIGEST_LIST_OP_ADD); } } else if (dentry == digest_list_data_del) { - if (!ima_check_current_is_parser()) { + if (!ima_current_is_parser()) { result = -EACCES; } else { - ima_set_parser(); result = ima_parse_compact_list(datalen, data, DIGEST_LIST_OP_DEL); } @@ -479,6 +477,11 @@ static int ima_open_data_upload(struct inode *inode, struct file *filp) } if (test_and_set_bit(flag, &ima_fs_flags)) return -EBUSY; + + if (dentry == digest_list_data || dentry == digest_list_data_del) + if (ima_check_current_is_parser()) + ima_set_parser(); + return 0; }