From: Daniel Thompson daniel.thompson@linaro.org
from stable-v5.10.119 commit a8f4d63142f947cd22fa615b8b3b8921cdaf4991 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5A5YP CVE: CVE-2022-21499
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
commit eadb2f47a3ced5c64b23b90fd2a3463f63726066 upstream.
KGDB and KDB allow read and write access to kernel memory, and thus should be restricted during lockdown. An attacker with access to a serial port (for example, via a hypervisor console, which some cloud vendors provide over the network) could trigger the debugger so it is important that the debugger respect the lockdown mode when/if it is triggered.
Fix this by integrating lockdown into kdb's existing permissions mechanism. Unfortunately kgdb does not have any permissions mechanism (although it certainly could be added later) so, for now, kgdb is simply and brutally disabled by immediately exiting the gdb stub without taking any action.
For lockdowns established early in the boot (e.g. the normal case) then this should be fine but on systems where kgdb has set breakpoints before the lockdown is enacted than "bad things" will happen.
CVE: CVE-2022-21499 Co-developed-by: Stephen Brennan stephen.s.brennan@oracle.com Signed-off-by: Stephen Brennan stephen.s.brennan@oracle.com Reviewed-by: Douglas Anderson dianders@chromium.org Signed-off-by: Daniel Thompson daniel.thompson@linaro.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Zheng Yejian zhengyejian1@huawei.com Reviewed-by: Xiu Jianfeng xiujianfeng@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- include/linux/security.h | 2 ++ kernel/debug/debug_core.c | 24 ++++++++++++++ kernel/debug/kdb/kdb_main.c | 62 +++++++++++++++++++++++++++++++++++-- security/security.c | 2 ++ 4 files changed, 87 insertions(+), 3 deletions(-)
diff --git a/include/linux/security.h b/include/linux/security.h index 35355429648e..330029ef7e89 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -121,10 +121,12 @@ enum lockdown_reason { LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, LOCKDOWN_BPF_WRITE_USER, + LOCKDOWN_DBG_WRITE_KERNEL, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, LOCKDOWN_BPF_READ, + LOCKDOWN_DBG_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index be5b6b97adbf..363f781b56ca 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -56,6 +56,7 @@ #include <linux/vmacache.h> #include <linux/rcupdate.h> #include <linux/irq.h> +#include <linux/security.h>
#include <asm/cacheflush.h> #include <asm/byteorder.h> @@ -762,6 +763,29 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, continue; kgdb_connected = 0; } else { + /* + * This is a brutal way to interfere with the debugger + * and prevent gdb being used to poke at kernel memory. + * This could cause trouble if lockdown is applied when + * there is already an active gdb session. For now the + * answer is simply "don't do that". Typically lockdown + * *will* be applied before the debug core gets started + * so only developers using kgdb for fairly advanced + * early kernel debug can be biten by this. Hopefully + * they are sophisticated enough to take care of + * themselves, especially with help from the lockdown + * message printed on the console! + */ + if (security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL)) { + if (IS_ENABLED(CONFIG_KGDB_KDB)) { + /* Switch back to kdb if possible... */ + dbg_kdb_mode = 1; + continue; + } else { + /* ... otherwise just bail */ + break; + } + } error = gdb_serial_stub(ks); }
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 930ac1b25ec7..4e09fab52faf 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -45,6 +45,7 @@ #include <linux/proc_fs.h> #include <linux/uaccess.h> #include <linux/slab.h> +#include <linux/security.h> #include "kdb_private.h"
#undef MODULE_PARAM_PREFIX @@ -197,10 +198,62 @@ struct task_struct *kdb_curr_task(int cpu) }
/* - * Check whether the flags of the current command and the permissions - * of the kdb console has allow a command to be run. + * Update the permissions flags (kdb_cmd_enabled) to match the + * current lockdown state. + * + * Within this function the calls to security_locked_down() are "lazy". We + * avoid calling them if the current value of kdb_cmd_enabled already excludes + * flags that might be subject to lockdown. Additionally we deliberately check + * the lockdown flags independently (even though read lockdown implies write + * lockdown) since that results in both simpler code and clearer messages to + * the user on first-time debugger entry. + * + * The permission masks during a read+write lockdown permits the following + * flags: INSPECT, SIGNAL, REBOOT (and ALWAYS_SAFE). + * + * The INSPECT commands are not blocked during lockdown because they are + * not arbitrary memory reads. INSPECT covers the backtrace family (sometimes + * forcing them to have no arguments) and lsmod. These commands do expose + * some kernel state but do not allow the developer seated at the console to + * choose what state is reported. SIGNAL and REBOOT should not be controversial, + * given these are allowed for root during lockdown already. + */ +static void kdb_check_for_lockdown(void) +{ + const int write_flags = KDB_ENABLE_MEM_WRITE | + KDB_ENABLE_REG_WRITE | + KDB_ENABLE_FLOW_CTRL; + const int read_flags = KDB_ENABLE_MEM_READ | + KDB_ENABLE_REG_READ; + + bool need_to_lockdown_write = false; + bool need_to_lockdown_read = false; + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | write_flags)) + need_to_lockdown_write = + security_locked_down(LOCKDOWN_DBG_WRITE_KERNEL); + + if (kdb_cmd_enabled & (KDB_ENABLE_ALL | read_flags)) + need_to_lockdown_read = + security_locked_down(LOCKDOWN_DBG_READ_KERNEL); + + /* De-compose KDB_ENABLE_ALL if required */ + if (need_to_lockdown_write || need_to_lockdown_read) + if (kdb_cmd_enabled & KDB_ENABLE_ALL) + kdb_cmd_enabled = KDB_ENABLE_MASK & ~KDB_ENABLE_ALL; + + if (need_to_lockdown_write) + kdb_cmd_enabled &= ~write_flags; + + if (need_to_lockdown_read) + kdb_cmd_enabled &= ~read_flags; +} + +/* + * Check whether the flags of the current command, the permissions of the kdb + * console and the lockdown state allow a command to be run. */ -static inline bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, +static bool kdb_check_flags(kdb_cmdflags_t flags, int permissions, bool no_args) { /* permissions comes from userspace so needs massaging slightly */ @@ -1194,6 +1247,9 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, kdb_curr_task(raw_smp_processor_id());
KDB_DEBUG_STATE("kdb_local 1", reason); + + kdb_check_for_lockdown(); + kdb_go_count = 0; if (reason == KDB_REASON_DEBUG) { /* special case below */ diff --git a/security/security.c b/security/security.c index 4fb58543eeb9..2fc40217d49d 100644 --- a/security/security.c +++ b/security/security.c @@ -59,10 +59,12 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", + [LOCKDOWN_DBG_WRITE_KERNEL] = "use of kgdb/kdb to write kernel RAM", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_DBG_READ_KERNEL] = "use of kgdb/kdb to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access",
From: Li Lingfeng lilingfeng3@huawei.com
hulk inclusion category: bugfix bugzilla: 186944, https://gitee.com/openeuler/kernel/issues/I5DAJY CVE: NA
--------------------------------
When migrating to extents, the checksum seed of temporary inode need to be replaced by inode's, otherwise the inode checksums will be incorrect when swapping the inodes data.
However, the temporary inode can not match it's checksum to itself since it has lost it's own checksum seed.
mkfs.ext4 -F /dev/sdc mount /dev/sdc /mnt/sdc xfs_io -fc "pwrite 4k 4k" -c "fsync" /mnt/sdc/testfile chattr -e /mnt/sdc/testfile chattr +e /mnt/sdc/testfile fsck -fn /dev/sdc
======== ... Pass 1: Checking inodes, blocks, and sizes Inode 13 passes checks, but checksum does not match inode. Fix? no ... ========
The fix is simple, save the checksum seed of temporary inode, and recover it after migrating to extents.
Fixes: e81c9302a6c3 ("ext4: set csum seed in tmp inode while migrating to extents") Signed-off-by: Li Lingfeng lilingfeng3@huawei.com Reviewed-by: Zhang Yi yi.zhang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/ext4/migrate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 49912814f3d8..04320715d61f 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -417,7 +417,7 @@ int ext4_ext_migrate(struct inode *inode) struct inode *tmp_inode = NULL; struct migrate_struct lb; unsigned long max_entries; - __u32 goal; + __u32 goal, tmp_csum_seed; uid_t owner[2];
/* @@ -465,6 +465,7 @@ int ext4_ext_migrate(struct inode *inode) * the migration. */ ei = EXT4_I(inode); + tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed; EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed; i_size_write(tmp_inode, i_size_read(inode)); /* @@ -575,6 +576,7 @@ int ext4_ext_migrate(struct inode *inode) * the inode is not visible to user space. */ tmp_inode->i_blocks = 0; + EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */ ext4_ext_tree_init(handle, tmp_inode);
From: Hugh Dickins hughd@google.com
mainline inclusion from mainline-v5.15-rc1 commit 51cc3a6620a6ca934d468bda345678768493f5d8 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5CANU CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/fs...
--------------------------------
We had a recurring situation in which admin procedures setting up swapfiles would race with test preparation clearing away swapfiles; and just occasionally that got stuck on a swapfile "(deleted)" which could never be swapped off. That is not supposed to be possible.
2.6.28 commit f9454548e17c ("don't unlink an active swapfile") admitted that it was leaving a race window open: now close it.
may_delete() makes the IS_SWAPFILE check (amongst many others) before inode_lock has been taken on target: now repeat just that simple check in vfs_unlink() and vfs_rename(), after taking inode_lock.
Which goes most of the way to fixing the race, but swapon() must also check after it acquires inode_lock, that the file just opened has not already been unlinked.
Link: https://lkml.kernel.org/r/e17b91ad-a578-9a15-5e3-4989e0f999b5@google.com Fixes: f9454548e17c ("don't unlink an active swapfile") Signed-off-by: Hugh Dickins hughd@google.com Reviewed-by: Jan Kara jack@suse.cz Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Miao Xie miaoxie@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/namei.c | 8 +++++++- mm/swapfile.c | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-)
diff --git a/fs/namei.c b/fs/namei.c index 0782401c6514..4b55e176cbfc 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3818,7 +3818,9 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate return -EPERM;
inode_lock(target); - if (is_local_mountpoint(dentry)) + if (IS_SWAPFILE(target)) + error = -EPERM; + else if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -4282,6 +4284,10 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, else if (target) inode_lock(target);
+ error = -EPERM; + if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target))) + goto out; + error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; diff --git a/mm/swapfile.c b/mm/swapfile.c index 5af6b0f770de..eaf483c7c83e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3167,6 +3167,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; + struct dentry *dentry; int prio; int error; union swap_header *swap_header; @@ -3210,6 +3211,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->swap_file = swap_file; mapping = swap_file->f_mapping; + dentry = swap_file->f_path.dentry; inode = mapping->host;
error = claim_swapfile(p, inode); @@ -3217,6 +3219,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap;
inode_lock(inode); + if (d_unlinked(dentry) || cant_mount(dentry)) { + error = -ENOENT; + goto bad_swap_unlock_inode; + } if (IS_SWAPFILE(inode)) { error = -EBUSY; goto bad_swap_unlock_inode;
From: Tadeusz Struk tadeusz.struk@linaro.org
maillist inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I5D7NF CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?...
--------------------------------
Syzbot found a Use After Free bug in compute_effective_progs(). The reproducer creates a number of BPF links, and causes a fault injected alloc to fail, while calling bpf_link_detach on them. Link detach triggers the link to be freed by bpf_link_free(), which calls __cgroup_bpf_detach() and update_effective_progs(). If the memory allocation in this function fails, the function restores the pointer to the bpf_cgroup_link on the cgroup list, but the memory gets freed just after it returns. After this, every subsequent call to update_effective_progs() causes this already deallocated pointer to be dereferenced in prog_list_length(), and triggers KASAN UAF error.
To fix this issue don't preserve the pointer to the prog or link in the list, but remove it and replace it with a dummy prog without shrinking the table. The subsequent call to __cgroup_bpf_detach() or __cgroup_bpf_detach() will correct it.
Fixes: af6eea57437a ("bpf: Implement bpf_link-based cgroup BPF program attachment") Reported-by: syzbot+f264bffdfbd5614f3bb2@syzkaller.appspotmail.com Signed-off-by: Tadeusz Struk tadeusz.struk@linaro.org Signed-off-by: Andrii Nakryiko andrii@kernel.org Cc: stable@vger.kernel.org Link: https://syzkaller.appspot.com/bug?id=8ebf179a95c2a2670f7cf1ba62429ec044369db... Link: https://lore.kernel.org/bpf/20220517180420.87954-1-tadeusz.struk@linaro.org Signed-off-by: Pu Lehui pulehui@huawei.com Reviewed-by: Kuohai Xu xukuohai@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- kernel/bpf/cgroup.c | 70 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 10 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 7e1e5051468f..e1d0c6248aae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -667,6 +667,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs, return ERR_PTR(-ENOENT); }
+/** + * purge_effective_progs() - After compute_effective_progs fails to alloc new + * cgrp->bpf.inactive table we can recover by + * recomputing the array in place. + * + * @cgrp: The cgroup which descendants to travers + * @prog: A program to detach or NULL + * @link: A link to detach or NULL + * @atype: Type of detach operation + */ +static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog, + struct bpf_cgroup_link *link, + enum cgroup_bpf_attach_type atype) +{ + struct cgroup_subsys_state *css; + struct bpf_prog_array *progs; + struct bpf_prog_list *pl; + struct list_head *head; + struct cgroup *cg; + int pos; + + /* recompute effective prog array in place */ + css_for_each_descendant_pre(css, &cgrp->self) { + struct cgroup *desc = container_of(css, struct cgroup, self); + + if (percpu_ref_is_zero(&desc->bpf.refcnt)) + continue; + + /* find position of link or prog in effective progs array */ + for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) { + if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI)) + continue; + + head = &cg->bpf.progs[atype]; + list_for_each_entry(pl, head, node) { + if (!prog_list_prog(pl)) + continue; + if (pl->prog == prog && pl->link == link) + goto found; + pos++; + } + } +found: + BUG_ON(!cg); + progs = rcu_dereference_protected( + desc->bpf.effective[atype], + lockdep_is_held(&cgroup_mutex)); + + /* Remove the program from the array */ + WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos), + "Failed to purge a prog from array at index %d", pos); + } +} + /** * __cgroup_bpf_detach() - Detach the program or link from a cgroup, and * propagate the change to descendants @@ -686,7 +740,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, struct bpf_prog_list *pl; struct list_head *progs; u32 flags; - int err;
atype = to_cgroup_bpf_attach_type(type); if (atype < 0) @@ -708,9 +761,12 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, pl->prog = NULL; pl->link = NULL;
- err = update_effective_progs(cgrp, atype); - if (err) - goto cleanup; + if (update_effective_progs(cgrp, atype)) { + /* if update effective array failed replace the prog with a dummy prog*/ + pl->prog = old_prog; + pl->link = link; + purge_effective_progs(cgrp, old_prog, link, atype); + }
/* now can actually delete it from this cgroup list */ list_del(&pl->node); @@ -722,12 +778,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, bpf_prog_put(old_prog); static_branch_dec(&cgroup_bpf_enabled_key[atype]); return 0; - -cleanup: - /* restore back prog or link */ - pl->prog = old_prog; - pl->link = link; - return err; }
/* Must be called with cgroup_mutex held to avoid races. */
From: Paolo Bonzini pbonzini@redhat.com
stable inclusion from stable-v5.10.110 commit e90518d10c7dd59d5ebbe25b0f0083a7dbffa42f category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I5869N CVE: CVE-2022-1158
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
commit 2a8859f373b0a86f0ece8ec8312607eacf12485d upstream.
FNAME(cmpxchg_gpte) is an inefficient mess. It is at least decent if it can go through get_user_pages_fast(), but if it cannot then it tries to use memremap(); that is not just terribly slow, it is also wrong because it assumes that the VM_PFNMAP VMA is contiguous.
The right way to do it would be to do the same thing as hva_to_pfn_remapped() does since commit add6a0cd1c5b ("KVM: MMU: try to fix up page faults before giving up", 2016-07-05), using follow_pte() and fixup_user_fault() to determine the correct address to use for memremap(). To do this, one could for example extract hva_to_pfn() for use outside virt/kvm/kvm_main.c. But really there is no reason to do that either, because there is already a perfectly valid address to do the cmpxchg() on, only it is a userspace address. That means doing user_access_begin()/user_access_end() and writing the code in assembly to handle exceptions correctly. Worse, the guest PTE can be 8-byte even on i686 so there is the extra complication of using cmpxchg8b to account for. But at least it is an efficient mess.
(Thanks to Linus for suggesting improvement on the inline assembly).
Reported-by: Qiuhao Li qiuhao@sysec.org Reported-by: Gaoning Pan pgn@zju.edu.cn Reported-by: Yongkang Jia kangel@zju.edu.cn Reported-by: syzbot+6cde2282daa792c49ab8@syzkaller.appspotmail.com Debugged-by: Tadeusz Struk tadeusz.struk@linaro.org Tested-by: Maxim Levitsky mlevitsk@redhat.com Cc: stable@vger.kernel.org Fixes: bd53cb35a3e9 ("X86/KVM: Handle PFNs outside of kernel reach when touching GPTEs") Signed-off-by: Paolo Bonzini pbonzini@redhat.com Signed-off-by: Greg Kroah-Hartman gregkh@linuxfoundation.org Signed-off-by: Chen Wandun chenwandun@huawei.com Reviewed-by: Kefeng Wang wangkefeng.wang@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- arch/x86/kvm/mmu/paging_tmpl.h | 77 ++++++++++++++++------------------ 1 file changed, 37 insertions(+), 40 deletions(-)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index f8829134bf34..c6daeeff1d9c 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -34,9 +34,8 @@ #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgq" #else - #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 2 #endif #elif PTTYPE == 32 @@ -52,7 +51,7 @@ #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true - #define CMPXCHG cmpxchg + #define CMPXCHG "cmpxchgl" #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT @@ -65,7 +64,9 @@ #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) - #define CMPXCHG cmpxchg64 + #ifdef CONFIG_X86_64 + #define CMPXCHG "cmpxchgq" + #endif #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value @@ -147,43 +148,39 @@ static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, pt_element_t __user *ptep_user, unsigned index, pt_element_t orig_pte, pt_element_t new_pte) { - int npages; - pt_element_t ret; - pt_element_t *table; - struct page *page; - - npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page); - if (likely(npages == 1)) { - table = kmap_atomic(page); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table); - - kvm_release_page_dirty(page); - } else { - struct vm_area_struct *vma; - unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK; - unsigned long pfn; - unsigned long paddr; - - mmap_read_lock(current->mm); - vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE); - if (!vma || !(vma->vm_flags & VM_PFNMAP)) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - paddr = pfn << PAGE_SHIFT; - table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB); - if (!table) { - mmap_read_unlock(current->mm); - return -EFAULT; - } - ret = CMPXCHG(&table[index], orig_pte, new_pte); - memunmap(table); - mmap_read_unlock(current->mm); - } + int r = -EFAULT; + + if (!user_access_begin(ptep_user, sizeof(pt_element_t))) + return -EFAULT; + +#ifdef CMPXCHG + asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n" + "mov $0, %[r]\n" + "setnz %b[r]\n" + "2:" + _ASM_EXTABLE_UA(1b, 2b) + : [ptr] "+m" (*ptep_user), + [old] "+a" (orig_pte), + [r] "+q" (r) + : [new] "r" (new_pte) + : "memory"); +#else + asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n" + "movl $0, %[r]\n" + "jz 2f\n" + "incl %[r]\n" + "2:" + _ASM_EXTABLE_UA(1b, 2b) + : [ptr] "+m" (*ptep_user), + [old] "+A" (orig_pte), + [r] "+rm" (r) + : [new_lo] "b" ((u32)new_pte), + [new_hi] "c" ((u32)(new_pte >> 32)) + : "memory"); +#endif
- return (ret != orig_pte); + user_access_end(); + return r; }
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,