From: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>

zhaoxin inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I40QDN
CVE: NA
----------------------------------------------------------------
Fix a logic issue when displaying the Zhaoxin xHCI root hub speed.
Signed-off-by: LeoLiu-oc <LeoLiu-oc@zhaoxin.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
Reviewed-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
---
 drivers/usb/host/xhci.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
index ba2b2b5afdd1..0984bd31e0e9 100644
--- a/drivers/usb/host/xhci.c
+++ b/drivers/usb/host/xhci.c
@@ -5271,10 +5271,10 @@ int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks)
 				if (XHCI_EXT_PORT_PSIV(xhci->port_caps[j].psi[i]) >= 5)
 					minor_rev = 1;
 			}
-			if (minor_rev != 1) {
-				hcd->speed = HCD_USB3;
-				hcd->self.root_hub->speed = USB_SPEED_SUPER;
-			}
+		}
+		if (minor_rev != 1) {
+			hcd->speed = HCD_USB3;
+			hcd->self.root_hub->speed = USB_SPEED_SUPER;
 		}
 	}
From: Willy Tarreau <w@1wt.eu>

mainline inclusion
from mainline-v5.18-rc5
commit 233087ca063686964a53c829d547c7571e3f67bf
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I59I1C
CVE: CVE-2022-1836
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?h...
--------------------------------
Minh Yuan reported a concurrency use-after-free issue in the floppy code between raw_cmd_ioctl and seek_interrupt.
[ It turns out this has been around, and that others have reported the KASAN splats over the years, but Minh Yuan had a reproducer for it and so gets primary credit for reporting it for this fix - Linus ]
The problem is that this driver tends to break very easily, and nowadays nobody is expected to use FDRAWCMD anyway, since it was only used to manipulate non-standard formats. The risk of breaking the driver is higher than the risk presented by this race, and accessing the device requires privileges anyway.
Let's just add a config option to completely disable this ioctl and leave it disabled by default. Distros shouldn't use it, and only those running on antique hardware might need to enable it.
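For the rare setups that still need the ioctl, it can be re-enabled at build time; a minimal sketch of the relevant .config fragment (symbols as introduced by the Kconfig hunk below):

  CONFIG_BLK_DEV_FD=m
  CONFIG_BLK_DEV_FD_RAWCMD=y

With the option left off, FDRAWCMD simply returns -EOPNOTSUPP.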
Link: https://lore.kernel.org/all/000000000000b71cdd05d703f6bf@google.com/
Link: https://lore.kernel.org/lkml/CAKcFiNC=MfYVW-Jt9A3=FPJpTwCD2PL_ULNCpsCVE5s8Ze...
Link: https://lore.kernel.org/all/CAEAjamu1FRhz6StCe_55XY5s389ZP_xmCF69k987En+1z53...
Reported-by: Minh Yuan <yuanmingbuaa@gmail.com>
Reported-by: syzbot+8e8958586909d62b6840@syzkaller.appspotmail.com
Reported-by: cruise k <cruise4k@gmail.com>
Reported-by: Kyungtae Kim <kt0755@gmail.com>
Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
Tested-by: Denis Efremov <efremov@linux.com>
Signed-off-by: Willy Tarreau <w@1wt.eu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Luo Meng <luomeng12@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 drivers/block/Kconfig  | 16 ++++++++++++++++
 drivers/block/floppy.c | 43 +++++++++++++++++++++++++++++-----------
 2 files changed, 48 insertions(+), 11 deletions(-)
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index f2548049aa0e..40c53632512b 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -39,6 +39,22 @@ config BLK_DEV_FD
 	  To compile this driver as a module, choose M here: the
 	  module will be called floppy.

+config BLK_DEV_FD_RAWCMD
+	bool "Support for raw floppy disk commands (DEPRECATED)"
+	depends on BLK_DEV_FD
+	help
+	  If you want to use actual physical floppies and expect to do
+	  special low-level hardware accesses to them (access and use
+	  non-standard formats, for example), then enable this.
+
+	  Note that the code enabled by this option is rarely used and
+	  might be unstable or insecure, and distros should not enable it.
+
+	  Note: FDRAWCMD is deprecated and will be removed from the kernel
+	  in the near future.
+
+	  If unsure, say N.
+
 config AMIGA_FLOPPY
 	tristate "Amiga floppy support"
 	depends on AMIGA
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 8fc23f5026f0..4ef407a33996 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -3067,6 +3067,8 @@ static const char *drive_name(int type, int drive)
 		return "(null)";
 }

+#ifdef CONFIG_BLK_DEV_FD_RAWCMD
+
 /* raw commands */
 static void raw_cmd_done(int flag)
 {
@@ -3271,6 +3273,35 @@ static int raw_cmd_ioctl(int cmd, void __user *param)
 	return ret;
 }

+static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
+				void __user *param)
+{
+	int ret;
+
+	pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
+
+	if (type)
+		return -EINVAL;
+	if (lock_fdc(drive))
+		return -EINTR;
+	set_floppy(drive);
+	ret = raw_cmd_ioctl(cmd, param);
+	if (ret == -EINTR)
+		return -EINTR;
+	process_fd_request();
+	return ret;
+}
+
+#else /* CONFIG_BLK_DEV_FD_RAWCMD */
+
+static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
+				void __user *param)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+
 static int invalidate_drive(struct block_device *bdev)
 {
 	/* invalidate the buffer track to force a reread */
@@ -3459,7 +3490,6 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
 {
 	int drive = (long)bdev->bd_disk->private_data;
 	int type = ITYPE(drive_state[drive].fd_device);
-	int i;
 	int ret;
 	int size;
 	union inparam {
@@ -3610,16 +3640,7 @@ static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
 		outparam = &write_errors[drive];
 		break;
 	case FDRAWCMD:
-		if (type)
-			return -EINVAL;
-		if (lock_fdc(drive))
-			return -EINTR;
-		set_floppy(drive);
-		i = raw_cmd_ioctl(cmd, (void __user *)param);
-		if (i == -EINTR)
-			return -EINTR;
-		process_fd_request();
-		return i;
+		return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
 	case FDTWADDLE:
 		if (lock_fdc(drive))
 			return -EINTR;
From: Guo Mengqi <guomengqi3@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I54I59
CVE: NA
backport: openEuler-22.03-LTS
--------------------------------------------------
When hisi_oom_notifier_call() calls spg_overview_show(), it takes the global rwsem sp_group_sem, which may already be held by the very process that triggered the OOM. This leads to a kernel hung task. In another place, taking sp_group_sem unnecessarily causes an ABBA deadlock.
[ 1934.549016] INFO: task klogd:2757 blocked for more than 120 seconds.
[ 1934.562408]       "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 1934.570231] klogd           D    0  2757   2746 0x00000000
[ 1934.575707] Call trace:
[ 1934.578162]  __switch_to+0xe8/0x150
[ 1934.581648]  __schedule+0x250/0x558
[ 1934.585133]  schedule+0x30/0xf0
[ 1934.588267]  rwsem_down_read_failed+0x10c/0x188
[ 1934.592788]  down_read+0x60/0x68
[ 1934.596015]  spg_overview_show.part.31+0xc8/0xf8
[ 1934.600622]  spg_overview_show+0x2c/0x38
[ 1934.604543]  hisi_oom_notifier_call+0xe8/0x120
[ 1934.608975]  out_of_memory+0x7c/0x570
[ 1934.612631]  __alloc_pages_nodemask+0xcfc/0xd98
[ 1934.617158]  alloc_pages_current+0x88/0xf0
[ 1934.621246]  __page_cache_alloc+0x8c/0xd8
[ 1934.625247]  page_cache_alloc_inode+0x48/0x58
[ 1934.629595]  filemap_fault+0x360/0x8e0
[ 1934.633341]  ext4_filemap_fault+0x38/0x128
[ 1934.637431]  __do_fault+0x50/0x218
[ 1934.640822]  __handle_mm_fault+0x69c/0x9c8
[ 1934.644909]  handle_mm_fault+0xf8/0x200
[ 1934.648740]  do_page_fault+0x220/0x508
[ 1934.652477]  do_translation_fault+0xa8/0xbc
[ 1934.656652]  do_mem_abort+0x68/0x118
[ 1934.660216]  do_el0_ia_bp_hardening+0x6c/0xd8
[ 1934.664565]  el0_ia+0x20/0x24
Signed-off-by: Guo Mengqi <guomengqi3@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 3a37418378f6..cd45852919a1 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -4042,9 +4042,9 @@ void spg_overview_show(struct seq_file *seq)
 			atomic_read(&sp_overall_stat.spa_total_num));
 	}

-	down_read(&sp_group_sem);
+	down_read(&sp_spg_stat_sem);
 	idr_for_each(&sp_spg_stat_idr, idr_spg_stat_cb, seq);
-	up_read(&sp_group_sem);
+	up_read(&sp_spg_stat_sem);

 	if (seq != NULL)
 		seq_puts(seq, "\n");
From: Wang Wensheng <wangwensheng4@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I54I7W
CVE: NA
backport: openEuler-22.03-LTS
--------------------------------------------------
There is an ABBA deadlock between sp_group_add_task() and proc_stat_show():
PROCESS A: sp_group_add_task()
             acquire sp_group_sem write lock
             -> sp_init_proc_stat()
                  acquire sp_spg_stat_sem write lock

PROCESS B: proc_stat_show()
             acquire sp_spg_stat_sem read lock
             -> idr_proc_stat_cb()
                  acquire sp_group_sem read lock
Here we choose the simplest fix: make proc_stat_show() take the sp_group_sem and sp_spg_stat_sem read locks one after the other, in the same order as the writer path, since this only affects a debug feature.
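A user-space reduction of that ordering discipline with POSIX rwlocks (illustrative only; the lock and function names merely mirror the kernel ones, nothing here is kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t group_sem = PTHREAD_RWLOCK_INITIALIZER; /* "sp_group_sem" */
static pthread_rwlock_t stat_sem  = PTHREAD_RWLOCK_INITIALIZER; /* "sp_spg_stat_sem" */

/* Writer path, shaped like sp_group_add_task(): group_sem, then stat_sem. */
static void add_task(void)
{
	pthread_rwlock_wrlock(&group_sem);
	pthread_rwlock_wrlock(&stat_sem);
	/* ... update group and per-group stat state ... */
	pthread_rwlock_unlock(&stat_sem);
	pthread_rwlock_unlock(&group_sem);
}

/*
 * Reader path, shaped like the fixed proc_stat_show(): the same order,
 * group_sem then stat_sem. Taking stat_sem first (as the old code did
 * indirectly) lets each path hold one lock while waiting for the other
 * - the classic ABBA deadlock.
 */
static void stat_show(void)
{
	pthread_rwlock_rdlock(&group_sem);
	pthread_rwlock_rdlock(&stat_sem);
	puts("stats walked under both read locks");
	pthread_rwlock_unlock(&stat_sem);
	pthread_rwlock_unlock(&group_sem);
}

int main(void)
{
	add_task();
	stat_show();
	return 0;
}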
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/share_pool.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c
index cd45852919a1..85d175def6ae 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -4083,7 +4083,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data)
 	long sp_res, sp_res_nsize, non_sp_res, non_sp_shm;

 	/* to prevent ABBA deadlock, first hold sp_group_sem */
-	down_read(&sp_group_sem);
 	mutex_lock(&spg_stat->lock);
 	hash_for_each(spg_stat->hash, i, spg_proc_stat, gnode) {
 		proc_stat = spg_proc_stat->proc_stat;
@@ -4112,7 +4111,6 @@ static int idr_proc_stat_cb(int id, void *p, void *data)
 		seq_putc(seq, '\n');
 	}
 	mutex_unlock(&spg_stat->lock);
-	up_read(&sp_group_sem);
 	return 0;
 }

@@ -4130,10 +4128,16 @@ static int proc_stat_show(struct seq_file *seq, void *offset)
 		   byte2kb(atomic64_read(&kthread_stat.alloc_size)),
 		   byte2kb(atomic64_read(&kthread_stat.k2u_size)));

-	/* pay attention to potential ABBA deadlock */
+	/*
+	 * This ugly code is just for fixing the ABBA deadlock against
+	 * sp_group_add_task.
+	 */
+	down_read(&sp_group_sem);
 	down_read(&sp_spg_stat_sem);
 	idr_for_each(&sp_spg_stat_idr, idr_proc_stat_cb, seq);
 	up_read(&sp_spg_stat_sem);
+	up_read(&sp_group_sem);
+
 	return 0;
 }
From: Zhang Jian <zhangjian210@huawei.com>

hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I54IL8
CVE: NA
backport: openEuler-22.03-LTS
-----------------------------
When a NUMA node id is passed to sp_alloc(), the node id sometimes does not take effect. This is because a configured memory policy will override the requested node with its preferred one. Fix the error by mbind()ing the virtual address range to the desired node.
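A user-space analogue of what sp_mbind() does below, using the mbind(2) wrapper from libnuma's <numaif.h> (illustrative only; build with -lnuma, and the node number is an assumption about the test machine):

#include <numaif.h>	/* mbind(), MPOL_* */
#include <sys/mman.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 1UL << 0;	/* bind to node 0 */
	void *p;

	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * Bind the range before touching it so the pages are allocated
	 * on the requested node; MPOL_MF_STRICT surfaces violations as
	 * an error instead of silently falling back.
	 */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_STRICT)) {
		perror("mbind");
		return 1;
	}

	memset(p, 0, len);	/* fault the pages in on node 0 */
	return 0;
}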
Signed-off-by: Zhang Jian <zhangjian210@huawei.com>
Signed-off-by: Zhou Guanghui <zhouguanghui1@huawei.com>
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
Reviewed-by: Weilong Chen <chenweilong@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 include/linux/mempolicy.h | 10 ++++++
 mm/mempolicy.c            | 14 ++++++---
 mm/share_pool.c           | 66 +++++++++++++++++++++++++--------------
 3 files changed, 62 insertions(+), 28 deletions(-)
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 64ab4398ba90..ba74e7399dc6 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -201,6 +201,9 @@ extern bool vma_migratable(struct vm_area_struct *vma);
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
 extern void mpol_put_task_policy(struct task_struct *);

+extern long __do_mbind(unsigned long start, unsigned long len,
+		       unsigned short mode, unsigned short mode_flags,
+		       nodemask_t *nmask, unsigned long flags, struct mm_struct *mm);
 #else

 struct mempolicy {};
@@ -301,6 +304,13 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma,
 	return -1; /* no node preference */
 }

+static inline long __do_mbind(unsigned long start, unsigned long len,
+			      unsigned short mode, unsigned short mode_flags,
+			      nodemask_t *nmask, unsigned long flags, struct mm_struct *mm)
+{
+	return 0;
+}
+
 static inline void mpol_put_task_policy(struct task_struct *task)
 {
 }
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5ce39dbc84e1..d2326f9a38a8 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1308,11 +1308,10 @@ static struct page *new_page(struct page *page, unsigned long start)
 }
 #endif

-static long do_mbind(unsigned long start, unsigned long len,
-		     unsigned short mode, unsigned short mode_flags,
-		     nodemask_t *nmask, unsigned long flags)
+long __do_mbind(unsigned long start, unsigned long len,
+		unsigned short mode, unsigned short mode_flags,
+		nodemask_t *nmask, unsigned long flags, struct mm_struct *mm)
 {
-	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
 	int err;
@@ -1411,6 +1410,13 @@ static long do_mbind(unsigned long start, unsigned long len,
 	return err;
 }

+static long do_mbind(unsigned long start, unsigned long len,
+		     unsigned short mode, unsigned short mode_flags,
+		     nodemask_t *nmask, unsigned long flags)
+{
+	return __do_mbind(start, len, mode, mode_flags, nmask, flags, current->mm);
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 85d175def6ae..76088952d0a5 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -16,7 +16,6 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
-
 #define pr_fmt(fmt) "share pool: " fmt

 #include <linux/share_pool.h>
@@ -2157,6 +2156,7 @@ struct sp_alloc_context {
 	bool need_fallocate;
 	struct timespec64 start;
 	struct timespec64 end;
+	bool have_mbind;
 };

 static void trace_sp_alloc_begin(struct sp_alloc_context *ac)
@@ -2298,6 +2298,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags,
 	ac->sp_flags = sp_flags;
 	ac->state = ALLOC_NORMAL;
 	ac->need_fallocate = false;
+	ac->have_mbind = false;
 	return 0;
 }

@@ -2391,7 +2392,7 @@ static void sp_alloc_fallback(struct sp_area *spa, struct sp_alloc_context *ac)
 }

 static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa,
-		struct sp_group_node *spg_node, struct sp_alloc_context *ac)
+		struct sp_alloc_context *ac)
 {
 	int ret = 0;
 	unsigned long sp_addr = spa->va_start;
@@ -2423,25 +2424,20 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa,
 		if (ret)
 			sp_add_work_compact();
 	}
-	if (ret) {
-		if (spa->spg != spg_none)
-			sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
-		else
-			sp_munmap(mm, spa->va_start, spa->real_size);
-
-		if (unlikely(fatal_signal_pending(current)))
-			pr_warn_ratelimited("allocation failed, current thread is killed\n");
-		else
-			pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n",
-					    ret);
-		sp_fallocate(spa);	/* need this, otherwise memleak */
-		sp_alloc_fallback(spa, ac);
-	} else {
-		ac->need_fallocate = true;
-	}
 	return ret;
 }

+static long sp_mbind(struct mm_struct *mm, unsigned long start, unsigned long len,
+		     unsigned long node)
+{
+	nodemask_t nmask;
+
+	nodes_clear(nmask);
+	node_set(node, nmask);
+	return __do_mbind(start, len, MPOL_BIND, MPOL_F_STATIC_NODES,
+			  &nmask, MPOL_MF_STRICT, mm);
+}
+
 static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 	struct sp_group_node *spg_node, struct sp_alloc_context *ac)
 {
@@ -2457,7 +2453,34 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa,
 		return ret;
 	}

-	ret = sp_alloc_populate(mm, spa, spg_node, ac);
+	if (!ac->have_mbind) {
+		ret = sp_mbind(mm, spa->va_start, spa->real_size, spa->node_id);
+		if (ret < 0) {
+			pr_err("cannot bind the memory range to specified node:%d, err:%d\n",
+			       spa->node_id, ret);
+			goto err;
+		}
+		ac->have_mbind = true;
+	}
+
+	ret = sp_alloc_populate(mm, spa, ac);
+	if (ret) {
+err:
+		if (spa->spg != spg_none)
+			sp_alloc_unmap(list_next_entry(spg_node, proc_node)->master->mm, spa, spg_node);
+		else
+			sp_munmap(mm, spa->va_start, spa->real_size);
+
+		if (unlikely(fatal_signal_pending(current)))
+			pr_warn_ratelimited("allocation failed, current thread is killed\n");
+		else
+			pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n",
+					    ret);
+		sp_fallocate(spa);	/* need this, otherwise memleak */
+		sp_alloc_fallback(spa, ac);
+	} else
+		ac->need_fallocate = true;
+
 	return ret;
 }

@@ -2479,11 +2502,6 @@ static int sp_alloc_mmap_populate(struct sp_area *spa,
 		if (mmap_ret) {
 			if (ac->state != ALLOC_COREDUMP)
 				return mmap_ret;
-			if (ac->spg == spg_none) {
-				sp_alloc_unmap(mm, spa, spg_node);
-				pr_err("dvpp allocation failed due to coredump");
-				return mmap_ret;
-			}
 			ac->state = ALLOC_NORMAL;
 			continue;
 		}
From: Mingwei Zhang <mizhang@google.com>

mainline inclusion
from mainline-v5.18-rc4
commit 683412ccf61294d727ead4a73d97397396e69a6b
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I58DFI
CVE: CVE-2022-0171
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Flush the CPU caches when memory is reclaimed from an SEV guest (where reclaim also includes it being unmapped from KVM's memslots). Due to lack of coherency for SEV encrypted memory, failure to flush results in silent data corruption if userspace is malicious/broken and doesn't ensure SEV guest memory is properly pinned and unpinned.
Cache coherency is not enforced across the VM boundary in SEV (AMD APM vol.2 Section 15.34.7). Confidential cachelines generated by confidential VM guests have to be explicitly flushed on the host side. If a memory page containing dirty confidential cachelines was released by the VM and reallocated to another user, the cachelines may corrupt the new user at a later time.
KVM takes a shortcut by assuming all confidential memory remains pinned until the end of the VM's lifetime. Therefore, KVM does not flush the cache at mmu_notifier invalidation events. Because of this incorrect assumption and the lack of cache flushing, malicious userspace can crash the host kernel: create a malicious VM and continuously allocate and release unpinned confidential memory pages while the VM is running.
Add cache flush operations to the mmu_notifier operations to ensure that any physical memory leaving the guest VM gets flushed. In particular, hook the mmu_notifier_invalidate_range_start and mmu_notifier_release events and flush the cache accordingly. The hooks run after releasing the mmu lock to avoid contention with other vCPUs.
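The generic-vs-arch split in the patch relies on a weak default definition plus a function-pointer check; a small user-space illustration of the same pattern (GCC/Clang on ELF, not kernel code; the function name is made up for the sketch):

#include <stdio.h>

/*
 * Weakly-declared hook with no definition in this program: its address
 * resolves to NULL, so the caller can test for it before calling - the
 * same shape as the x86 check of kvm_x86_ops.guest_memory_reclaimed.
 * Linking in a file that defines arch_guest_memory_reclaimed() would
 * activate the call without touching this code.
 */
__attribute__((weak)) void arch_guest_memory_reclaimed(void);

int main(void)
{
	if (arch_guest_memory_reclaimed)
		arch_guest_memory_reclaimed();
	else
		puts("no arch hook linked in; nothing to flush");
	return 0;
}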
Cc: stable@vger.kernel.org
Suggested-by: Sean Christpherson <seanjc@google.com>
Reported-by: Mingwei Zhang <mizhang@google.com>
Signed-off-by: Mingwei Zhang <mizhang@google.com>
Message-Id: <20220421031407.2516575-4-mizhang@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

conflicts:
	arch/x86/include/asm/kvm-x86-ops.h
	arch/x86/kvm/x86.c
	virt/kvm/kvm_main.c

Signed-off-by: Chen Jiahao <chenjiahao16@huawei.com>
Reviewed-by: Liao Chang <liaochang1@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm/sev.c          |  8 ++++++++
 arch/x86/kvm/svm/svm.c          |  1 +
 arch/x86/kvm/svm/svm.h          |  2 ++
 arch/x86/kvm/x86.c              |  8 ++++++++
 include/linux/kvm_host.h        |  2 ++
 virt/kvm/kvm_main.c             | 16 ++++++++++++++--
 7 files changed, 36 insertions(+), 2 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 4ab8f866e39d..a3bf158f5b12 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1286,6 +1286,7 @@ struct kvm_x86_ops {
 	int (*mem_enc_op)(struct kvm *kvm, void __user *argp);
 	int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
 	int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp);
+	void (*guest_memory_reclaimed)(struct kvm *kvm);

 	int (*get_msr_feature)(struct kvm_msr_entry *entry);

diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6c82ef22985d..7828b36d67c1 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -1177,6 +1177,14 @@ void sev_hardware_teardown(void)
 	sev_flush_asids();
 }

+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+	if (!sev_guest(kvm))
+		return;
+
+	wbinvd_on_all_cpus();
+}
+
 void pre_sev_run(struct vcpu_svm *svm, int cpu)
 {
 	struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7773a765f548..2124fe54abfb 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4325,6 +4325,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
 	.mem_enc_op = svm_mem_enc_op,
 	.mem_enc_reg_region = svm_register_enc_region,
 	.mem_enc_unreg_region = svm_unregister_enc_region,
+	.guest_memory_reclaimed = sev_guest_memory_reclaimed,

 	.can_emulate_instruction = svm_can_emulate_instruction,

diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2c007241fbf5..c707d689b60e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -491,6 +491,8 @@ int svm_register_enc_region(struct kvm *kvm,
 			    struct kvm_enc_region *range);
 int svm_unregister_enc_region(struct kvm *kvm,
 			      struct kvm_enc_region *range);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
 void pre_sev_run(struct vcpu_svm *svm, int cpu);
 int __init sev_hardware_setup(void);
 void sev_hardware_teardown(void);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7da272aea5c4..92b3c37481d8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -8895,6 +8895,14 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 	kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
 }

+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+	if (!kvm_x86_ops.guest_memory_reclaimed)
+		return;
+
+	kvm_x86_ops.guest_memory_reclaimed(kvm);
+}
+
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
 	if (!lapic_in_kernel(vcpu))
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fea02cfe6520..96f2cd2b46f7 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1480,6 +1480,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
 void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 					    unsigned long start, unsigned long end);

+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm);
+
 #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
 int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
 #else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29bdd17fc135..547c0cc859a6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -162,6 +162,10 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
 {
 }

+__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+}
+
 bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
 {
 	/*
@@ -338,6 +342,12 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
 #endif

+static void kvm_flush_shadow_all(struct kvm *kvm)
+{
+	kvm_arch_flush_shadow_all(kvm);
+	kvm_arch_guest_memory_reclaimed(kvm);
+}
+
 void kvm_reload_remote_mmus(struct kvm *kvm)
 {
 	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
@@ -492,6 +502,7 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
 		kvm_flush_remote_tlbs(kvm);

 	spin_unlock(&kvm->mmu_lock);
+	kvm_arch_guest_memory_reclaimed(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);

 	return 0;
@@ -595,7 +606,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
 	int idx;

 	idx = srcu_read_lock(&kvm->srcu);
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_flush_shadow_all(kvm);
 	srcu_read_unlock(&kvm->srcu, idx);
 }

@@ -892,7 +903,7 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
 #else
-	kvm_arch_flush_shadow_all(kvm);
+	kvm_flush_shadow_all(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	kvm_destroy_devices(kvm);
@@ -1233,6 +1244,7 @@ static int kvm_set_memslot(struct kvm *kvm,
 		 *	- kvm_is_visible_gfn (mmu_check_root)
 		 */
 		kvm_arch_flush_shadow_memslot(kvm, slot);
+		kvm_arch_guest_memory_reclaimed(kvm);
 	}

 	r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
From: Shakeel Butt <shakeelb@google.com>

mainline inclusion
from mainline-v5.19-rc1
commit 94968384dde15d48263bfc59d280cd71b1259d8c
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545DF
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
This patch series adds a memory.reclaim proactive reclaim interface. The rationale behind the interface and how it works are in the first patch.
This patch (of 4):
Introduce a memcg interface to trigger memory reclaim on a memory cgroup.
Use case: Proactive Reclaim
---------------------------
A userspace proactive reclaimer can continuously probe the memcg to reclaim a small amount of memory. This gives more accurate and up-to-date workingset estimation as the LRUs are continuously sorted and can potentially provide more deterministic memory overcommit behavior. The memory overcommit controller can provide more proactive response to the changing behavior of the running applications instead of being reactive.
A userspace reclaimer's purpose in this case is not a complete replacement for kswapd or direct reclaim; it is to proactively identify memory savings opportunities and reclaim some amount of cold pages set by the policy to free up the memory for more demanding jobs or scheduling new jobs.
A user space proactive reclaimer is used in Google data centers. Additionally, Meta's TMO paper recently referenced a very similar interface used for user space proactive reclaim: https://dl.acm.org/doi/pdf/10.1145/3503222.3507731
Benefits of a user space reclaimer:
-----------------------------------
1) More flexible on who should be charged for the cpu of the memory reclaim. For proactive reclaim, it makes more sense to be centralized.
2) More flexible on dedicating the resources (like cpu). The memory overcommit controller can balance the cost between the cpu usage and the memory reclaimed.
3) Provides a way for applications to keep their LRUs sorted, so that, under memory pressure, better reclaim candidates are selected. This also gives a more accurate and up-to-date notion of the working set for an application.
Why memory.high is not enough?
------------------------------
- memory.high can be used to trigger reclaim in a memcg and can potentially be used for proactive reclaim. However there is a big downside in using memory.high. It can potentially introduce high reclaim stalls in the target application as the allocations from the processes or the threads of the application can hit the temporary memory.high limit.
- Userspace proactive reclaimers usually use feedback loops to decide how much memory to proactively reclaim from a workload. The metrics used for this are usually either refaults or PSI, and these metrics will become messy if the application gets throttled by hitting the high limit.
- memory.high is a stateful interface, if the userspace proactive reclaimer crashes for any reason while triggering reclaim it can leave the application in a bad state.
- If a workload is rapidly expanding, setting memory.high to proactively reclaim memory can result in actually reclaiming more memory than intended.
The benefits of such interface and shortcomings of existing interface were further discussed in this RFC thread: https://lore.kernel.org/linux-mm/5df21376-7dd1-bf81-8414-32a73cea45dd@google...
Interface:
----------
Introducing a very simple memcg interface 'echo 10M > memory.reclaim' to trigger reclaim in the target memory cgroup.
The interface is introduced as a nested-keyed file to allow for future optional arguments to be easily added to configure the behavior of reclaim.
Possible Extensions:
--------------------
- This interface can be extended with an additional parameter or flags to allow specifying one or more types of memory to reclaim from (e.g. file, anon, ..).
- The interface can also be extended with a node mask to reclaim from specific nodes. This has use cases for reclaim-based demotion in memory tiering systems.
- A similar per-node interface can also be added to support proactive reclaim and reclaim-based demotion in systems without memcg.
- Add a timeout parameter to make it easier for user space to call the interface without worrying about being blocked for an undefined amount of time.
For now, let's keep things simple by adding the basic functionality.
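As a concrete usage sketch (the cgroup path is illustrative; per the documentation hunk below, a write that reclaims less than the requested amount fails with -EAGAIN):

  # request 100MiB of reclaim from one cgroup
  echo "100M" > /sys/fs/cgroup/workload/memory.reclaim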
[yosryahmed@google.com: worked on versions v2 onwards, refreshed to
current master, updated commit message based on recent discussions and
use cases]
Link: https://lkml.kernel.org/r/20220425190040.2475377-1-yosryahmed@google.com
Link: https://lkml.kernel.org/r/20220425190040.2475377-2-yosryahmed@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Co-developed-by: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Wei Xu <weixugc@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Zefan Li <lizefan.x@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Chen Wandun <chenwandun@huawei.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 Documentation/admin-guide/cgroup-v2.rst | 21 ++++++++++++
 mm/memcontrol.c                         | 45 +++++++++++++++++++++++++
 2 files changed, 66 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 608d7c279396..c54db136d9b4 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1181,6 +1181,27 @@ PAGE_SIZE multiple when read back.
 	high limit is used and monitored properly, this limit's
 	utility is limited to providing the final safety net.

+  memory.reclaim
+	A write-only nested-keyed file which exists for all cgroups.
+
+	This is a simple interface to trigger memory reclaim in the
+	target cgroup.
+
+	This file accepts a single key, the number of bytes to reclaim.
+	No nested keys are currently supported.
+
+	Example::
+
+	  echo "1G" > memory.reclaim
+
+	The interface can be later extended with nested keys to
+	configure the reclaim behavior. For example, specify the
+	type of memory to reclaim from (anon, file, ..).
+
+	Please note that the kernel can over or under reclaim from
+	the target cgroup. If less bytes are reclaimed than the
+	specified amount, -EAGAIN is returned.
+
   memory.oom.group
 	A read-write single value file which exists on non-root
 	cgroups.  The default value is "0".
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7061f9283a34..06cb2c417623 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6711,6 +6711,46 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }

+static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	if (err)
+		return err;
+
+	while (nr_reclaimed < nr_to_reclaim) {
+		unsigned long reclaimed;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		/*
+		 * This is the final attempt, drain percpu lru caches in the
+		 * hope of introducing more evictable pages for
+		 * try_to_free_mem_cgroup_pages().
+		 */
+		if (!nr_retries)
+			lru_add_drain_all();
+
+		reclaimed = try_to_free_mem_cgroup_pages(memcg,
+						nr_to_reclaim - nr_reclaimed,
+						GFP_KERNEL, true);
+
+		if (!reclaimed && !nr_retries--)
+			return -EAGAIN;
+
+		nr_reclaimed += reclaimed;
+	}
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -6769,6 +6809,11 @@ static struct cftype memory_files[] = {
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+	{
+		.name = "reclaim",
+		.flags = CFTYPE_NS_DELEGATABLE,
+		.write = memory_reclaim,
+	},
 	{ }	/* terminate */
 };
From: Yosry Ahmed <yosryahmed@google.com>

mainline inclusion
from mainline-v5.19-rc1
commit 6c26df84e1f2f9181c0741865105a53537da842c
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I545DF
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, cg_read()/cg_write() returns 0 on success and -1 on failure. Modify them to return the -errno on failure.
Link: https://lkml.kernel.org/r/20220425190040.2475377-3-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Chen Wandun <chenwandun@huawei.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 tools/testing/selftests/cgroup/cgroup_util.c | 44 +++++++++-----------
 1 file changed, 19 insertions(+), 25 deletions(-)
diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c
index 05853b0b8831..391ab5675290 100644
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -17,6 +17,7 @@
 #include "cgroup_util.h"
 #include "../clone3/clone3_selftests.h"

+/* Returns read len on success, or -errno on failure. */
 static ssize_t read_text(const char *path, char *buf, size_t max_len)
 {
 	ssize_t len;
@@ -24,35 +25,29 @@ static ssize_t read_text(const char *path, char *buf, size_t max_len)

 	fd = open(path, O_RDONLY);
 	if (fd < 0)
-		return fd;
+		return -errno;

 	len = read(fd, buf, max_len - 1);
-	if (len < 0)
-		goto out;

-	buf[len] = 0;
-out:
+	if (len >= 0)
+		buf[len] = 0;
+
 	close(fd);
-	return len;
+	return len < 0 ? -errno : len;
 }

+/* Returns written len on success, or -errno on failure. */
 static ssize_t write_text(const char *path, char *buf, ssize_t len)
 {
 	int fd;

 	fd = open(path, O_WRONLY | O_APPEND);
 	if (fd < 0)
-		return fd;
+		return -errno;

 	len = write(fd, buf, len);
-	if (len < 0) {
-		close(fd);
-		return len;
-	}
-
 	close(fd);
-
-	return len;
+	return len < 0 ? -errno : len;
 }

 char *cg_name(const char *root, const char *name)
@@ -85,16 +80,16 @@ char *cg_control(const char *cgroup, const char *control)
 	return ret;
 }

+/* Returns 0 on success, or -errno on failure. */
 int cg_read(const char *cgroup, const char *control, char *buf, size_t len)
 {
 	char path[PATH_MAX];
+	ssize_t ret;

 	snprintf(path, sizeof(path), "%s/%s", cgroup, control);

-	if (read_text(path, buf, len) >= 0)
-		return 0;
-
-	return -1;
+	ret = read_text(path, buf, len);
+	return ret >= 0 ? 0 : ret;
 }

 int cg_read_strcmp(const char *cgroup, const char *control,
@@ -175,17 +170,15 @@ long cg_read_lc(const char *cgroup, const char *control)
 	return cnt;
 }

+/* Returns 0 on success, or -errno on failure. */
 int cg_write(const char *cgroup, const char *control, char *buf)
 {
 	char path[PATH_MAX];
-	ssize_t len = strlen(buf);
+	ssize_t len = strlen(buf), ret;

 	snprintf(path, sizeof(path), "%s/%s", cgroup, control);
-
-	if (write_text(path, buf, len) == len)
-		return 0;
-
-	return -1;
+	ret = write_text(path, buf, len);
+	return ret == len ? 0 : ret;
 }

 int cg_find_unified_root(char *root, size_t len)
@@ -539,7 +532,8 @@ ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t
 	else
 		snprintf(path, sizeof(path), "/proc/%d/%s", pid, item);

-	return read_text(path, buf, size);
+	size = read_text(path, buf, size);
+	return size < 0 ? -1 : size;
 }

 int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
From: Yosry Ahmed <yosryahmed@google.com>

mainline inclusion
from mainline-v5.19-rc1
commit a3622a53e620700053b648478dbc638ad373be3b
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I545DF
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Currently, alloc_anon_noexit() calls alloc_anon(), which instantly frees the allocated memory. alloc_anon_noexit() is usually used with cg_run_nowait() to run a process in the background that allocates memory. It makes sense for the background process to keep the memory allocated and not instantly free it (otherwise there is no point in running it in the background).
Link: https://lkml.kernel.org/r/20220425190040.2475377-4-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Chen Wandun <chenwandun@huawei.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 tools/testing/selftests/cgroup/test_memcontrol.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index c19a97dd02d4..9f5cdcf0593f 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -210,13 +210,17 @@ static int alloc_pagecache_50M_noexit(const char *cgroup, void *arg)
 static int alloc_anon_noexit(const char *cgroup, void *arg)
 {
 	int ppid = getppid();
+	size_t size = (unsigned long)arg;
+	char *buf, *ptr;

-	if (alloc_anon(cgroup, arg))
-		return -1;
+	buf = malloc(size);
+	for (ptr = buf; ptr < buf + size; ptr += PAGE_SIZE)
+		*ptr = 0;

 	while (getppid() == ppid)
 		sleep(1);

+	free(buf);
 	return 0;
 }
From: Yosry Ahmed <yosryahmed@google.com>

mainline inclusion
from mainline-v5.19-rc1
commit eae3cb2e87ff84547e66211b81301a8f9122840f
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545DF
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
--------------------------------
Add a new test for memory.reclaim that verifies that the interface correctly reclaims memory as intended, from both anon and file pages.
Link: https://lkml.kernel.org/r/20220425190040.2475377-5-yosryahmed@google.com
Signed-off-by: Yosry Ahmed <yosryahmed@google.com>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Chen Wandun <chenwandun@huawei.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: "Michal Koutný" <mkoutny@suse.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Tejun Heo <tj@kernel.org>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Vaibhav Jain <vaibhav@linux.ibm.com>
Cc: Wei Xu <weixugc@google.com>
Cc: Yu Zhao <yuzhao@google.com>
Cc: Zefan Li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 .../selftests/cgroup/test_memcontrol.c | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
diff --git a/tools/testing/selftests/cgroup/test_memcontrol.c b/tools/testing/selftests/cgroup/test_memcontrol.c
index 9f5cdcf0593f..94e16e383bcf 100644
--- a/tools/testing/selftests/cgroup/test_memcontrol.c
+++ b/tools/testing/selftests/cgroup/test_memcontrol.c
@@ -683,6 +683,111 @@ static int test_memcg_max(const char *root)
 	return ret;
 }

+/*
+ * This test checks that memory.reclaim reclaims the given
+ * amount of memory (from both anon and file, if possible).
+ */
+static int test_memcg_reclaim(const char *root)
+{
+	int ret = KSFT_FAIL, fd, retries;
+	char *memcg;
+	long current, expected_usage, to_reclaim;
+	char buf[64];
+
+	memcg = cg_name(root, "memcg_test");
+	if (!memcg)
+		goto cleanup;
+
+	if (cg_create(memcg))
+		goto cleanup;
+
+	current = cg_read_long(memcg, "memory.current");
+	if (current != 0)
+		goto cleanup;
+
+	fd = get_temp_fd();
+	if (fd < 0)
+		goto cleanup;
+
+	cg_run_nowait(memcg, alloc_pagecache_50M_noexit, (void *)(long)fd);
+
+	/*
+	 * If swap is enabled, try to reclaim from both anon and file, else try
+	 * to reclaim from file only.
+	 */
+	if (is_swap_enabled()) {
+		cg_run_nowait(memcg, alloc_anon_noexit, (void *) MB(50));
+		expected_usage = MB(100);
+	} else
+		expected_usage = MB(50);
+
+	/*
+	 * Wait until current usage reaches the expected usage (or we run out of
+	 * retries).
+	 */
+	retries = 5;
+	while (!values_close(cg_read_long(memcg, "memory.current"),
+			     expected_usage, 10)) {
+		if (retries--) {
+			sleep(1);
+			continue;
+		} else {
+			fprintf(stderr,
+				"failed to allocate %ld for memcg reclaim test\n",
+				expected_usage);
+			goto cleanup;
+		}
+	}
+
+	/*
+	 * Reclaim until current reaches 30M, this makes sure we hit both anon
+	 * and file if swap is enabled.
+	 */
+	retries = 5;
+	while (true) {
+		int err;
+
+		current = cg_read_long(memcg, "memory.current");
+		to_reclaim = current - MB(30);
+
+		/*
+		 * We only keep looping if we get EAGAIN, which means we could
+		 * not reclaim the full amount.
+		 */
+		if (to_reclaim <= 0)
+			goto cleanup;
+
+
+		snprintf(buf, sizeof(buf), "%ld", to_reclaim);
+		err = cg_write(memcg, "memory.reclaim", buf);
+		if (!err) {
+			/*
+			 * If writing succeeds, then the written amount should have been
+			 * fully reclaimed (and maybe more).
+			 */
+			current = cg_read_long(memcg, "memory.current");
+			if (!values_close(current, MB(30), 3) && current > MB(30))
+				goto cleanup;
+			break;
+		}
+
+		/* The kernel could not reclaim the full amount, try again. */
+		if (err == -EAGAIN && retries--)
+			continue;
+
+		/* We got an unexpected error or ran out of retries. */
+		goto cleanup;
+	}
+
+	ret = KSFT_PASS;
+cleanup:
+	cg_destroy(memcg);
+	free(memcg);
+	close(fd);
+
+	return ret;
+}
+
 static int alloc_anon_50M_check_swap(const char *cgroup, void *arg)
 {
 	long mem_max = (long)arg;
@@ -1185,6 +1290,7 @@ struct memcg_test {
 	T(test_memcg_low),
 	T(test_memcg_high),
 	T(test_memcg_max),
+	T(test_memcg_reclaim),
 	T(test_memcg_oom_events),
 	T(test_memcg_swap_max),
 	T(test_memcg_sock),
From: Chen Wandun <chenwandun@huawei.com>

hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I545DF
CVE: NA
--------------------------------
Introduce the per-memcg reclaim interface for cgroup v1, and disable memory reclaim for the root memcg.
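A usage sketch on the v1 hierarchy (mount point and group name are illustrative; as the hunk below enforces, writing to the root memcg fails with -EINVAL):

  echo "100M" > /sys/fs/cgroup/memory/workload/memory.reclaim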
Signed-off-by: Chen Wandun <chenwandun@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 mm/memcontrol.c | 87 ++++++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 40 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 06cb2c417623..3ede56d6b307 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5152,6 +5152,49 @@ static int memcg_events_local_show(struct seq_file *m, void *v)
 	return 0;
 }

+static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
+	unsigned long nr_to_reclaim, nr_reclaimed = 0;
+	int err;
+
+	buf = strstrip(buf);
+	err = page_counter_memparse(buf, "", &nr_to_reclaim);
+	if (err)
+		return err;
+
+	if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+	    mem_cgroup_is_root(memcg))
+		return -EINVAL;
+
+	while (nr_reclaimed < nr_to_reclaim) {
+		unsigned long reclaimed;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		/* This is the final attempt, drain percpu lru caches in the
+		 * hope of introducing more evictable pages for
+		 * try_to_free_mem_cgroup_pages().
+		 */
+		if (!nr_retries)
+			lru_add_drain_all();
+
+		reclaimed = try_to_free_mem_cgroup_pages(memcg,
+						nr_to_reclaim - nr_reclaimed,
+						GFP_KERNEL, true);
+
+		if (!reclaimed && !nr_retries--)
+			return -EAGAIN;
+
+		nr_reclaimed += reclaimed;
+	}
+
+	return nbytes;
+}
+
 static struct cftype mem_cgroup_legacy_files[] = {
 	{
 		.name = "usage_in_bytes",
@@ -5345,6 +5388,10 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.file_offset = offsetof(struct mem_cgroup, events_local_file),
 		.seq_show = memcg_events_local_show,
 	},
+	{
+		.name = "reclaim",
+		.write = memory_reclaim,
+	},
 	{ },	/* terminate */
 };

@@ -6711,46 +6758,6 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 	return nbytes;
 }

-static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
-			      size_t nbytes, loff_t off)
-{
-	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-	unsigned int nr_retries = MAX_RECLAIM_RETRIES;
-	unsigned long nr_to_reclaim, nr_reclaimed = 0;
-	int err;
-
-	buf = strstrip(buf);
-	err = page_counter_memparse(buf, "", &nr_to_reclaim);
-	if (err)
-		return err;
-
-	while (nr_reclaimed < nr_to_reclaim) {
-		unsigned long reclaimed;
-
-		if (signal_pending(current))
-			return -EINTR;
-
-		/*
-		 * This is the final attempt, drain percpu lru caches in the
-		 * hope of introducing more evictable pages for
-		 * try_to_free_mem_cgroup_pages().
-		 */
-		if (!nr_retries)
-			lru_add_drain_all();
-
-		reclaimed = try_to_free_mem_cgroup_pages(memcg,
-						nr_to_reclaim - nr_reclaimed,
-						GFP_KERNEL, true);
-
-		if (!reclaimed && !nr_retries--)
-			return -EAGAIN;
-
-		nr_reclaimed += reclaimed;
-	}
-
-	return nbytes;
-}
-
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
From: Jarkko Sakkinen <jarkko@kernel.org>

mainline inclusion
from mainline-v5.17-rc8
commit 08999b2489b4c9b939d7483dbd03702ee4576d96
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I59I14
CVE: CVE-2021-33135
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
There is a limited amount of SGX memory (EPC) on each system. When that memory is used up, SGX has its own swapping mechanism which is similar in concept but totally separate from the core mm/* code. Instead of swapping to disk, SGX swaps from EPC to normal RAM. That normal RAM comes from a shared memory pseudo-file and can itself be swapped by the core mm code. There is a hierarchy like this:
EPC <-> shmem <-> disk
After data is swapped back in from shmem to EPC, the shmem backing storage needs to be freed. Currently, the backing shmem is not freed. This effectively wastes the shmem while the enclave is running. The memory is recovered when the enclave is destroyed and the backing storage freed.
Sort this out by freeing memory with shmem_truncate_range(), as soon as a page is faulted back to the EPC. In addition, free the memory for PCMD pages as soon as all PCMD's in a page have been marked as unused by zeroing its contents.
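A worked example of the backing-store layout may help; this standalone sketch mirrors the arithmetic of sgx_encl_get_backing_page_pcmd_offset() in the hunk below (the constants are assumptions taken from the SDM layouts used here: a PCMD entry is 128 bytes and the SECS occupies one 4KiB page, so 32 PCMDs share one backing page):

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define SGX_PCMD_SIZE	128UL	/* sizeof(struct sgx_pcmd) */
#define SGX_SECS_SIZE	4096UL	/* the SECS occupies one full page */

int main(void)
{
	unsigned long encl_size = 64 * PAGE_SIZE;	/* 64 visible pages */
	unsigned long page_index = 5;

	/* PCMD structs start right after the EPC data plus the SECS slot */
	unsigned long epc_end_off = encl_size + SGX_SECS_SIZE;
	unsigned long pcmd_off = epc_end_off + page_index * SGX_PCMD_SIZE;

	printf("PCMD of page %lu: byte offset %lu, backing page %lu, offset-in-page %lu\n",
	       page_index, pcmd_off, pcmd_off / PAGE_SIZE,
	       pcmd_off % PAGE_SIZE);
	printf("%lu PCMD entries share one backing page\n",
	       PAGE_SIZE / SGX_PCMD_SIZE);
	return 0;
}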
Cc: stable@vger.kernel.org
Fixes: 1728ab54b4be ("x86/sgx: Add a page reclaimer")
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Jarkko Sakkinen <jarkko@kernel.org>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Link: https://lkml.kernel.org/r/20220303223859.273187-1-jarkko@kernel.org
Signed-off-by: Chen Jiahao <chenjiahao16@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 57 ++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 9 deletions(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 97fb7efce224..ebdd7fd49b5c 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,30 @@
 #include "encls.h"
 #include "sgx.h"

+/*
+ * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
+ * follow right after the EPC data in the backing storage. In addition to the
+ * visible enclave pages, there's one extra page slot for SECS, before PCMD
+ * structs.
+ */
+static inline pgoff_t sgx_encl_get_backing_page_pcmd_offset(struct sgx_encl *encl,
+							     unsigned long page_index)
+{
+	pgoff_t epc_end_off = encl->size + sizeof(struct sgx_secs);
+
+	return epc_end_off + page_index * sizeof(struct sgx_pcmd);
+}
+
+/*
+ * Free a page from the backing storage in the given page index.
+ */
+static inline void sgx_encl_truncate_backing_page(struct sgx_encl *encl, unsigned long page_index)
+{
+	struct inode *inode = file_inode(encl->backing);
+
+	shmem_truncate_range(inode, PFN_PHYS(page_index), PFN_PHYS(page_index) + PAGE_SIZE - 1);
+}
+
 /*
  * ELDU: Load an EPC page as unblocked. For more info, see "OS Management of EPC
  * Pages" in the SDM.
@@ -22,9 +46,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 {
 	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
 	struct sgx_encl *encl = encl_page->encl;
+	pgoff_t page_index, page_pcmd_off;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
-	pgoff_t page_index;
+	bool pcmd_page_empty;
+	u8 *pcmd_page;
 	int ret;

 	if (secs_page)
@@ -32,14 +58,16 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	else
 		page_index = PFN_DOWN(encl->size);

+	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
+
 	ret = sgx_encl_get_backing(encl, page_index, &b);
 	if (ret)
 		return ret;

 	pginfo.addr = encl_page->desc & PAGE_MASK;
 	pginfo.contents = (unsigned long)kmap_atomic(b.contents);
-	pginfo.metadata = (unsigned long)kmap_atomic(b.pcmd) +
-			  b.pcmd_offset;
+	pcmd_page = kmap_atomic(b.pcmd);
+	pginfo.metadata = (unsigned long)pcmd_page + b.pcmd_offset;

 	if (secs_page)
 		pginfo.secs = (u64)sgx_get_epc_virt_addr(secs_page);
@@ -55,11 +83,24 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 		ret = -EFAULT;
 	}

-	kunmap_atomic((void *)(unsigned long)(pginfo.metadata - b.pcmd_offset));
+	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+
+	/*
+	 * The area for the PCMD in the page was zeroed above.  Check if the
+	 * whole page is now empty meaning that all PCMD's have been zeroed:
+	 */
+	pcmd_page_empty = !memchr_inv(pcmd_page, 0, PAGE_SIZE);
+
+	kunmap_atomic(pcmd_page);
 	kunmap_atomic((void *)(unsigned long)pginfo.contents);

 	sgx_encl_put_backing(&b, false);

+	sgx_encl_truncate_backing_page(encl, page_index);
+
+	if (pcmd_page_empty)
+		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
+
 	return ret;
 }

@@ -577,7 +618,7 @@ static struct page *sgx_encl_get_backing_page(struct sgx_encl *encl,
 int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 			 struct sgx_backing *backing)
 {
-	pgoff_t pcmd_index = PFN_DOWN(encl->size) + 1 + (page_index >> 5);
+	pgoff_t page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 	struct page *contents;
 	struct page *pcmd;

@@ -585,7 +626,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	if (IS_ERR(contents))
 		return PTR_ERR(contents);

-	pcmd = sgx_encl_get_backing_page(encl, pcmd_index);
+	pcmd = sgx_encl_get_backing_page(encl, PFN_DOWN(page_pcmd_off));
 	if (IS_ERR(pcmd)) {
 		put_page(contents);
 		return PTR_ERR(pcmd);
 	}
@@ -594,9 +635,7 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index,
 	backing->page_index = page_index;
 	backing->contents = contents;
 	backing->pcmd = pcmd;
-	backing->pcmd_offset =
-		(page_index & (PAGE_SIZE / sizeof(struct sgx_pcmd) - 1)) *
-		sizeof(struct sgx_pcmd);
+	backing->pcmd_offset = page_pcmd_off & (PAGE_SIZE - 1);

 	return 0;
 }
From: Reinette Chatre <reinette.chatre@intel.com>

mainline inclusion
from mainline-v5.19-rc1
commit 2154e1c11b7080aa19f47160bd26b6f39bbd7824
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I59I14
CVE: CVE-2021-33135
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Recent commit 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page") expanded __sgx_encl_eldu() to clear an enclave page's PCMD (Paging Crypto MetaData) from the PCMD page in the backing store after the enclave page is restored to the enclave.
Since the PCMD page in the backing store is modified, the page should be marked as dirty to ensure the modified data is retained.
Cc: stable@vger.kernel.org
Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page")
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Haitao Huang <haitao.huang@intel.com>
Link: https://lkml.kernel.org/r/00cd2ac480db01058d112e347b32599c1a806bc4.165238982...
Signed-off-by: Chen Jiahao <chenjiahao16@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 1 +
 1 file changed, 1 insertion(+)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index ebdd7fd49b5c..a2beb7231f81 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -84,6 +84,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	}

 	memset(pcmd_page + b.pcmd_offset, 0, sizeof(struct sgx_pcmd));
+	set_page_dirty(b.pcmd);

 	/*
 	 * The area for the PCMD in the page was zeroed above.  Check if the
From: Reinette Chatre <reinette.chatre@intel.com>

mainline inclusion
from mainline-v5.19-rc1
commit af117837ceb9a78e995804ade4726ad2c2c8981f
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I59I14
CVE: CVE-2021-33135
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
Haitao reported encountering a WARN triggered by the ENCLS[ELDU] instruction faulting with a #GP.
The WARN is encountered when the reclaimer evicts a range of pages from the enclave when the same pages are faulted back right away.
Consider two enclave pages (ENCLAVE_A and ENCLAVE_B) sharing a PCMD page (PCMD_AB). ENCLAVE_A is in the enclave memory and ENCLAVE_B is in the backing store. PCMD_AB contains just one entry, that of ENCLAVE_B.
The scenario proceeds with ENCLAVE_A being evicted from the enclave while ENCLAVE_B is faulted in:
sgx_reclaim_pages() {

  ...

  /*
   * Reclaim ENCLAVE_A
   */
  mutex_lock(&encl->lock);
  /*
   * Get a reference to ENCLAVE_A's
   * shmem page where enclave page
   * encrypted data will be stored
   * as well as a reference to the
   * enclave page's PCMD data page,
   * PCMD_AB.
   * Release mutex before writing
   * any data to the shmem pages.
   */
  sgx_encl_get_backing(...);
  encl_page->desc |=
    SGX_ENCL_PAGE_BEING_RECLAIMED;
  mutex_unlock(&encl->lock);

                                    /*
                                     * Fault ENCLAVE_B
                                     */

                                    sgx_vma_fault() {

                                      mutex_lock(&encl->lock);
                                      /*
                                       * Get reference to
                                       * ENCLAVE_B's shmem page
                                       * as well as PCMD_AB.
                                       */
                                      sgx_encl_get_backing(...)
                                      /*
                                       * Load page back into
                                       * enclave via ELDU.
                                       */
                                      /*
                                       * Release reference to
                                       * ENCLAVE_B' shmem page and
                                       * PCMD_AB.
                                       */
                                      sgx_encl_put_backing(...);
                                      /*
                                       * PCMD_AB is found empty so
                                       * it and ENCLAVE_B's shmem page
                                       * are truncated.
                                       */
                                      /* Truncate ENCLAVE_B backing page */
                                      sgx_encl_truncate_backing_page();
                                      /* Truncate PCMD_AB */
                                      sgx_encl_truncate_backing_page();

                                      mutex_unlock(&encl->lock);

                                      ...
                                    }
  mutex_lock(&encl->lock);
  encl_page->desc &=
    ~SGX_ENCL_PAGE_BEING_RECLAIMED;
  /*
   * Write encrypted contents of
   * ENCLAVE_A to ENCLAVE_A shmem
   * page and its PCMD data to
   * PCMD_AB.
   */
  sgx_encl_put_backing(...)

                                    /*
                                     * Reference to PCMD_AB is
                                     * dropped and it is truncated.
                                     * ENCLAVE_A's PCMD data is lost.
                                     */
  mutex_unlock(&encl->lock);
}
What happens next depends on whether it is ENCLAVE_A being faulted in or ENCLAVE_B being evicted - but both end up with ENCLS[ELDU] faulting with a #GP.
If ENCLAVE_A is faulted, then at the time sgx_encl_get_backing() is called a new PCMD page is allocated, and providing the empty PCMD data for ENCLAVE_A would cause ENCLS[ELDU] to #GP.
If ENCLAVE_B is evicted first, then a new PCMD_AB would be allocated by the reclaimer, but later, when ENCLAVE_A is faulted, the ENCLS[ELDU] instruction would #GP during its checks of the PCMD value and the WARN would be encountered.
Noting that the reclaimer sets SGX_ENCL_PAGE_BEING_RECLAIMED at the time it obtains a reference to the backing store pages of an enclave page it is in the process of reclaiming, fix the race by only truncating the PCMD page after ensuring that no page sharing the PCMD page is in the process of being reclaimed.
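To make the sharing arithmetic concrete: with 4096-byte pages and a 128-byte struct sgx_pcmd, 32 PCMD entries share one PCMD page, so the low five bits of an enclave page index select its entry. A minimal sketch using the constants introduced by the patch below (the helper name is ours, for illustration only):

    #define PCMDS_PER_PAGE  (PAGE_SIZE / sizeof(struct sgx_pcmd))  /* 4096 / 128 = 32 */
    #define PCMD_FIRST_MASK GENMASK(4, 0)                          /* low 5 bits */

    /* Address of the enclave page whose PCMD entry is first in the shared page. */
    static unsigned long pcmd_first_page_addr(struct sgx_encl *encl, pgoff_t page_index)
    {
            return PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
    }

Every enclave page whose index differs only in those five bits stores its PCMD in the same backing page, which is why the reclaimer's intent toward any of the 32 sharers must be checked before truncation.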
Cc: stable@vger.kernel.org
Fixes: 08999b2489b4 ("x86/sgx: Free backing memory after faulting the enclave page")
Reported-by: Haitao Huang <haitao.huang@intel.com>
Signed-off-by: Reinette Chatre <reinette.chatre@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Jarkko Sakkinen <jarkko@kernel.org>
Tested-by: Haitao Huang <haitao.huang@intel.com>
Link: https://lkml.kernel.org/r/ed20a5db516aa813873268e125680041ae11dfcf.165238982...
Signed-off-by: Chen Jiahao <chenjiahao16@huawei.com>
Reviewed-by: Xiu Jianfeng <xiujianfeng@huawei.com>
Reviewed-by: Zhang Jianhua <chris.zjh@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 arch/x86/kernel/cpu/sgx/encl.c | 94 +++++++++++++++++++++++++++++++++-
 1 file changed, 93 insertions(+), 1 deletion(-)
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index a2beb7231f81..3aaa709b5054 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -12,6 +12,92 @@
 #include "encls.h"
 #include "sgx.h"
 
+#define PCMDS_PER_PAGE (PAGE_SIZE / sizeof(struct sgx_pcmd))
+/*
+ * 32 PCMD entries share a PCMD page. PCMD_FIRST_MASK is used to
+ * determine the page index associated with the first PCMD entry
+ * within a PCMD page.
+ */
+#define PCMD_FIRST_MASK GENMASK(4, 0)
+
+/**
+ * reclaimer_writing_to_pcmd() - Query if any enclave page associated with
+ *                               a PCMD page is in process of being reclaimed.
+ * @encl:        Enclave to which PCMD page belongs
+ * @start_addr:  Address of enclave page using first entry within the PCMD page
+ *
+ * When an enclave page is reclaimed some Paging Crypto MetaData (PCMD) is
+ * stored. The PCMD data of a reclaimed enclave page contains enough
+ * information for the processor to verify the page at the time
+ * it is loaded back into the Enclave Page Cache (EPC).
+ *
+ * The backing storage to which enclave pages are reclaimed is laid out as
+ * follows:
+ * Encrypted enclave pages:SECS page:PCMD pages
+ *
+ * Each PCMD page contains the PCMD metadata of
+ * PAGE_SIZE/sizeof(struct sgx_pcmd) enclave pages.
+ *
+ * A PCMD page can only be truncated if it is (a) empty, and (b) not in the
+ * process of getting data (and thus soon being non-empty). (b) is tested with
+ * a check if an enclave page sharing the PCMD page is in the process of being
+ * reclaimed.
+ *
+ * The reclaimer sets the SGX_ENCL_PAGE_BEING_RECLAIMED flag when it
+ * intends to reclaim that enclave page - it means that the PCMD page
+ * associated with that enclave page is about to get some data and thus
+ * even if the PCMD page is empty, it should not be truncated.
+ *
+ * Context: Enclave mutex (&sgx_encl->lock) must be held.
+ * Return: 1 if the reclaimer is about to write to the PCMD page
+ *         0 if the reclaimer has no intention to write to the PCMD page
+ */
+static int reclaimer_writing_to_pcmd(struct sgx_encl *encl,
+				     unsigned long start_addr)
+{
+	int reclaimed = 0;
+	int i;
+
+	/*
+	 * PCMD_FIRST_MASK is based on number of PCMD entries within
+	 * PCMD page being 32.
+	 */
+	BUILD_BUG_ON(PCMDS_PER_PAGE != 32);
+
+	for (i = 0; i < PCMDS_PER_PAGE; i++) {
+		struct sgx_encl_page *entry;
+		unsigned long addr;
+
+		addr = start_addr + i * PAGE_SIZE;
+
+		/*
+		 * Stop when reaching the SECS page - it does not
+		 * have a page_array entry and its reclaim is
+		 * started and completed with enclave mutex held so
+		 * it does not use the SGX_ENCL_PAGE_BEING_RECLAIMED
+		 * flag.
+		 */
+		if (addr == encl->base + encl->size)
+			break;
+
+		entry = xa_load(&encl->page_array, PFN_DOWN(addr));
+		if (!entry)
+			continue;
+
+		/*
+		 * VA page slot ID uses same bit as the flag so it is important
+		 * to ensure that the page is not already in backing store.
+		 */
+		if (entry->epc_page &&
+		    (entry->desc & SGX_ENCL_PAGE_BEING_RECLAIMED)) {
+			reclaimed = 1;
+			break;
+		}
+	}
+
+	return reclaimed;
+}
+
 /*
  * Calculate byte offset of a PCMD struct associated with an enclave page. PCMD's
  * follow right after the EPC data in the backing storage. In addition to the
@@ -47,6 +133,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	unsigned long va_offset = encl_page->desc & SGX_ENCL_PAGE_VA_OFFSET_MASK;
 	struct sgx_encl *encl = encl_page->encl;
 	pgoff_t page_index, page_pcmd_off;
+	unsigned long pcmd_first_page;
 	struct sgx_pageinfo pginfo;
 	struct sgx_backing b;
 	bool pcmd_page_empty;
@@ -58,6 +145,11 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 	else
 		page_index = PFN_DOWN(encl->size);
 
+	/*
+	 * Address of enclave page using the first entry within the PCMD page.
+	 */
+	pcmd_first_page = PFN_PHYS(page_index & ~PCMD_FIRST_MASK) + encl->base;
+
 	page_pcmd_off = sgx_encl_get_backing_page_pcmd_offset(encl, page_index);
 
 	ret = sgx_encl_get_backing(encl, page_index, &b);
@@ -99,7 +191,7 @@ static int __sgx_encl_eldu(struct sgx_encl_page *encl_page,
 
 	sgx_encl_truncate_backing_page(encl, page_index);
 
-	if (pcmd_page_empty)
+	if (pcmd_page_empty && !reclaimer_writing_to_pcmd(encl, pcmd_first_page))
 		sgx_encl_truncate_backing_page(encl, PFN_DOWN(page_pcmd_off));
 
 	return ret;
From: Zhihao Cheng <chengzhihao1@huawei.com>
hulk inclusion
category: bugfix
bugzilla: 186846, https://gitee.com/openeuler/kernel/issues/I5A80Q
--------------------------------
Commit 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc") moved proc_flush_task() behind __exit_signal(). Since then, the systemd process can spend a long period at high CPU usage while releasing a task, as in the following concurrent processes:
        systemd                                 ps
  kernel_waitid                           stat(/proc/tgid)
   do_wait                                 filename_lookup
    wait_consider_task                      lookup_fast
     release_task
      __exit_signal
       __unhash_process
        detach_pid
         __change_pid  // remove task->pid_links
                                            d_revalidate -> pid_revalidate  // 0
                                            d_invalidate(/proc/tgid)
                                             shrink_dcache_parent(/proc/tgid)
                                              d_walk(/proc/tgid)
                                               spin_lock_nested(/proc/tgid/fd)
                                               // iterating opened fd
      proc_flush_pid                                    |
       d_invalidate(/proc/tgid/fd)                      |
        shrink_dcache_parent(/proc/tgid/fd)             |
         shrink_dentry_list(subdirs)                    ↓
          shrink_lock_dentry(/proc/tgid/fd) --> race on dentry lock
Function d_invalidate() removes the dentry from the hash first, so why does proc_flush_pid() process dentry '/proc/tgid/fd' before dentry '/proc/tgid'? Because proc_pid_make_inode() adds proc inodes in reverse order, by invoking hlist_add_head_rcu(). However, proc should not add any inodes under '/proc/tgid' except '/proc/tgid/task/pid' to this list; fix it by adding an inode into 'pid->inodes' only if the inode is /proc/tgid or /proc/tgid/task/pid.
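The reverse order falls out of head insertion: each hlist_add_head_rcu() puts the newest inode at the front, so a walk of the list (and thus proc_flush_pid()) sees inodes newest-first. A minimal illustration (the inode variable names are hypothetical):

    /* /proc/tgid is created first, /proc/tgid/fd later ... */
    hlist_add_head_rcu(&tgid_ei->sibling_inodes, &pid->inodes);  /* list: tgid */
    hlist_add_head_rcu(&fd_ei->sibling_inodes, &pid->inodes);    /* list: fd -> tgid */
    /* ... so a walk of pid->inodes flushes /proc/tgid/fd before /proc/tgid. */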
Performance regression test: create 200 tasks, each of which opens one file 50,000 times. Kill all tasks once the number of opened files exceeds 10,000,000 (check with 'cat /proc/sys/fs/file-nr').
Before fix:
  $ time killall -wq aa
  real    4m40.946s    # During this period, we can see 'ps' and
                       # 'systemd' taking high cpu usage.

After fix:
  $ time killall -wq aa
  real    1m20.732s    # During this period, we can see 'systemd'
                       # taking high cpu usage.
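A hedged reconstruction of the test program 'aa' used above (the file path is an assumption, and RLIMIT_NOFILE must have been raised well above 50,000 for the loop to succeed; the commit only states that each task opens one file 50,000 times):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
            int i;

            /* Each descriptor is deliberately leaked to grow file-nr. */
            for (i = 0; i < 50000; i++)
                    open("/tmp/aa-testfile", O_RDONLY | O_CREAT, 0644);

            pause();        /* park until "killall -wq aa" delivers a signal */
            return 0;
    }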
Fixes: 7bc3e6e55acf06 ("proc: Use a list of inodes to flush from proc")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=216054
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/proc/base.c | 34 ++++++++++++++++++++++++++--------
 1 file changed, 26 insertions(+), 8 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 2ba1313aa444..b9052be86e8d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1988,7 +1988,7 @@ void proc_pid_evict_inode(struct proc_inode *ei)
 	put_pid(pid);
 }
 
-struct inode *proc_pid_make_inode(struct super_block * sb,
+struct inode *proc_pid_make_inode(struct super_block *sb,
 				  struct task_struct *task, umode_t mode)
 {
 	struct inode * inode;
@@ -2017,11 +2017,6 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 
 	/* Let the pid remember us for quick removal */
 	ei->pid = pid;
-	if (S_ISDIR(mode)) {
-		spin_lock(&pid->lock);
-		hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
-		spin_unlock(&pid->lock);
-	}
 
 	task_dump_owner(task, 0, &inode->i_uid, &inode->i_gid);
 	security_task_to_inode(task, inode);
@@ -2034,6 +2029,27 @@ struct inode *proc_pid_make_inode(struct super_block * sb,
 	return NULL;
 }
 
+static struct inode *proc_pid_make_base_inode(struct super_block *sb,
+				struct task_struct *task, umode_t mode)
+{
+	struct inode *inode;
+	struct proc_inode *ei;
+	struct pid *pid;
+
+	inode = proc_pid_make_inode(sb, task, mode);
+	if (!inode)
+		return NULL;
+
+	/* Let proc_flush_pid find this directory inode */
+	ei = PROC_I(inode);
+	pid = ei->pid;
+	spin_lock(&pid->lock);
+	hlist_add_head_rcu(&ei->sibling_inodes, &pid->inodes);
+	spin_unlock(&pid->lock);
+
+	return inode;
+}
+
 int pid_getattr(const struct path *path, struct kstat *stat,
 		u32 request_mask, unsigned int query_flags)
 {
@@ -3459,7 +3475,8 @@ static struct dentry *proc_pid_instantiate(struct dentry * dentry,
 {
 	struct inode *inode;
 
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
 
@@ -3765,7 +3782,8 @@ static struct dentry *proc_task_instantiate(struct dentry *dentry,
 	struct task_struct *task, const void *ptr)
 {
 	struct inode *inode;
-	inode = proc_pid_make_inode(dentry->d_sb, task, S_IFDIR | S_IRUGO | S_IXUGO);
+	inode = proc_pid_make_base_inode(dentry->d_sb, task,
+					 S_IFDIR | S_IRUGO | S_IXUGO);
 	if (!inode)
 		return ERR_PTR(-ENOENT);
From: Jann Horn <jannh@google.com>
stable inclusion
from stable-v5.10.110
commit 5a41a3033a9344d7683340e3d83f5435ffb06501
category: bugfix
bugzilla: https://gitee.com/src-openeuler/kernel/issues/I57IYD
CVE: CVE-2022-30594
--------------------------------
commit ee1fee900537b5d9560e9f937402de5ddc8412f3 upstream.
Setting PTRACE_O_SUSPEND_SECCOMP is supposed to be a highly privileged operation because it allows the tracee to completely bypass all seccomp filters on kernels with CONFIG_CHECKPOINT_RESTORE=y. It is only supposed to be settable by a process with global CAP_SYS_ADMIN, and only if that process is not subject to any seccomp filters at all.
However, while these permission checks were done on the PTRACE_SETOPTIONS path, they were missing on the PTRACE_SEIZE path, which also sets user-specified ptrace flags.
Move the permissions checks out into a helper function and let both ptrace_attach() and ptrace_setoptions() call it.
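To see the fix from userspace, a small hedged sketch (the tracee pid is taken from argv; PTRACE_O_SUSPEND_SECCOMP may need a fallback define on older headers):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>

    #ifndef PTRACE_O_SUSPEND_SECCOMP
    #define PTRACE_O_SUSPEND_SECCOMP (1 << 21)
    #endif

    int main(int argc, char **argv)
    {
            pid_t pid;

            if (argc < 2)
                    return 1;
            pid = atoi(argv[1]);    /* pid of an existing process */

            /* Without global CAP_SYS_ADMIN this now fails at seize time. */
            if (ptrace(PTRACE_SEIZE, pid, (void *)0,
                       (void *)PTRACE_O_SUSPEND_SECCOMP) == -1 && errno == EPERM)
                    printf("PTRACE_O_SUSPEND_SECCOMP rejected: EPERM\n");

            return 0;
    }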
Cc: stable@kernel.org
Fixes: 13c4a90119d2 ("seccomp: add ptrace options for suspend/resume")
Signed-off-by: Jann Horn <jannh@google.com>
Link: https://lkml.kernel.org/r/20220319010838.1386861-1-jannh@google.com
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Signed-off-by: Li Huafei <lihuafei1@huawei.com>
Reviewed-by: Kuohai Xu <xukuohai@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 15 deletions(-)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e3210358bcd2..072033f40e23 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -371,6 +371,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
 	return !err;
 }
 
+static int check_ptrace_options(unsigned long data)
+{
+	if (data & ~(unsigned long)PTRACE_O_MASK)
+		return -EINVAL;
+
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+		    !IS_ENABLED(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+	return 0;
+}
+
 static int ptrace_attach(struct task_struct *task, long request,
 			 unsigned long addr,
 			 unsigned long flags)
@@ -382,8 +402,16 @@ static int ptrace_attach(struct task_struct *task, long request,
 	if (seize) {
 		if (addr != 0)
 			goto out;
+		/*
+		 * This duplicates the check in check_ptrace_options() because
+		 * ptrace_attach() and ptrace_setoptions() have historically
+		 * used different error codes for unknown ptrace options.
+		 */
 		if (flags & ~(unsigned long)PTRACE_O_MASK)
 			goto out;
+		retval = check_ptrace_options(flags);
+		if (retval)
+			return retval;
 		flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
 	} else {
 		flags = PT_PTRACED;
@@ -656,22 +684,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len)
 
 static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 {
 	unsigned flags;
+	int ret;
 
-	if (data & ~(unsigned long)PTRACE_O_MASK)
-		return -EINVAL;
-
-	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
-		if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
-		    !IS_ENABLED(CONFIG_SECCOMP))
-			return -EINVAL;
-
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-
-		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
-		    current->ptrace & PT_SUSPEND_SECCOMP)
-			return -EPERM;
-	}
+	ret = check_ptrace_options(data);
+	if (ret)
+		return ret;
 
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;