mainline inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4FHN2?from=project-issue CVE: NA ---------------------------
In modern systems it's not unusual to have a system component monitoring memory conditions of the system and tasked with keeping system memory pressure under control. One way to accomplish that is to kill non-essential processes to free up memory for more important ones. Examples of this are Facebook's OOM killer daemon called oomd and Android's low memory killer daemon called lmkd.
For such a system component it's important to be able to free memory quickly and efficiently. Unfortunately, the time a process takes to free up its memory after receiving a SIGKILL might vary based on the state of the process (uninterruptible sleep), its size, and the OPP level of the core the process is running on. A mechanism to free resources of the target process in a more predictable way would improve the system's ability to control its memory pressure.
Introduce process_mrelease system call that releases memory of a dying process from the context of the caller. This way the memory is freed in a more controllable way with CPU affinity and priority of the caller. The workload of freeing the memory will also be charged to the caller. The operation is allowed only on a dying process.
Signed-off-by: Suren Baghdasaryan surenb@google.com Reviewed-by: Shakeel Butt shakeelb@google.com Acked-by: David Hildenbrand david@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Christian Brauner christian.brauner@ubuntu.com Cc: David Rientjes rientjes@google.com Cc: Matthew Wilcox (Oracle) willy@infradead.org Cc: Johannes Weiner hannes@cmpxchg.org Cc: Roman Gushchin guro@fb.com Cc: Rik van Riel riel@surriel.com Cc: Minchan Kim minchan@kernel.org Cc: Christoph Hellwig hch@infradead.org Cc: Oleg Nesterov oleg@redhat.com Cc: Jann Horn jannh@google.com Cc: Geert Uytterhoeven geert@linux-m68k.org Cc: Andy Lutomirski luto@kernel.org Cc: Christian Brauner christian.brauner@ubuntu.com Cc: Florian Weimer fweimer@redhat.com Cc: Jan Engelhardt jengelh@inai.de Cc: Tim Murray timmurray@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Wen Zhiwei wenzhiwei@kylinos.cn --- arch/arm64/include/asm/unistd32.h | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/syscalls.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + mm/oom_kill.c | 72 +++++++++++++++++++++++++ tools/include/uapi/asm-generic/unistd.h | 4 +- 7 files changed, 86 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..e1786b2e8551 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,7 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) - +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /* * Please add new compat syscalls above this comment and update * __NR_compat_syscalls in asm/unistd.h. diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..6eec6496d72c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,7 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise - +441 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently # in x32 as compared to native x86_64. These syscalls have numbers 512-547. 
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aea0ce9f3b74..764bfcdbbcb0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,6 +905,9 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags); + +asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2056318988f7..e5f38ea960c4 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,12 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 +
/* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..6b8203edf531 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -281,6 +281,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fb39b0902476..d169eb511518 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -43,6 +43,7 @@ #include <linux/kthread.h> #include <linux/init.h> #include <linux/mmu_notifier.h> +#include <linux/syscalls.h>
#include <asm/tlb.h> #include "internal.h" @@ -1186,3 +1187,74 @@ void pagefault_out_of_memory(void) out_of_memory(&oc); mutex_unlock(&oom_lock); } + +SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) +{ +#ifdef CONFIG_MMU + struct mm_struct *mm = NULL; + struct task_struct *task; + struct task_struct *p; + unsigned int f_flags; + bool reap = false; + struct pid *pid; + long ret = 0; + + if (flags) + return -EINVAL; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + task = get_pid_task(pid, PIDTYPE_TGID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + /* + * Make sure to choose a thread which still has a reference to mm + * during the group exit + */ + p = find_lock_task_mm(task); + if (!p) { + ret = -ESRCH; + goto put_task; + } + + if (mmget_not_zero(p->mm)) { + mm = p->mm; + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; + } + } + task_unlock(p); + + if (!reap) + goto drop_mm; + + if (mmap_read_lock_killable(mm)) { + ret = -EINTR; + goto drop_mm; + } + if (!__oom_reap_task_mm(mm)) + ret = -EAGAIN; + mmap_read_unlock(mm); + +drop_mm: + if (mm) + mmput(mm); +put_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +#else + return -ENOSYS; +#endif /* CONFIG_MMU */ +} + diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 2056318988f7..c252faa802d3 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442
/* * 32 bit systems traditionally used different
Hi, Wen Zhiwei
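Thank you for your patch,
but there may be something to improve; please check the build failure below and resend your patch. (The arm64 error is "excess elements in array initializer" for a32_sys_call_table at the new compat syscall 441, which suggests that __NR_compat_syscalls in arch/arm64/include/asm/unistd.h was not updated along with unistd32.h - please confirm.)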
---------------------- [FAIL] checkbuild_arm64 ----------------------
build failed: arm64, allmodconfig
<line too long ...> rm64/include/asm/unistd32.h:893:11: note: in expansion of macro ‘__NR_process_mrelease’ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) ^~~~~~~~~~~~~~~~~~~~~ arch/arm64/kernel/sys32.c:28:35: warning: excess elements in array initializer #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, ^ ./arch/arm64/include/asm/unistd32.h:893:1: note: in expansion of macro ‘__SYSCALL’ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) ^~~~~~~~~ arch/arm64/kernel/sys32.c:28:35: note: (near initialization for ‘a32_sys_call_table’) #define __SYSCALL(nr, sym) [nr] = __arm64_##sym, ^ ./arch/arm64/include/asm/unistd32.h:893:1: note: in expansion of macro ‘__SYSCALL’ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) ^~~~~~~~~ make[2]: *** [arch/arm64/kernel/sys32.o] Error 1 make[1]: *** [arch/arm64/kernel] Error 2 make[1]: *** Waiting for unfinished jobs.... make: *** [arch/arm64] Error 2 make: *** Waiting for unfinished jobs....
------------------------------ END ------------------------------
在 2021/11/13 17:28, Wen Zhiwei 写道:
mainline inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I4FHN2?from=project-issue CVE: NA
In modern systems it's not unusual to have a system component monitoring memory conditions of the system and tasked with keeping system memory pressure under control. One way to accomplish that is to kill non-essential processes to free up memory for more important ones. Examples of this are Facebook's OOM killer daemon called oomd and Android's low memory killer daemon called lmkd.
For such a system component it's important to be able to free memory quickly and efficiently. Unfortunately, the time a process takes to free up its memory after receiving a SIGKILL might vary based on the state of the process (uninterruptible sleep), its size, and the OPP level of the core the process is running on. A mechanism to free resources of the target process in a more predictable way would improve the system's ability to control its memory pressure.
Introduce process_mrelease system call that releases memory of a dying process from the context of the caller. This way the memory is freed in a more controllable way with CPU affinity and priority of the caller. The workload of freeing the memory will also be charged to the caller. The operation is allowed only on a dying process.
Signed-off-by: Suren Baghdasaryan surenb@google.com Reviewed-by: Shakeel Butt shakeelb@google.com Acked-by: David Hildenbrand david@redhat.com Acked-by: Michal Hocko mhocko@suse.com Acked-by: Christian Brauner christian.brauner@ubuntu.com Cc: David Rientjes rientjes@google.com Cc: Matthew Wilcox (Oracle) willy@infradead.org Cc: Johannes Weiner hannes@cmpxchg.org Cc: Roman Gushchin guro@fb.com Cc: Rik van Riel riel@surriel.com Cc: Minchan Kim minchan@kernel.org Cc: Christoph Hellwig hch@infradead.org Cc: Oleg Nesterov oleg@redhat.com Cc: Jann Horn jannh@google.com Cc: Geert Uytterhoeven geert@linux-m68k.org Cc: Andy Lutomirski luto@kernel.org Cc: Christian Brauner christian.brauner@ubuntu.com Cc: Florian Weimer fweimer@redhat.com Cc: Jan Engelhardt jengelh@inai.de Cc: Tim Murray timmurray@google.com Signed-off-by: Andrew Morton akpm@linux-foundation.org Signed-off-by: Linus Torvalds torvalds@linux-foundation.org Signed-off-by: Wen Zhiwei wenzhiwei@kylinos.cn
arch/arm64/include/asm/unistd32.h | 3 +- arch/x86/entry/syscalls/syscall_64.tbl | 2 +- include/linux/syscalls.h | 3 ++ include/uapi/asm-generic/unistd.h | 5 +- kernel/sys_ni.c | 1 + mm/oom_kill.c | 72 +++++++++++++++++++++++++ tools/include/uapi/asm-generic/unistd.h | 4 +- 7 files changed, 86 insertions(+), 4 deletions(-)
diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..e1786b2e8551 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,7 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise)
+#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease) /*
  * Please add new compat syscalls above this comment and update
  * __NR_compat_syscalls in asm/unistd.h.
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..6eec6496d72c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,7 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise
+441 common process_mrelease sys_process_mrelease # # Due to a historical design error, certain syscalls are numbered differently # in x32 as compared to native x86_64. These syscalls have numbers 512-547. diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aea0ce9f3b74..764bfcdbbcb0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -905,6 +905,9 @@ asmlinkage long sys_mincore(unsigned long start, size_t len, asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, size_t vlen, int behavior, unsigned int flags);
+asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags);
- asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags);
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 2056318988f7..e5f38ea960c4 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,12 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442
/*
  * 32 bit systems traditionally used different
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..6b8203edf531 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -281,6 +281,7 @@ COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); COND_SYSCALL(process_madvise); +COND_SYSCALL(process_mrelease); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fb39b0902476..d169eb511518 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -43,6 +43,7 @@ #include <linux/kthread.h> #include <linux/init.h> #include <linux/mmu_notifier.h> +#include <linux/syscalls.h>
#include <asm/tlb.h> #include "internal.h" @@ -1186,3 +1187,74 @@ void pagefault_out_of_memory(void) out_of_memory(&oc); mutex_unlock(&oom_lock); }
+
+SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
+{
+#ifdef CONFIG_MMU
+	struct mm_struct *mm = NULL;
+	struct task_struct *task;
+	struct task_struct *p;
+	unsigned int f_flags;
+	bool reap = false;
+	struct pid *pid;
+	long ret = 0;
+
+	if (flags)
+		return -EINVAL;
+
+	pid = pidfd_get_pid(pidfd, &f_flags);
+	if (IS_ERR(pid))
+		return PTR_ERR(pid);
+
+	task = get_pid_task(pid, PIDTYPE_TGID);
+	if (!task) {
+		ret = -ESRCH;
+		goto put_pid;
+	}
+
+	/*
+	 * Make sure to choose a thread which still has a reference to mm
+	 * during the group exit
+	 */
+	p = find_lock_task_mm(task);
+	if (!p) {
+		ret = -ESRCH;
+		goto put_task;
+	}
+
+	if (mmget_not_zero(p->mm)) {
+		mm = p->mm;
+		if (task_will_free_mem(p))
+			reap = true;
+		else {
+			/* Error only if the work has not been done already */
+			if (!test_bit(MMF_OOM_SKIP, &mm->flags))
+				ret = -EINVAL;
+		}
+	}
+	task_unlock(p);
+
+	if (!reap)
+		goto drop_mm;
+
+	if (mmap_read_lock_killable(mm)) {
+		ret = -EINTR;
+		goto drop_mm;
+	}
+	if (!__oom_reap_task_mm(mm))
+		ret = -EAGAIN;
+	mmap_read_unlock(mm);
+
+drop_mm:
+	if (mm)
+		mmput(mm);
+put_task:
+	put_task_struct(task);
+put_pid:
+	put_pid(pid);
+	return ret;
+#else
+	return -ENOSYS;
+#endif /* CONFIG_MMU */
+}
+
diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index 2056318988f7..c252faa802d3 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_process_mrelease 441 +__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
#undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442
/*
  * 32 bit systems traditionally used different