From: Ma Wupeng mawupeng1@huawei.com
Delay the OOM reaper to allow time for proper futex cleanup.
Changelog since v1:
- Move oom_reaper_timer into struct task_struct_resvd.
Ma Wupeng (1):
  mm: oom_kill: fix KABI broken by "oom_kill.c: futex: delay the OOM
    reaper to allow time for proper futex cleanup"

Nico Pache (1):
  oom_kill.c: futex: delay the OOM reaper to allow time for proper futex
    cleanup

Zheng Zucheng (1):
  sched: Allocate a new task_struct_resvd object for fork task
 include/linux/sched.h          | 23 ++++++++++----
 include/linux/sched/grid_qos.h |  2 +-
 init/init_task.c               |  5 ++++
 kernel/fork.c                  | 21 ++++++++++++-
 kernel/sched/grid/qos.c        | 20 ++++++-------
 mm/oom_kill.c                  | 55 +++++++++++++++++++++++++---------
 6 files changed, 95 insertions(+), 31 deletions(-)
From: Zheng Zucheng zhengzucheng@huawei.com
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9HSOM
CVE: NA
--------------------------------
Allocate a new task_struct_resvd object for the freshly cloned task, and rework the grid feature to access the grid_qos field through this extension object instead of through task_struct directly.
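As an aside for reviewers, a minimal sketch of the extension pattern (simplified, made-up names; the real definitions are in the sched.h hunk below): one pointer-sized KABI_RESERVE slot in the frozen struct is repurposed as a pointer to a separately allocated object, so new fields can grow there without changing task_struct's size or field offsets.

struct resvd_sketch {
	struct frozen_sketch *task;	/* back-pointer to the owning task */
	/* new feature fields are appended here, invisible to KABI checks */
};

struct frozen_sketch {
	/* ... existing, KABI-frozen fields ... */
	struct resvd_sketch *_resvd;	/* occupies the old reserved slot */
};

The back-pointer is what later lets code holding only the extension object (e.g. a timer embedded in it) recover the owning task_struct via container_of().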
Signed-off-by: Zheng Zucheng zhengzucheng@huawei.com
---
 include/linux/sched.h          | 20 +++++++++++++-----
 include/linux/sched/grid_qos.h |  2 +-
 init/init_task.c               |  5 +++++
 kernel/fork.c                  | 21 ++++++++++++++++++++-
 kernel/sched/grid/qos.c        | 20 ++++++++++----------
 5 files changed, 51 insertions(+), 17 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8fd8c5b7cdc6..2d59cba2fe64 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -620,6 +620,20 @@ typedef union {
 } fork_pid_t;
 #endif
 
+/*
+ * struct task_struct_resvd - KABI extension struct
+ */
+struct task_struct_resvd {
+	/*
+	 * pointer back to the main task_struct
+	 */
+	struct task_struct	*task;
+
+#ifdef CONFIG_QOS_SCHED_SMART_GRID
+	struct sched_grid_qos	*grid_qos;
+#endif
+};
+
 struct task_struct {
 #ifdef CONFIG_THREAD_INFO_IN_TASK
 	/*
@@ -1279,11 +1293,7 @@ struct task_struct {
 #endif
 
 #if !defined(__GENKSYMS__)
-#if defined(CONFIG_QOS_SCHED_SMART_GRID)
-	struct sched_grid_qos	*grid_qos;
-#else
-	KABI_RESERVE(8)
-#endif
+	struct task_struct_resvd	*_resvd;
 #else
 	KABI_RESERVE(8)
 #endif
diff --git a/include/linux/sched/grid_qos.h b/include/linux/sched/grid_qos.h
index 23d08dbb6ae6..93f663453e16 100644
--- a/include/linux/sched/grid_qos.h
+++ b/include/linux/sched/grid_qos.h
@@ -76,7 +76,7 @@ struct sched_grid_qos {
 
 static inline int sched_qos_affinity_set(struct task_struct *p)
 {
-	return p->grid_qos->affinity_set(p);
+	return p->_resvd->grid_qos->affinity_set(p);
 }
 
 int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig);
diff --git a/init/init_task.c b/init/init_task.c
index b312a045f4b9..db5b7461b9c3 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -50,6 +50,10 @@ static struct sighand_struct init_sighand = {
 	.signalfd_wqh	= __WAIT_QUEUE_HEAD_INITIALIZER(init_sighand.signalfd_wqh),
 };
 
+static struct task_struct_resvd init_task_struct_resvd = {
+	.task	= &init_task,
+};
+
 /*
  * Set up the first task table, touch at your own risk!. Base=0,
  * limit=0x1fffff (=2MB)
@@ -188,6 +192,7 @@ struct task_struct init_task
 		.fork_pid	= 0,
 	},
 #endif
+	._resvd		= &init_task_struct_resvd,
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index bfc4534ff116..02b676d10054 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -161,6 +161,7 @@ static inline struct task_struct *alloc_task_struct_node(int node)
 
 static inline void free_task_struct(struct task_struct *tsk)
 {
+	kfree(tsk->_resvd);
 	kmem_cache_free(task_struct_cachep, tsk);
 }
 #endif
@@ -845,6 +846,18 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 }
 
+static bool dup_resvd_task_struct(struct task_struct *dst,
+				  struct task_struct *orig, int node)
+{
+	dst->_resvd = kzalloc_node(sizeof(struct task_struct_resvd),
+				   GFP_KERNEL, node);
+	if (!dst->_resvd)
+		return false;
+
+	dst->_resvd->task = dst;
+	return true;
+}
+
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
 	struct task_struct *tsk;
@@ -857,6 +870,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	tsk = alloc_task_struct_node(node);
 	if (!tsk)
 		return NULL;
+	/*
+	 * before proceeding, we need to make tsk->_resvd = NULL,
+	 * otherwise the error paths below, if taken, might end up causing
+	 * a double-free for task_struct_resvd extension object.
+	 */
+	WRITE_ONCE(tsk->_resvd, NULL);
 
 	stack = alloc_thread_stack_node(tsk, node);
 	if (!stack)
@@ -882,7 +901,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 	atomic_set(&tsk->stack_refcount, 1);
 #endif
 
-	if (err)
+	if (err || !dup_resvd_task_struct(tsk, orig, node))
 		goto free_stack;
 
 #ifdef CONFIG_SECCOMP
diff --git a/kernel/sched/grid/qos.c b/kernel/sched/grid/qos.c
index f0f10dfb9fd4..b3df69d91499 100644
--- a/kernel/sched/grid/qos.c
+++ b/kernel/sched/grid/qos.c
@@ -26,7 +26,7 @@ static inline int qos_affinity_set(struct task_struct *p)
 {
 	int n;
-	struct sched_grid_qos_affinity *affinity = &p->grid_qos->affinity;
+	struct sched_grid_qos_affinity *affinity = &p->_resvd->grid_qos->affinity;
 
 	if (likely(affinity->prefer_cpus == p->select_cpus))
 		return 0;
@@ -58,18 +58,18 @@ int sched_grid_qos_fork(struct task_struct *p, struct task_struct *orig)
 	qos_stat_init(&qos->stat);
 
 	nodes_clear(qos->affinity.mem_preferred_node_mask);
-	if (likely(orig->grid_qos))
-		qos->affinity = orig->grid_qos->affinity;
+	if (likely(orig->_resvd->grid_qos))
+		qos->affinity = orig->_resvd->grid_qos->affinity;
 	qos->affinity_set = qos_affinity_set;
-	p->grid_qos = qos;
+	p->_resvd->grid_qos = qos;
 
 	return 0;
 }
 
 void sched_grid_qos_free(struct task_struct *p)
 {
-	kfree(p->grid_qos);
-	p->grid_qos = NULL;
+	kfree(p->_resvd->grid_qos);
+	p->_resvd->grid_qos = NULL;
 }
 
 /* dynamic select a more appropriate preferred interleave nid for process */
@@ -80,9 +80,9 @@ int sched_grid_preferred_interleave_nid(struct mempolicy *policy)
 	struct task_struct *me = current;
 	nodemask_t *preferred_nmask = NULL;
 
-	if (likely(me->grid_qos))
+	if (likely(me->_resvd->grid_qos))
 		preferred_nmask =
-			&me->grid_qos->affinity.mem_preferred_node_mask;
+			&me->_resvd->grid_qos->affinity.mem_preferred_node_mask;
 
 	if (!preferred_nmask || !policy)
 		return NUMA_NO_NODE;
@@ -111,9 +111,9 @@ int sched_grid_preferred_nid(int preferred_nid, nodemask_t *nodemask)
 	nodemask_t nmask, ndmask;
 	nodemask_t *preferred_nmask = NULL;
 
-	if (likely(current->grid_qos))
+	if (likely(current->_resvd->grid_qos))
 		preferred_nmask =
-			&current->grid_qos->affinity.mem_preferred_node_mask;
+			&current->_resvd->grid_qos->affinity.mem_preferred_node_mask;
 
 	if (!preferred_nmask)
 		return preferred_nid;
From: Nico Pache npache@redhat.com
mainline inclusion
from mainline-v5.18-rc4
commit e4a38402c36e42df28eb1a5394be87e6571fb48a
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9HSOM
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=...
--------------------------------
The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which can be targeted by the oom reaper. This mapping is used to store the futex robust list head; the kernel does not keep a copy of the robust list and instead references a userspace address to maintain the robustness during a process death.
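As a hedged illustration of that contract (not part of this series; glibc performs the equivalent registration per thread in allocatestack.c [1]), userspace hands the kernel only the address of the list head via set_robust_list(2), so the backing anonymous mapping must stay readable until the exit path walks it:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* The kernel stores this address verbatim and never copies the list. */
static struct robust_list_head head = {
	.list = { .next = &head.list },	/* empty circular list */
};

int register_robust_list(void)
{
	return syscall(SYS_set_robust_list, &head, sizeof(head));
}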
A race can occur between exit_mm and the oom reaper that allows the oom reaper to free the memory of the futex robust list before the exit path has handled the futex death:
	CPU1				CPU2
--------------------------------------------------------------------
	page_fault
	do_exit "signal"
	wake_oom_reaper
					oom_reaper
					oom_reap_task_mm (invalidates mm)
	exit_mm
	exit_mm_release
	futex_exit_release
	futex_cleanup
	exit_robust_list
	get_user (EFAULT- can't access memory)
If the get_user EFAULT's, the kernel will be unable to recover the waiters on the robust_list, leaving userspace mutexes hung indefinitely.
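A condensed sketch of the kernel-side walk (simplified from the futex exit code, which this series does not modify; name is made up) shows why a reaped mapping is fatal - every step is a get_user() read of the victim's address space:

/* Simplified from kernel/futex.c:exit_robust_list(). */
static void exit_robust_list_sketch(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry;

	/* Faults once oom_reap_task_mm() has unmapped the backing pages. */
	if (get_user(entry, &head->list.next))
		return;	/* the waiters are never woken */

	/* ... otherwise walk the list, marking held futexes OWNER_DIED ... */
}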
Delay the OOM reaper, allowing more time for the exit path to perform the futex cleanup.
Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
Based on a patch by Michal Hocko.
Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
Signed-off-by: Joel Savitz jsavitz@redhat.com
Signed-off-by: Nico Pache npache@redhat.com
Co-developed-by: Joel Savitz jsavitz@redhat.com
Suggested-by: Thomas Gleixner tglx@linutronix.de
Acked-by: Thomas Gleixner tglx@linutronix.de
Acked-by: Michal Hocko mhocko@suse.com
Cc: Rafael Aquini aquini@redhat.com
Cc: Waiman Long longman@redhat.com
Cc: Herton R. Krzesinski herton@redhat.com
Cc: Juri Lelli juri.lelli@redhat.com
Cc: Vincent Guittot vincent.guittot@linaro.org
Cc: Dietmar Eggemann dietmar.eggemann@arm.com
Cc: Steven Rostedt rostedt@goodmis.org
Cc: Ben Segall bsegall@google.com
Cc: Mel Gorman mgorman@suse.de
Cc: Daniel Bristot de Oliveira bristot@redhat.com
Cc: David Rientjes rientjes@google.com
Cc: Andrea Arcangeli aarcange@redhat.com
Cc: Davidlohr Bueso dave@stgolabs.net
Cc: Peter Zijlstra peterz@infradead.org
Cc: Ingo Molnar mingo@redhat.com
Cc: Joel Savitz jsavitz@redhat.com
Cc: Darren Hart dvhart@infradead.org
Cc: stable@vger.kernel.org
Signed-off-by: Andrew Morton akpm@linux-foundation.org
Signed-off-by: Linus Torvalds torvalds@linux-foundation.org
Conflicts:
	mm/oom_kill.c
Signed-off-by: Ma Wupeng mawupeng1@huawei.com
---
 include/linux/sched.h |  1 +
 mm/oom_kill.c         | 54 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 41 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2d59cba2fe64..dff84406bd49 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1236,6 +1236,7 @@ struct task_struct {
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
+	struct timer_list		oom_reaper_timer;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 4ad05a72bb8c..31d2bafc8a9b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -718,7 +718,7 @@ static void oom_reap_task(struct task_struct *tsk)
 	 */
 	set_bit(MMF_OOM_SKIP, &mm->flags);
 
-	/* Drop a reference taken by wake_oom_reaper */
+	/* Drop a reference taken by queue_oom_reaper */
 	put_task_struct(tsk);
 }
 
@@ -728,12 +728,12 @@ static int oom_reaper(void *unused)
 		struct task_struct *tsk = NULL;
 
 		wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
-		spin_lock(&oom_reaper_lock);
+		spin_lock_irq(&oom_reaper_lock);
 		if (oom_reaper_list != NULL) {
 			tsk = oom_reaper_list;
 			oom_reaper_list = tsk->oom_reaper_list;
 		}
-		spin_unlock(&oom_reaper_lock);
+		spin_unlock_irq(&oom_reaper_lock);
 
 		if (tsk)
 			oom_reap_task(tsk);
@@ -742,22 +742,48 @@ static int oom_reaper(void *unused)
 	return 0;
 }
 
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
 {
-	/* mm is already queued? */
-	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
-		return;
+	struct task_struct *tsk = container_of(timer, struct task_struct,
+			oom_reaper_timer);
+	struct mm_struct *mm = tsk->signal->oom_mm;
+	unsigned long flags;
 
-	get_task_struct(tsk);
+	/* The victim managed to terminate on its own - see exit_mmap */
+	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+		put_task_struct(tsk);
+		return;
+	}
 
-	spin_lock(&oom_reaper_lock);
+	spin_lock_irqsave(&oom_reaper_lock, flags);
 	tsk->oom_reaper_list = oom_reaper_list;
 	oom_reaper_list = tsk;
-	spin_unlock(&oom_reaper_lock);
+	spin_unlock_irqrestore(&oom_reaper_lock, flags);
 	trace_wake_reaper(tsk->pid);
 	wake_up(&oom_reaper_wait);
 }
 
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
+{
+	/* mm is already queued? */
+	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
+		return;
+
+	get_task_struct(tsk);
+	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+	add_timer(&tsk->oom_reaper_timer);
+}
+
 static int __init oom_init(void)
 {
 	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
@@ -765,7 +791,7 @@ static int __init oom_init(void)
 }
 subsys_initcall(oom_init)
 #else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
 {
 }
 #endif /* CONFIG_MMU */
@@ -1012,7 +1038,7 @@ static void __oom_kill_process(struct task_struct *victim)
 	rcu_read_unlock();
 
 	if (can_oom_reap)
-		wake_oom_reaper(victim);
+		queue_oom_reaper(victim);
 
 	mmdrop(mm);
 	put_task_struct(victim);
@@ -1053,7 +1079,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	task_lock(p);
 	if (task_will_free_mem(p)) {
 		mark_oom_victim(p);
-		wake_oom_reaper(p);
+		queue_oom_reaper(p);
 		task_unlock(p);
 		put_task_struct(p);
 		return;
@@ -1254,7 +1280,7 @@ bool out_of_memory(struct oom_control *oc)
 	 */
 	if (task_will_free_mem(current)) {
 		mark_oom_victim(current);
-		wake_oom_reaper(current);
+		queue_oom_reaper(current);
 		return true;
 	}
From: Ma Wupeng mawupeng1@huawei.com
hulk inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I9HSOM
CVE: NA
-------------------------------
Move oom_reaper_timer from task_struct to task_struct_resvd to fix the KABI breakage introduced by the previous patch.
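For context, a userspace-compilable sketch (made-up struct names, illustration only) of why the embedded timer breaks KABI: inserting a field shifts the offset of every later field and changes sizeof(), which modules built against the old layout cannot tolerate, whereas growing the separately allocated extension object leaves the frozen layout intact.

#include <stdio.h>

struct timer_sketch { void (*fn)(void *); unsigned long expires; };

struct task_frozen { long a; long b; };		/* old, frozen layout */
struct task_broken { long a; struct timer_sketch t; long b; };

int main(void)
{
	/* sizeof and the offset of 'b' both change -> KABI break */
	printf("frozen: %zu bytes, broken: %zu bytes\n",
	       sizeof(struct task_frozen), sizeof(struct task_broken));
	return 0;
}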
Signed-off-by: Ma Wupeng mawupeng1@huawei.com
---
 include/linux/sched.h |  4 +++-
 mm/oom_kill.c         | 11 ++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index dff84406bd49..26255b76ca52 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -629,6 +629,9 @@ struct task_struct_resvd {
 	 */
 	struct task_struct	*task;
 
+#ifdef CONFIG_MMU
+	struct timer_list	oom_reaper_timer;
+#endif
 #ifdef CONFIG_QOS_SCHED_SMART_GRID
 	struct sched_grid_qos	*grid_qos;
 #endif
@@ -1236,7 +1239,6 @@ struct task_struct {
 	int				pagefault_disabled;
 #ifdef CONFIG_MMU
 	struct task_struct		*oom_reaper_list;
-	struct timer_list		oom_reaper_timer;
 #endif
 #ifdef CONFIG_VMAP_STACK
 	struct vm_struct		*stack_vm_area;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 31d2bafc8a9b..d917e1e47f7d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -744,8 +744,9 @@ static int oom_reaper(void *unused)
 
 static void wake_oom_reaper(struct timer_list *timer)
 {
-	struct task_struct *tsk = container_of(timer, struct task_struct,
-			oom_reaper_timer);
+	struct task_struct_resvd *tsk_resvd = container_of(timer,
+			struct task_struct_resvd, oom_reaper_timer);
+	struct task_struct *tsk = tsk_resvd->task;
 	struct mm_struct *mm = tsk->signal->oom_mm;
 	unsigned long flags;
 
@@ -779,9 +780,9 @@ static void queue_oom_reaper(struct task_struct *tsk)
 		return;
 
 	get_task_struct(tsk);
-	timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
-	tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
-	add_timer(&tsk->oom_reaper_timer);
+	timer_setup(&tsk->_resvd->oom_reaper_timer, wake_oom_reaper, 0);
+	tsk->_resvd->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+	add_timer(&tsk->_resvd->oom_reaper_timer);
 }
 
 static int __init oom_init(void)
FeedBack: The patch(es) which you have sent to kernel@openeuler.org mailing list has been converted to a pull request successfully! Pull request link: https://gitee.com/openeuler/kernel/pulls/6494 Mailing list address: https://mailweb.openeuler.org/hyperkitty/list/kernel@openeuler.org/message/5...