From: Xu Qiang <xuqiang36@huawei.com>
ascend inclusion
category: bugfix
bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG
CVE: NA
--------------------------------
We increase task->mm->mm_users by one when we add the task to a
sharepool group. Correspondingly we should drop the mm_users count when
the task exits.

Currently we hijack the mmput function: if it is not called from the
task's exiting process, we make it return early and decrease
mm->mm_users by one (just as mmput would do); otherwise we decrease
mm->mm_users by the number of groups the task was added to. This has
two problems:

1. It makes mmput and sp_group_exit hard to understand.

2. Judging whether the task (and thus its mm) is exiting and decreasing
   its mm_users count are two separate steps, not one atomic operation.
   We use this condition:

       mm->mm_users == master->count + MM_WOULD_FREE(1)

   If someone else changes mm->mm_users between those two steps, the
   mm->mm_users count goes wrong and the mm_struct can never be
   released.
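For illustration, the old scheme boils down to the following sketch
(condensed from the code removed below; locking is dropped and the
per-group teardown is collapsed into a single atomic_sub):

	/* old scheme hijacking mmput(); simplified sketch, not the literal code */
	int sp_group_exit(struct mm_struct *mm)
	{
		struct sp_group_master *master = mm->sp_group_master;

		/* step 1: check -- another task may mmget() right after this */
		if (atomic_read(&mm->mm_users) != master->count + MM_WOULD_FREE) {
			atomic_dec(&mm->mm_users);	/* thread exit: act like mmput() */
			return 1;			/* make mmput() return early */
		}

		/* step 2: drop the group references -- not atomic with step 1 */
		atomic_sub(master->count, &mm->mm_users);
		return 0;	/* let mmput() dec mm_users and try __mmput() */
	}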
Suppose the following process:
          proc1                                proc2

1) mmput
       |
       V
2) enter sp_group_exit and
   'mm->mm_users == master->count + 1'
   is true
3)     |                                       mmget
       V
4) decrease mm->mm_users
   by master->count
       |
       V
5) enter __mmput and release
   mm_struct if mm->mm_users == 1
6)                                             mmput
The statistical structure that has the same id as the task gets leaked
together with the mm_struct, so the next attempt to create a
statistical structure with the same id fails.
We fix this by moving sp_group_exit into do_exit(), where the task is
actually exiting. Then we no longer need to judge in mmput whether the
task is exiting, so there is no window in which mm_users can be changed
wrongly.
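Condensed, the exit path now looks like this (a sketch showing only the
calls relevant here; see the diff below for the real change):

	/* kernel/exit.c: new ordering in do_exit(), sketch */
	if (group_dead)
		sp_group_exit();	/* drop the references the groups hold */

	exit_mm();	/* mmput() is a plain atomic_dec_and_test() again */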
Signed-off-by: Xu Qiang <xuqiang36@huawei.com>
Signed-off-by: Wang Wensheng <wangwensheng4@huawei.com>
---
 include/linux/share_pool.h |  4 ++--
 kernel/exit.c              |  8 ++++++++
 kernel/fork.c              |  4 ----
 mm/share_pool.c            | 38 ++++++++------------------------------
 4 files changed, 18 insertions(+), 36 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h
index 5e15e7a1234f..8190c8d82439 100644
--- a/include/linux/share_pool.h
+++ b/include/linux/share_pool.h
@@ -278,7 +278,7 @@ extern bool mg_is_sharepool_addr(unsigned long addr);
 extern int mg_sp_id_of_current(void);
 extern void sp_area_drop(struct vm_area_struct *vma);
-extern int sp_group_exit(struct mm_struct *mm);
+extern int sp_group_exit(void);
 extern void sp_group_post_exit(struct mm_struct *mm);
 
 vm_fault_t sharepool_no_page(struct mm_struct *mm,
 			struct vm_area_struct *vma,
@@ -331,7 +331,7 @@ static inline int mg_sp_group_del_task(int tgid, int spg_id)
 	return -EPERM;
 }
 
-static inline int sp_group_exit(struct mm_struct *mm)
+static inline int sp_group_exit(void)
 {
 	return 0;
 }
diff --git a/kernel/exit.c b/kernel/exit.c
index ab900b661867..d612cb5b5943 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -64,6 +64,7 @@
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
 #include <linux/io_uring.h>
+#include <linux/share_pool.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -795,6 +796,13 @@ void __noreturn do_exit(long code)
 	tsk->exit_code = code;
 	taskstats_exit(tsk, group_dead);
 
+	/*
+	 * sp_group_exit must be executed before exit_mm,
+	 * otherwise it will cause mm leakage.
+	 */
+	if (group_dead)
+		sp_group_exit();
+
 	exit_mm();
 
 	if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 8a2e827815b6..8dbb8d985e78 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -96,7 +96,6 @@
 #include <linux/kasan.h>
 #include <linux/scs.h>
 #include <linux/io_uring.h>
-#include <linux/share_pool.h>
 
 #include <linux/share_pool.h>
 #include <asm/pgalloc.h>
@@ -1116,9 +1115,6 @@ void mmput(struct mm_struct *mm)
 {
 	might_sleep();
 
-	if (sp_group_exit(mm))
-		return;
-
 	if (atomic_dec_and_test(&mm->mm_users))
 		__mmput(mm);
 }
diff --git a/mm/share_pool.c b/mm/share_pool.c
index 8c30f0a5e3a1..d170929c1e89 100644
--- a/mm/share_pool.c
+++ b/mm/share_pool.c
@@ -4276,34 +4276,13 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm,
 	goto out;
 }
 
-#define MM_WOULD_FREE	1
-
 /*
- * Recall we add mm->users by 1 deliberately in sp_group_add_task().
- * If the mm_users == sp_group_master->count + 1, it means that the mm is ready
- * to be freed because the last owner of this mm is in exiting procedure:
- * do_exit() -> exit_mm() -> mmput() -> sp_group_exit -> THIS function.
+ * The caller must ensure that this function is called
+ * when the last thread in the thread group exits.
  */
-static bool need_free_sp_group(struct mm_struct *mm,
-			       struct sp_group_master *master)
-{
-	/* thread exits but process is still alive */
-	if ((unsigned int)atomic_read(&mm->mm_users) != master->count + MM_WOULD_FREE) {
-		if (atomic_dec_and_test(&mm->mm_users))
-			WARN(1, "Invalid user counting\n");
-		return false;
-	}
-
-	return true;
-}
-
-/*
- * Return:
- * 1 - let mmput() return immediately
- * 0 - let mmput() decrease mm_users and try __mmput()
- */
-int sp_group_exit(struct mm_struct *mm)
+int sp_group_exit(void)
 {
+	struct mm_struct *mm;
 	struct sp_group *spg;
 	struct sp_group_master *master;
 	struct sp_group_node *spg_node, *tmp;
@@ -4312,6 +4291,10 @@ int sp_group_exit(struct mm_struct *mm)
 	if (!sp_is_enabled())
 		return 0;
 
+	if (current->flags & PF_KTHREAD)
+		return 0;
+
+	mm = current->mm;
 	down_write(&sp_group_sem);
 
 	master = mm->sp_group_master;
@@ -4320,11 +4303,6 @@ int sp_group_exit(struct mm_struct *mm)
 		return 0;
 	}
 
-	if (!need_free_sp_group(mm, master)) {
-		up_write(&sp_group_sem);
-		return 1;
-	}
-
 	list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) {
 		spg = spg_node->spg;