From: Zhang Jian zhangjian210@huawei.com
It includes these patches: six patches for sharepool bugfixes and the ascend feature; one patch for a gic bugfix; one patch adding the ascend characteristic switch; one patch exporting some interfaces for ascend.
Guo Mengqi (1): sharepool: fix sp_alloc_populate no fallocate bug
Wang Wensheng (1): ascend/arm64: Add ascend_enable_all kernel parameter
Xu Qiang (2): irq-gic-v3: Fix too large cpu_count mm/sharepool: Fix add group failed with errno 28
Yuan Can (1): ascend: export interfaces required by ascend drivers
Zhang Zekun (2): mm/sharepool: Use "tgid" instead of "pid" to find a task mm: sharepool: Fix static check warning
Zhou Guanghui (2): mm: fix alloc CDM node memory for MPOL_BIND mm: fix ignore cpuset enforcement
arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/mm/init.c | 42 ++++++++++ drivers/irqchip/irq-gic-v3-its.c | 10 +++ include/linux/share_pool.h | 28 +++---- kernel/exit.c | 8 ++ kernel/fork.c | 4 - kernel/power/autosleep.c | 1 + kernel/workqueue.c | 3 + mm/hugetlb.c | 2 +- mm/memcontrol.c | 2 +- mm/oom_kill.c | 1 + mm/page_alloc.c | 6 +- mm/share_pool.c | 136 ++++++++++++++----------------- mm/vmalloc.c | 3 + 14 files changed, 152 insertions(+), 96 deletions(-)
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
This kernel parameter is used for the ascend scenario and enables all the needed options at once.
Signed-off-by: Wang Wensheng wangwensheng4@huawei.com --- arch/arm64/kernel/cpufeature.c | 2 +- arch/arm64/mm/init.c | 42 ++++++++++++++++++++++++++++++++++ mm/hugetlb.c | 2 +- mm/memcontrol.c | 2 +- 4 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index f5ce1e3a532f..159481996630 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1729,7 +1729,7 @@ static void cpu_enable_e0pd(struct arm64_cpu_capabilities const *cap) #endif /* CONFIG_ARM64_E0PD */
#ifdef CONFIG_ARM64_PSEUDO_NMI -static bool enable_pseudo_nmi; +bool enable_pseudo_nmi;
static int __init early_enable_pseudo_nmi(char *p) { diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7ef7c0e7ee7c..2fe037c19501 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -31,6 +31,8 @@ #include <linux/hugetlb.h> #include <linux/acpi_iort.h> #include <linux/pin_mem.h> +#include <linux/suspend.h> +#include <linux/nmi.h>
#include <asm/boot.h> #include <asm/fixmap.h> @@ -724,3 +726,43 @@ void dump_mem_limit(void) pr_emerg("Memory Limit: none\n"); } } + +void ascend_enable_all_features(void) +{ + if (IS_ENABLED(CONFIG_ASCEND_DVPP_MMAP)) + enable_mmap_dvpp = 1; + +#ifdef CONFIG_ASCEND_CHARGE_MIGRATE_HUGEPAGES + extern int enable_charge_mighp; + + enable_charge_mighp = 1; +#endif + +#ifdef CONFIG_SUSPEND + mem_sleep_current = PM_SUSPEND_ON; +#endif + +#ifdef CONFIG_MEMCG_KMEM + extern bool cgroup_memory_nokmem; + + cgroup_memory_nokmem = false; +#endif + +#ifdef CONFIG_ARM64_PSEUDO_NMI + extern bool enable_pseudo_nmi; + + enable_pseudo_nmi = true; +#endif + +#ifdef CONFIG_CORELOCKUP_DETECTOR + enable_corelockup_detector = true; +#endif +} + +static int __init ascend_enable_setup(char *__unused) +{ + ascend_enable_all_features(); + + return 0; +} +early_param("ascend_enable_all", ascend_enable_setup); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f5f8227b090b..af19c0bf024c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6212,7 +6212,7 @@ void __init hugetlb_cma_check(void) #endif /* CONFIG_CMA */
#ifdef CONFIG_ASCEND_FEATURES -static int enable_charge_mighp __read_mostly; +int enable_charge_mighp __read_mostly;
const struct hstate *hugetlb_get_hstate(void) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac07a0ffbe20..99e4e8d0242f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -84,7 +84,7 @@ DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg); static bool cgroup_memory_nosocket;
/* Kernel memory accounting disabled */ -static bool cgroup_memory_nokmem = true; +bool cgroup_memory_nokmem = true;
/* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP
From: Zhang Zekun zhangzekun11@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
To support the container scenario, use tgid instead of pid to find a specific task. In normal cases, "tgid" represents a process in init_pid_ns, so this patch should not introduce problems to existing code.
Rename the input parameter "int pid" to "int tgid" in following exported interfaces: 1.mg_sp_group_id_by_pid() 2.mg_sp_group_add_task() 3.mg_sp_group_del_task() 4.mg_sp_make_share_k2u() 5.mg_sp_make_share_u2k() 6.mg_sp_config_dvpp_range()
Besides, rename these static function together: 1.__sp_find_spg_locked() 2.__sp_find_spg()
The following function use "current->pid" to find spg, change "current->pid" to "current->tgid". 1.find_or_alloc_sp_group() 2.sp_alloc_prepare() 3.mg_sp_make_share_k2u()
Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- include/linux/share_pool.h | 24 +++++++------- mm/share_pool.c | 64 ++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 43 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 1432aaa08087..5e15e7a1234f 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -249,9 +249,9 @@ static inline void sp_init_mm(struct mm_struct *mm) /* * Those interfaces are exported for modules */ -extern int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id); -extern int mg_sp_group_del_task(int pid, int spg_id); -extern int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num); +extern int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id); +extern int mg_sp_group_del_task(int tgid, int spg_id); +extern int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num); extern int proc_sp_group_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task);
@@ -259,8 +259,8 @@ extern void *mg_sp_alloc(unsigned long size, unsigned long sp_flags, int spg_id) extern int mg_sp_free(unsigned long addr, int id);
extern void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, - unsigned long sp_flags, int pid, int spg_id); -extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid); + unsigned long sp_flags, int tgid, int spg_id); +extern void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid); extern int mg_sp_unshare(unsigned long va, unsigned long size, int spg_id);
extern int mg_sp_walk_page_range(unsigned long uva, unsigned long size, @@ -271,7 +271,7 @@ extern void mg_sp_walk_page_free(struct sp_walk_data *sp_walk_data); extern int sp_register_notifier(struct notifier_block *nb); extern int sp_unregister_notifier(struct notifier_block *nb);
-extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid); +extern bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid);
extern bool mg_is_sharepool_addr(unsigned long addr);
@@ -321,12 +321,12 @@ static inline bool is_vmalloc_sharepool(unsigned long vm_flags)
#else /* CONFIG_ASCEND_SHARE_POOL */
-static inline int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +static inline int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id) { return -EPERM; }
-static inline int mg_sp_group_del_task(int pid, int spg_id) +static inline int mg_sp_group_del_task(int tgid, int spg_id) { return -EPERM; } @@ -340,7 +340,7 @@ static inline void sp_group_post_exit(struct mm_struct *mm) { }
-static inline int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) +static inline int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num) { return -EPERM; } @@ -362,12 +362,12 @@ static inline int mg_sp_free(unsigned long addr, int id) }
static inline void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, - unsigned long sp_flags, int pid, int spg_id) + unsigned long sp_flags, int tgid, int spg_id) { return NULL; }
-static inline void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +static inline void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid) { return NULL; } @@ -410,7 +410,7 @@ static inline int sp_unregister_notifier(struct notifier_block *nb) return -EPERM; }
-static inline bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +static inline bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid) { return false; } diff --git a/mm/share_pool.c b/mm/share_pool.c index 68e8f5c93a1f..7a1b7f5caf54 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -975,12 +975,14 @@ static void sp_group_drop(struct sp_group *spg) }
/* use with put_task_struct(task) */ -static int get_task(int pid, struct task_struct **task) +static int get_task(int tgid, struct task_struct **task) { struct task_struct *tsk; + struct pid *p;
rcu_read_lock(); - tsk = find_task_by_vpid(pid); + p = find_pid_ns(tgid, &init_pid_ns); + tsk = pid_task(p, PIDTYPE_TGID); if (!tsk || (tsk->flags & PF_EXITING)) { rcu_read_unlock(); return -ESRCH; @@ -1010,14 +1012,14 @@ static bool is_process_in_group(struct sp_group *spg, }
/* user must call sp_group_drop() after use */ -static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) +static struct sp_group *__sp_find_spg_locked(int tgid, int spg_id) { struct sp_group *spg = NULL; struct task_struct *tsk = NULL; int ret = 0;
if (spg_id == SPG_ID_DEFAULT) { - ret = get_task(pid, &tsk); + ret = get_task(tgid, &tsk); if (ret) return NULL;
@@ -1039,19 +1041,19 @@ static struct sp_group *__sp_find_spg_locked(int pid, int spg_id) return spg; }
-static struct sp_group *__sp_find_spg(int pid, int spg_id) +static struct sp_group *__sp_find_spg(int tgid, int spg_id) { struct sp_group *spg;
down_read(&sp_group_sem); - spg = __sp_find_spg_locked(pid, spg_id); + spg = __sp_find_spg_locked(tgid, spg_id); up_read(&sp_group_sem); return spg; }
/** * mp_sp_group_id_by_pid() - Get the sp_group ID array of a process. - * @pid: pid of target process. + * @tgid: tgid of target process. * @spg_ids: point to an array to save the group ids the process belongs to * @num: input the spg_ids array size; output the spg number of the process * @@ -1061,7 +1063,7 @@ static struct sp_group *__sp_find_spg(int pid, int spg_id) * -EINVAL - spg_ids or num is NULL. * -E2BIG - the num of groups process belongs to is larger than *num */ -int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) +int mg_sp_group_id_by_pid(int tgid, int *spg_ids, int *num) { int ret = 0, real_count; struct sp_group_node *node; @@ -1076,7 +1078,7 @@ int mg_sp_group_id_by_pid(int pid, int *spg_ids, int *num) if (!spg_ids || !num || *num <= 0) return -EINVAL;
- ret = get_task(pid, &tsk); + ret = get_task(tgid, &tsk); if (ret) return ret;
@@ -1198,7 +1200,7 @@ static struct sp_group *find_or_alloc_sp_group(int spg_id, unsigned long flag) { struct sp_group *spg;
- spg = __sp_find_spg_locked(current->pid, spg_id); + spg = __sp_find_spg_locked(current->tgid, spg_id);
if (!spg) { spg = create_spg(spg_id, flag); @@ -1350,7 +1352,7 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg)
/** * mg_sp_group_add_task() - Add a process to an share group (sp_group). - * @pid: the pid of the task to be added. + * @tgid: the tgid of the task to be added. * @prot: the prot of task for this spg. * @spg_id: the ID of the sp_group. * @flag: to give some special message. @@ -1364,7 +1366,7 @@ static int local_group_add_task(struct mm_struct *mm, struct sp_group *spg) * The automatically allocated ID is between [SPG_ID_AUTO_MIN, SPG_ID_AUTO_MAX]. * When negative, the return value is -errno. */ -int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) +int mg_sp_group_add_task(int tgid, unsigned long prot, int spg_id) { unsigned long flag = 0; struct task_struct *tsk; @@ -1393,7 +1395,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id) }
if (spg_id >= SPG_ID_AUTO_MIN && spg_id <= SPG_ID_AUTO_MAX) { - spg = __sp_find_spg(pid, spg_id); + spg = __sp_find_spg(tgid, spg_id);
if (!spg) { pr_err_ratelimited("spg %d hasn't been created\n", spg_id); @@ -1424,7 +1426,7 @@ int mg_sp_group_add_task(int pid, unsigned long prot, int spg_id)
down_write(&sp_group_sem);
- ret = get_task(pid, &tsk); + ret = get_task(tgid, &tsk); if (ret) { up_write(&sp_group_sem); free_new_spg_id(id_newly_generated, spg_id); @@ -1599,7 +1601,7 @@ EXPORT_SYMBOL_GPL(mg_sp_group_add_task);
/** * mg_sp_group_del_task() - delete a process from a sp group. - * @pid: the pid of the task to be deleted + * @tgid: the tgid of the task to be deleted * @spg_id: sharepool group id * * the group's spa list must be empty, or deletion will fail. @@ -1607,9 +1609,9 @@ EXPORT_SYMBOL_GPL(mg_sp_group_add_task); * Return: * * if success, return 0. * * -EINVAL, spg_id invalid or spa_lsit not emtpy or spg dead - * * -ESRCH, the task group of pid is not in group / process dead + * * -ESRCH, the task group of tgid is not in group / process dead */ -int mg_sp_group_del_task(int pid, int spg_id) +int mg_sp_group_del_task(int tgid, int spg_id) { int ret = 0; struct sp_group *spg; @@ -1626,7 +1628,7 @@ int mg_sp_group_del_task(int pid, int spg_id) return -EINVAL; }
- spg = __sp_find_spg(pid, spg_id); + spg = __sp_find_spg(tgid, spg_id); if (!spg) { pr_err_ratelimited("spg not found or get task failed."); return -EINVAL; @@ -1647,7 +1649,7 @@ int mg_sp_group_del_task(int pid, int spg_id) goto out; }
- ret = get_task(pid, &tsk); + ret = get_task(tgid, &tsk); if (ret) { up_write(&sp_group_sem); pr_err_ratelimited("task is not found"); @@ -1759,7 +1761,7 @@ static void insert_sp_area(struct sp_mapping *spm, struct sp_area *spa) * @flags: how to allocate the memory. * @spg: the share group that the memory is allocated to. * @type: the type of the region. - * @applier: the pid of the task which allocates the region. + * @applier: the tgid of the task which allocates the region. * * Return: a valid pointer for success, NULL on failure. */ @@ -2318,7 +2320,7 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, sp_flags |= SP_HUGEPAGE;
if (spg_id != SPG_ID_DEFAULT) { - spg = __sp_find_spg(current->pid, spg_id); + spg = __sp_find_spg(current->tgid, spg_id); if (!spg) { pr_err_ratelimited("allocation failed, can't find group\n"); return -ENODEV; @@ -2921,7 +2923,7 @@ static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc) * @kva: the VA of shared kernel memory. * @size: the size of shared kernel memory. * @sp_flags: how to allocate the memory. We only support SP_DVPP. - * @pid: the pid of the specified process (Not currently in use). + * @tgid: the tgid of the specified process (Not currently in use). * @spg_id: the share group that the memory is shared to. * * Return: the shared target user address to start at @@ -2934,7 +2936,7 @@ static void *sp_k2u_finish(void *uva, struct sp_k2u_context *kc) * * if fail, return the pointer of -errno. */ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, - unsigned long sp_flags, int pid, int spg_id) + unsigned long sp_flags, int tgid, int spg_id) { void *uva; int ret; @@ -2954,7 +2956,7 @@ void *mg_sp_make_share_k2u(unsigned long kva, unsigned long size, } else { struct sp_group *spg;
- spg = __sp_find_spg(current->pid, kc.spg_id); + spg = __sp_find_spg(current->tgid, kc.spg_id); if (spg) { ret = sp_check_caller_permission(spg, current->mm); if (ret < 0) { @@ -3214,13 +3216,13 @@ static void __sp_walk_page_free(struct sp_walk_data *data) * mg_sp_make_share_u2k() - Share user memory of a specified process to kernel. * @uva: the VA of shared user memory * @size: the size of shared user memory - * @pid: the pid of the specified process(Not currently in use) + * @tgid: the tgid of the specified process(Not currently in use) * * Return: * * if success, return the starting kernel address of the shared memory. * * if failed, return the pointer of -errno. */ -void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) +void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int tgid) { int ret = 0; struct mm_struct *mm = current->mm; @@ -3282,7 +3284,7 @@ void *mg_sp_make_share_u2k(unsigned long uva, unsigned long size, int pid) EXPORT_SYMBOL_GPL(mg_sp_make_share_u2k);
/* - * Input parameters uva, pid and spg_id are now useless. spg_id will be useful + * Input parameters uva, tgid and spg_id are now useless. spg_id will be useful * when supporting a process in multiple sp groups. * * Procedure of unshare uva must be compatible with: @@ -3612,13 +3614,13 @@ static bool is_sp_dynamic_dvpp_addr(unsigned long addr); * @start: the value of share pool start * @size: the value of share pool * @device_id: the num of Da-vinci device - * @pid: the pid of device process + * @tgid: the tgid of device process * * Return true for success. * Return false if parameter invalid or has been set up. * This functuon has no concurrent problem. */ -bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) +bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int tgid) { int ret; bool err = false; @@ -3632,12 +3634,12 @@ bool mg_sp_config_dvpp_range(size_t start, size_t size, int device_id, int pid) return false;
/* NOTE: check the start address */ - if (pid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || + if (tgid < 0 || size <= 0 || size > MMAP_SHARE_POOL_16G_SIZE || device_id < 0 || device_id >= MAX_DEVID || !is_online_node_id(device_id) || !is_sp_dynamic_dvpp_addr(start) || !is_sp_dynamic_dvpp_addr(start + size - 1)) return false;
- ret = get_task(pid, &tsk); + ret = get_task(tgid, &tsk); if (ret) return false;
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
Fix the issue where the CPU value is too large in its_inc_lpi_count().
Signed-off-by: Xu Qiang xuqiang36@huawei.com --- drivers/irqchip/irq-gic-v3-its.c | 10 ++++++++++ 1 file changed, 10 insertions(+)
diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 81271fd8954f..1cb392fb16d0 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -1543,6 +1543,11 @@ static __maybe_unused u32 its_read_lpi_count(struct irq_data *d, int cpu)
static void its_inc_lpi_count(struct irq_data *d, int cpu) { +#ifdef CONFIG_ASCEND_INIT_ALL_GICR + if (cpu >= nr_cpu_ids) + return; +#endif + if (irqd_affinity_is_managed(d)) atomic_inc(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed); else @@ -1551,6 +1556,11 @@ static void its_inc_lpi_count(struct irq_data *d, int cpu)
static void its_dec_lpi_count(struct irq_data *d, int cpu) { +#ifdef CONFIG_ASCEND_INIT_ALL_GICR + if (cpu >= nr_cpu_ids) + return; +#endif + if (irqd_affinity_is_managed(d)) atomic_dec(&per_cpu_ptr(&cpu_lpi_count, cpu)->managed); else
From: Zhang Zekun zhangzekun11@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
Fix the following static check warning: use parentheses to specify the order of evaluation of expressions instead of relying on the default operator precedence. Parentheses should be used with bitwise operators.
Fix this by adding parentheses to the expression.
Signed-off-by: Zhang Zekun zhangzekun11@huawei.com --- mm/share_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index 7a1b7f5caf54..8c30f0a5e3a1 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2693,7 +2693,7 @@ static unsigned long sp_remap_kva_to_vma(unsigned long kva, struct sp_area *spa, if (prot & PROT_WRITE) vma->vm_page_prot = __pgprot(((~PTE_RDONLY) & vma->vm_page_prot.pgprot) | PTE_DIRTY);
- if (kc && kc->sp_flags & SP_PROT_RO) + if (kc && (kc->sp_flags & SP_PROT_RO)) vma->vm_flags &= ~VM_MAYWRITE;
if (is_vm_hugetlb_page(vma)) {
From: Xu Qiang xuqiang36@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
We increase task->mm->mm_users by one when we add the task to a sharepool group. Correspondingly we should drop the mm_users count when the task exits. Currently we hijack the mmput function and make it return early and decrease mm->mm_users by one (just as mmput would do) if it is not called from a task's exiting process, or we decrease mm->mm_users by the group number the task was added to. This has two problems: 1. It makes mmput and sp_group_exit hard to understand. 2. The process of judging if the task (also its mm) is exiting and decrease its mm_users count is not atomic. We use this condition: mm->mm_users == master->count + MM_WOULD_FREE(1) If someone else change the mm->mm_users during those two steps, the mm->mm_users would be wrong and mm_struct cannot be released anymore.
Suppose the following process:
proc1 proc2
1) mmput | V 2) enter sp_group_exit and 'mm->mm_users == master->count + 1' is true 3) | mmget V 4) decrease mm->mm_users by master->count | V 5) enter __mmput and release mm_struct if mm->mm_users == 1 6) mmput
The statistical structure that has the same id as the task would get leaked together with the mm_struct, so the next time we try to create a statistical structure with the same id, we get a failure.
We fix this by moving sp_group_exit into do_exit(), where the task is actually exiting. We don't need to judge if the task is exiting when someone calls mmput, so there is no chance to change mm_users wrongly.
Signed-off-by: Xu Qiang xuqiang36@huawei.com Signed-off-by: Wang Wensheng wangwensheng4@huawei.com --- include/linux/share_pool.h | 4 ++-- kernel/exit.c | 8 ++++++++ kernel/fork.c | 4 ---- mm/share_pool.c | 38 ++++++++------------------------------ 4 files changed, 18 insertions(+), 36 deletions(-)
diff --git a/include/linux/share_pool.h b/include/linux/share_pool.h index 5e15e7a1234f..8190c8d82439 100644 --- a/include/linux/share_pool.h +++ b/include/linux/share_pool.h @@ -278,7 +278,7 @@ extern bool mg_is_sharepool_addr(unsigned long addr); extern int mg_sp_id_of_current(void);
extern void sp_area_drop(struct vm_area_struct *vma); -extern int sp_group_exit(struct mm_struct *mm); +extern int sp_group_exit(void); extern void sp_group_post_exit(struct mm_struct *mm); vm_fault_t sharepool_no_page(struct mm_struct *mm, struct vm_area_struct *vma, @@ -331,7 +331,7 @@ static inline int mg_sp_group_del_task(int tgid, int spg_id) return -EPERM; }
-static inline int sp_group_exit(struct mm_struct *mm) +static inline int sp_group_exit(void) { return 0; } diff --git a/kernel/exit.c b/kernel/exit.c index ab900b661867..d612cb5b5943 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -64,6 +64,7 @@ #include <linux/rcuwait.h> #include <linux/compat.h> #include <linux/io_uring.h> +#include <linux/share_pool.h>
#include <linux/uaccess.h> #include <asm/unistd.h> @@ -795,6 +796,13 @@ void __noreturn do_exit(long code) tsk->exit_code = code; taskstats_exit(tsk, group_dead);
+ /* + * sp_group_exit must be executed before exit_mm, + * otherwise it will cause mm leakage. + */ + if (group_dead) + sp_group_exit(); + exit_mm();
if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 8a2e827815b6..8dbb8d985e78 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -96,7 +96,6 @@ #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> -#include <linux/share_pool.h>
#include <linux/share_pool.h> #include <asm/pgalloc.h> @@ -1116,9 +1115,6 @@ void mmput(struct mm_struct *mm) { might_sleep();
- if (sp_group_exit(mm)) - return; - if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } diff --git a/mm/share_pool.c b/mm/share_pool.c index 8c30f0a5e3a1..d170929c1e89 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -4276,34 +4276,13 @@ vm_fault_t sharepool_no_page(struct mm_struct *mm, goto out; }
-#define MM_WOULD_FREE 1 - /* - * Recall we add mm->users by 1 deliberately in sp_group_add_task(). - * If the mm_users == sp_group_master->count + 1, it means that the mm is ready - * to be freed because the last owner of this mm is in exiting procedure: - * do_exit() -> exit_mm() -> mmput() -> sp_group_exit -> THIS function. + * The caller must ensure that this function is called + * when the last thread in the thread group exits. */ -static bool need_free_sp_group(struct mm_struct *mm, - struct sp_group_master *master) -{ - /* thread exits but process is still alive */ - if ((unsigned int)atomic_read(&mm->mm_users) != master->count + MM_WOULD_FREE) { - if (atomic_dec_and_test(&mm->mm_users)) - WARN(1, "Invalid user counting\n"); - return false; - } - - return true; -} - -/* - * Return: - * 1 - let mmput() return immediately - * 0 - let mmput() decrease mm_users and try __mmput() - */ -int sp_group_exit(struct mm_struct *mm) +int sp_group_exit(void) { + struct mm_struct *mm; struct sp_group *spg; struct sp_group_master *master; struct sp_group_node *spg_node, *tmp; @@ -4312,6 +4291,10 @@ int sp_group_exit(struct mm_struct *mm) if (!sp_is_enabled()) return 0;
+ if (current->flags & PF_KTHREAD) + return 0; + + mm = current->mm; down_write(&sp_group_sem);
master = mm->sp_group_master; @@ -4320,11 +4303,6 @@ int sp_group_exit(struct mm_struct *mm) return 0; }
- if (!need_free_sp_group(mm, master)) { - up_write(&sp_group_sem); - return 1; - } - list_for_each_entry_safe(spg_node, tmp, &master->node_list, group_node) { spg = spg_node->spg;
From: Guo Mengqi guomengqi3@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
do_mm_populate() will try to allocate physical pages for the required range [start, end), and return an error on the first allocation failure without releasing the pages allocated before. That means we must release the shared-file range after calling do_mm_populate().
Remove need_fallocate, and always call sp_fallocate() on the error path of sp_alloc_mmap_populate().
Signed-off-by: Guo Mengqi guomengqi3@huawei.com --- mm/share_pool.c | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-)
diff --git a/mm/share_pool.c b/mm/share_pool.c index d170929c1e89..e9d2ae0a5352 100644 --- a/mm/share_pool.c +++ b/mm/share_pool.c @@ -2284,7 +2284,6 @@ struct sp_alloc_context { unsigned long sp_flags; unsigned long populate; int state; - bool need_fallocate; bool have_mbind; enum spa_type type; }; @@ -2362,7 +2361,6 @@ static int sp_alloc_prepare(unsigned long size, unsigned long sp_flags, ac->size = size; ac->sp_flags = sp_flags; ac->state = ALLOC_NORMAL; - ac->need_fallocate = false; ac->have_mbind = false; return 0; } @@ -2457,6 +2455,7 @@ static int sp_alloc_populate(struct mm_struct *mm, struct sp_area *spa, * page fault later on, and more importantly sp_make_share_u2k() * depends on this feature (and MAP_LOCKED) to work correctly. */ + return do_mm_populate(mm, spa->va_start, ac->populate, 0); }
@@ -2477,6 +2476,7 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, int ret;
ret = sp_alloc_mmap(mm, spa, spg_node, ac); + if (ret < 0) return ret;
@@ -2485,21 +2485,18 @@ static int __sp_alloc_mmap_populate(struct mm_struct *mm, struct sp_area *spa, if (ret < 0) { pr_err("cannot bind the memory range to specified node:%d, err:%d\n", spa->node_id, ret); - goto err; + return ret; } ac->have_mbind = true; }
ret = sp_alloc_populate(mm, spa, ac); if (ret) { -err: if (unlikely(fatal_signal_pending(current))) pr_warn_ratelimited("allocation failed, current thread is killed\n"); else pr_warn_ratelimited("allocation failed due to mm populate failed(potential no enough memory when -12): %d\n", ret); - } else { - ac->need_fallocate = true; } return ret; } @@ -2517,8 +2514,16 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, mm = spg_node->master->mm; mmap_ret = __sp_alloc_mmap_populate(mm, spa, spg_node, ac); if (mmap_ret) { + + /* + * Goto fallback procedure upon ERR_VALUE, + * but skip the coredump situation, + * because we don't want one misbehaving process to affect others. + */ if (ac->state != ALLOC_COREDUMP) goto unmap; + + /* Reset state and discard the coredump error. */ ac->state = ALLOC_NORMAL; continue; } @@ -2533,11 +2538,16 @@ static int sp_alloc_mmap_populate(struct sp_area *spa, end_mm = list_next_entry(spg_node, proc_node)->master->mm; sp_alloc_unmap(end_mm, spa, spg_node);
- /* only fallocate spa if physical memory had been allocated */ - if (ac->need_fallocate) { - sp_fallocate(spa); - ac->need_fallocate = false; - } + /* + * Sometimes do_mm_populate() allocates some memory and then failed to + * allocate more. (e.g. memory use reaches cgroup limit.) + * In this case, it will return enomem, but will not free the + * memory which has already been allocated. + * + * So if __sp_alloc_mmap_populate fails, always call sp_fallocate() + * to make sure backup physical memory of the shared file is freed. + */ + sp_fallocate(spa);
/* if hugepage allocation fails, this will transfer to normal page * and try again. (only if SP_HUGEPAGE_ONLY is not flagged
From: Yuan Can yuancan@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
--------------------------------
Export oom_type_notifier_call map_kernel_range and __get_vm_area_caller for ascend drivers. Export pm_autosleep_set_state and __vmalloc_node_range. Export alloc_workqueue_attrs, free_workqueue_attrs and apply_workqueue_attrs.
Signed-off-by: Yuan Can yuancan@huawei.com --- kernel/power/autosleep.c | 1 + kernel/workqueue.c | 3 +++ mm/oom_kill.c | 1 + mm/vmalloc.c | 3 +++ 4 files changed, 8 insertions(+)
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c index 9af5a50d3489..6aee5077fbfa 100644 --- a/kernel/power/autosleep.c +++ b/kernel/power/autosleep.c @@ -113,6 +113,7 @@ int pm_autosleep_set_state(suspend_state_t state) mutex_unlock(&autosleep_lock); return 0; } +EXPORT_SYMBOL_GPL(pm_autosleep_set_state);
int __init pm_autosleep_init(void) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 14d4c072c79b..422ee6312475 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3353,6 +3353,7 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) kfree(attrs); } } +EXPORT_SYMBOL_GPL(free_workqueue_attrs);
/** * alloc_workqueue_attrs - allocate a workqueue_attrs @@ -3378,6 +3379,7 @@ struct workqueue_attrs *alloc_workqueue_attrs(void) free_workqueue_attrs(attrs); return NULL; } +EXPORT_SYMBOL_GPL(alloc_workqueue_attrs);
static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from) @@ -4092,6 +4094,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
return ret; } +EXPORT_SYMBOL_GPL(apply_workqueue_attrs);
/** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ffbe8fe2bbf6..dd2b4f890403 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1119,6 +1119,7 @@ int oom_type_notifier_call(unsigned int type, struct oom_control *oc)
return blocking_notifier_call_chain(&oom_type_notify_list, type, NULL); } +EXPORT_SYMBOL_GPL(oom_type_notifier_call); #endif
/** diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dadbea29241d..d7a68eb0db42 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -643,6 +643,7 @@ int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, flush_cache_vmap(start, start + size); return ret; } +EXPORT_SYMBOL(map_kernel_range);
int is_vmalloc_or_module_addr(const void *x) { @@ -2460,6 +2461,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, NUMA_NO_NODE, GFP_KERNEL, caller); } +EXPORT_SYMBOL(__get_vm_area_caller);
/** * get_vm_area - reserve a contiguous kernel virtual area @@ -3058,6 +3060,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
return NULL; } +EXPORT_SYMBOL_GPL(__vmalloc_node_range);
/** * __vmalloc_node - allocate virtually contiguous memory
From: Zhou Guanghui zhouguanghui1@huawei.com
hulk inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
------------------------------------------------------
Memory can be allocated from a specified CDM node only when it is allowed to apply for memory from the CDM node. Otherwise, memory will be allocated from other non-CDM nodes that are not allowed by the cpuset.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index efa0d4479e6e..07f980c165fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3867,7 +3867,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, (alloc_flags & ALLOC_CPUSET) && !__cpuset_zone_allowed(zone, gfp_mask) #ifdef CONFIG_COHERENT_DEVICE - && !(alloc_flags & ALLOC_CDM) + && (!is_cdm_node(zone->zone_pgdat->node_id) || + !(alloc_flags & ALLOC_CDM)) #endif ) continue;
From: Zhou Guanghui zhouguanghui1@huawei.com
ascend inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/I612UG CVE: NA
-----------------------------------------------------
Since the current condition ignores cpuset enforcement whenever __GFP_THISNODE is added to the gfp_mask, allocations that specify __GFP_THISNODE on non-CDM nodes end up not subject to cpuset restrictions.
For example, procA pid 1000: node 0 cpus: 0 1 2 3 node 0 free: 57199MB node 1 cpus: 4 5 6 7 node 1 free: 55930MB
cpuset/test/cpuset.mems 1 cpuset/test/tasks 1000 cpuset/test/cpuset.cpus 0-3
No cdm node exists. When procA malloc 100MB memory, the result is: node 0 cpus: 0 1 2 3 node 0 free: 57099MB node 1 cpus: 4 5 6 7 node 1 free: 55930MB This is not what we expected, and in fact 100 MB of memory should be allocated from node1. The reason for this problem is that THP specifies __GFP_THISNODE to attempt to allocate from the local node.
Therefore, the cpuset enforcement should be ignored only when explicitly allocating memory from the CDM node using __GFP_THISNODE.
Signed-off-by: Zhou Guanghui zhouguanghui1@huawei.com --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07f980c165fb..4d768dc5956d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4940,7 +4940,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, ac->migratetype = gfp_migratetype(gfp_mask);
#ifdef CONFIG_COHERENT_DEVICE - if (cpusets_enabled() && !(*alloc_gfp & __GFP_THISNODE)) { + if (cpusets_enabled() && + (!(*alloc_gfp & __GFP_THISNODE) || !is_cdm_node(preferred_nid))) { #else if (cpusets_enabled()) { #endif