[PATCH OLK-5.10] mm: proc: use fine-grained mmap_lock for numa_maps vma traversal
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/ID6KMQ

--------------------------------

Reading /proc/pid/numa_maps currently holds mmap_lock across the whole
vma traversal: the lock is acquired in m_start() and released in
m_stop(). Holding it that long can block higher-priority writers and
leads to poor performance.

Make the locking finer-grained: the new numa_maps_next() drops and
reacquires mmap_lock between vma iterations whenever the lock is
contended. This avoids priority inversion and improves concurrency, at
the cost of the output no longer being atomic across the entire address
space.

Signed-off-by: Qi Xi <xiqi2@huawei.com>
---
 fs/proc/task_mmu.c       | 113 +++++++++++++++++++++++++++++++++++++--
 include/linux/pagewalk.h |   2 +
 mm/pagewalk.c            |  27 ++++++++++
 3 files changed, 138 insertions(+), 4 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7b41f7c290ce..4370eb895e76 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -182,6 +182,102 @@ static void *m_next(struct seq_file *m, void *v, loff_t *ppos)
 	return next;
 }
 
+static void *numa_maps_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct proc_maps_private *priv = m->private;
+	struct vm_area_struct *next = NULL;
+	struct vm_area_struct *vma = v;
+	struct mm_struct *mm = priv->mm;
+	unsigned long last_vma_end;
+	unsigned long last_vma_start;
+	int ret;
+
+	if (!mm)
+		goto out;
+
+	if (vma == priv->tail_vma)
+		goto out;
+
+	if (vma->vm_next)
+		next = vma->vm_next;
+	else
+		next = priv->tail_vma;
+
+	last_vma_end = vma->vm_end;
+	last_vma_start = vma->vm_start;
+	if (mmap_lock_is_contended(mm)) {
+		mmap_read_unlock(mm);
+		ret = mmap_read_lock_killable(mm);
+		if (ret) {
+			mmput(mm);
+			put_task_struct(priv->task);
+			priv->task = NULL;
+			return ERR_PTR(ret);
+		}
+
+		/*
+		 * After dropping the lock, there are four cases to
+		 * consider. See the following example for explanation.
+		 *
+		 *   +------+------+-----------+
+		 *   | VMA1 | VMA2 |    VMA3   |
+		 *   +------+------+-----------+
+		 *   |      |      |           |
+		 *  4k     8k     16k         400k
+		 *
+		 * Suppose we drop the lock after reading VMA2 due to
+		 * contention, then we get:
+		 *
+		 *	last_vma_end = 16k
+		 *
+		 * 1) VMA2 is freed, but VMA3 exists:
+		 *
+		 *    find_vma(mm, 16k - 1) will return VMA3.
+		 *    In this case, just continue from VMA3.
+		 *
+		 * 2) VMA2 still exists:
+		 *
+		 *    find_vma(mm, 16k - 1) will return VMA2.
+		 *    Iterate the loop like the original one.
+		 *
+		 * 3) No more VMAs can be found:
+		 *
+		 *    find_vma(mm, 16k - 1) will return NULL.
+		 *    No more things to do, just break.
+		 *
+		 * 4) (last_vma_end - 1) is the middle of a vma (VMA'):
+		 *
+		 *    find_vma(mm, 16k - 1) will return VMA' whose range
+		 *    contains last_vma_end.
+		 *    Iterate VMA' from last_vma_end.
+		 */
+		next = find_vma(mm, last_vma_end - 1);
+
+		/* Case 3 above */
+		if (!next)
+			goto out;
+
+		/* Case 1 above */
+		if (next->vm_start >= last_vma_end)
+			goto out;
+
+		/* Case 4 above */
+		if (next->vm_end > last_vma_end) {
+			*ppos = last_vma_end;
+			return next;
+		}
+
+		/* Case 2 above */
+		if (next->vm_next)
+			next = next->vm_next;
+		else
+			next = priv->tail_vma;
+	}
+out:
+	*ppos = next ? next->vm_start : -1UL;
+	return next;
+}
+
 static void m_stop(struct seq_file *m, void *v)
 {
 	struct proc_maps_private *priv = m->private;
@@ -2260,6 +2356,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 	struct mempolicy *pol;
 	char buffer[64];
 	int nid;
+	unsigned long start_addr;
 
 	if (!mm)
 		return 0;
@@ -2267,7 +2364,15 @@ static int show_numa_map(struct seq_file *m, void *v)
 	/* Ensure we start with an empty set of numa_maps statistics. */
 	memset(md, 0, sizeof(*md));
 
-	pol = __get_vma_policy(vma, vma->vm_start);
+	start_addr = max_t(unsigned long, vma->vm_start, m->index);
+	if (start_addr >= vma->vm_end) {
+		VM_WARN_ONCE(1,
+			"invalid range: start_addr=%p vma=[%p-%p]\n",
+			(void *)start_addr, (void *)vma->vm_start, (void *)vma->vm_end);
+		return 0;
+	}
+
+	pol = __get_vma_policy(vma, start_addr);
 	if (pol) {
 		mpol_to_str(buffer, sizeof(buffer), pol);
 		mpol_cond_put(pol);
@@ -2275,7 +2380,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy);
 	}
 
-	seq_printf(m, "%08lx %s", vma->vm_start, buffer);
+	seq_printf(m, "%08lx %s", start_addr, buffer);
 
 	if (file) {
 		seq_puts(m, " file=");
@@ -2290,7 +2395,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 		seq_puts(m, " huge");
 
 	/* mmap_lock is held by m_start */
-	walk_page_vma(vma, &show_numa_ops, md);
+	walk_page_vma_range(vma, start_addr, &show_numa_ops, md);
 
 	if (!md->pages)
 		goto out;
@@ -2328,7 +2433,7 @@ static int show_numa_map(struct seq_file *m, void *v)
 
 static const struct seq_operations proc_pid_numa_maps_op = {
 	.start  = m_start,
-	.next   = m_next,
+	.next   = numa_maps_next,
 	.stop   = m_stop,
 	.show   = show_numa_map,
 };
diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h
index b1cb6b753abb..9b8e2586cfc2 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -101,6 +101,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
 			  void *private);
 int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 		void *private);
+int walk_page_vma_range(struct vm_area_struct *vma, unsigned long start_addr,
+		const struct mm_walk_ops *ops, void *private);
 int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		      pgoff_t nr, const struct mm_walk_ops *ops,
 		      void *private);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 6a4212cf00ba..12032258d1ec 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -485,6 +485,33 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 }
 
+int walk_page_vma_range(struct vm_area_struct *vma, unsigned long start_addr,
+		const struct mm_walk_ops *ops, void *private)
+{
+	struct mm_walk walk = {
+		.ops		= ops,
+		.mm		= vma->vm_mm,
+		.vma		= vma,
+		.private	= private,
+	};
+	int err;
+
+	if (!walk.mm)
+		return -EINVAL;
+
+	if (start_addr < vma->vm_start || start_addr >= vma->vm_end)
+		return -EINVAL;
+
+	mmap_assert_locked(walk.mm);
+
+	err = walk_page_test(start_addr, vma->vm_end, &walk);
+	if (err > 0)
+		return 0;
+	if (err < 0)
+		return err;
+	return __walk_page_range(start_addr, vma->vm_end, &walk);
+}
+
 /**
  * walk_page_mapping - walk all memory areas mapped into a struct address_space.
  * @mapping:	Pointer to the struct address_space
--
2.33.0
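A quick way to exercise the new drop/reacquire path is to read numa_maps
while a second thread churns VMAs: mmap()/munmap() take mmap_lock for
write, so mmap_lock_is_contended() fires in numa_maps_next(). The sketch
below is illustrative test code only, not part of the patch; the file
name and iteration counts are arbitrary, and it assumes nothing beyond
POSIX mmap and pthreads.

/*
 * Illustrative stress test (hypothetical, not from the patch):
 * read /proc/self/numa_maps while another thread churns VMAs so
 * that mmap_lock is contended during the seq_file traversal.
 * Build with: gcc -pthread -o numa_maps_stress numa_maps_stress.c
 */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

/* Writer side: mmap()/munmap() acquire mmap_lock for write. */
static void *mmap_churn(void *arg)
{
	(void)arg;
	for (;;) {
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p != MAP_FAILED)
			munmap(p, 4096);
	}
	return NULL;
}

int main(void)
{
	pthread_t tid;
	char buf[4096];
	int i;

	if (pthread_create(&tid, NULL, mmap_churn, NULL))
		return 1;

	/* Reader side: each full read() pass traverses the vma list. */
	for (i = 0; i < 1000; i++) {
		int fd = open("/proc/self/numa_maps", O_RDONLY);

		if (fd < 0)
			return 1;
		while (read(fd, buf, sizeof(buf)) > 0)
			;
		close(fd);
	}
	printf("done\n");
	return 0;
}

With the old code, the reader holds mmap_lock from m_start() to m_stop()
on every pass; with this patch, the writer thread should observe shorter
lock hold times, while the reader may see a VMA boundary shift between
iterations (the documented non-atomicity).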
FeedBack:
The patch(es) which you have sent to kernel@openeuler.org mailing list
has been converted to a pull request successfully!
Pull request link:
https://gitee.com/openeuler/kernel/pulls/19062
Mailing list address:
https://mailweb.openeuler.org/archives/list/kernel@openeuler.org/message/VYX...