From: "Uladzislau Rezki (Sony)" urezki@gmail.com
mainline inclusion
from mainline-6.9-rc1
commit d093602919ad5908532142a048539800fa94a0d1
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I9CHG1
CVE: NA
Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?i...
-------------------------------------------------
Store allocated objects in separate nodes. A va->va_start address is converted into the correct node, where the VA should be placed and reside. The addr_to_node() function is used to do this address conversion and to determine the node that contains a VA.
Such an approach balances VAs across nodes; as a result, access becomes scalable. The number of nodes in a system depends on the number of CPUs.
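
For illustration, below is a minimal, self-contained userspace sketch of that conversion. The two helpers mirror addr_to_node_id() and addr_to_node() from the diff; NR_NODES, ZONE_SIZE and the main() driver are made-up demo values, not part of the patch:

	/*
	 * Illustrative only: userspace sketch of the address-to-node
	 * conversion. NR_NODES and ZONE_SIZE are made-up demo values;
	 * in the kernel they derive from the number of CPUs.
	 */
	#include <stdio.h>

	#define NR_NODES  4U
	#define ZONE_SIZE (1UL << 20)

	struct vmap_node {
		int placeholder;	/* the real node carries rb-tree/list/lock */
	};

	static struct vmap_node nodes[NR_NODES];

	/* Mirrors addr_to_node_id() from the diff. */
	static inline unsigned int addr_to_node_id(unsigned long addr)
	{
		return (addr / ZONE_SIZE) % NR_NODES;
	}

	/* Mirrors addr_to_node() from the diff. */
	static inline struct vmap_node *addr_to_node(unsigned long addr)
	{
		return &nodes[addr_to_node_id(addr)];
	}

	int main(void)
	{
		unsigned long addr;

		/* Consecutive zones are spread round-robin over the nodes. */
		for (addr = 0; addr < 8 * ZONE_SIZE; addr += ZONE_SIZE)
			printf("addr %#010lx -> node %ld\n",
			       addr, (long)(addr_to_node(addr) - nodes));

		return 0;
	}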
Please note:
1. As of now, allocated VAs are bound to node-0. It means the patch does not make any difference compared with the current behavior;
2. The global vmap_area_lock and vmap_area_root are removed, as there is no need for them anymore. The vmap_area_list is still kept and is _empty_. It is exported for kexec only;
3. The vmallocinfo and vread() have to be reworked to be able to handle multiple nodes.
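
A note on lookups: a VA that crosses a zone boundary is stored only in the node derived from its va_start, so a lookup whose address falls into the next zone must fall back to scanning the remaining nodes, wrapping around, as the new find_vmap_area() in the diff does. A minimal sketch of that corner case follows; lookup_in_node() is a hypothetical stand-in for __find_vmap_area() under the node's busy.lock, and the sizes are demo values:

	/*
	 * Illustrative only: the wrap-around lookup used when a VA spans
	 * a zone boundary. One VA per node is enough for the demo; the
	 * kernel keeps a full rb-tree and a lock per node.
	 */
	#include <stdio.h>

	#define NR_NODES  4U
	#define ZONE_SIZE (1UL << 20)

	struct vmap_area {
		unsigned long va_start, va_end;
	};

	/* The VA below is stored in node 1 but spans zones 1 and 2. */
	static struct vmap_area busy[NR_NODES] = {
		[1] = { .va_start = 1UL << 20, .va_end = 3UL << 20 },
	};

	static unsigned int addr_to_node_id(unsigned long addr)
	{
		return (addr / ZONE_SIZE) % NR_NODES;
	}

	/* Hypothetical stand-in for __find_vmap_area() under busy.lock. */
	static struct vmap_area *lookup_in_node(unsigned int n, unsigned long addr)
	{
		if (busy[n].va_end && addr >= busy[n].va_start && addr < busy[n].va_end)
			return &busy[n];

		return NULL;
	}

	static struct vmap_area *find_va(unsigned long addr)
	{
		unsigned int i, j;
		struct vmap_area *va;

		/* Start at the node the address maps to ... */
		i = j = addr_to_node_id(addr);
		do {
			va = lookup_in_node(i, addr);
			if (va)
				return va;

			/* ... and scan the other nodes, wrapping around. */
		} while ((i = (i + 1) % NR_NODES) != j);

		return NULL;
	}

	int main(void)
	{
		/* 2.5M maps to node 2, yet the VA lives in node 1. */
		unsigned long addr = (2UL << 20) + (1UL << 19);
		struct vmap_area *va = find_va(addr);

		if (va)
			printf("addr %#lx found in node %ld\n",
			       addr, (long)(va - busy));

		return 0;
	}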
[urezki@gmail.com: mark vmap_init_free_space() with __init tag]
  Link: https://lkml.kernel.org/r/20240111132628.299644-1-urezki@gmail.com
[urezki@gmail.com: fix a wrong value passed to __find_vmap_area()]
  Link: https://lkml.kernel.org/r/20240111121104.180993-1-urezki@gmail.com
Link: https://lkml.kernel.org/r/20240102184633.748113-5-urezki@gmail.com
Signed-off-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
Reviewed-by: Baoquan He <bhe@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
Reviewed-by: Anshuman Khandual <anshuman.khandual@arm.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
Cc: Kazuhito Hagio <k-hagio-ab@nec.com>
Cc: Liam R. Howlett <Liam.Howlett@oracle.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Oleksiy Avramchenko <oleksiy.avramchenko@sony.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
(cherry picked from commit d093602919ad5908532142a048539800fa94a0d1)
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Signed-off-by: ZhangPeng <zhangpeng362@huawei.com>
---
 mm/vmalloc.c | 242 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 174 insertions(+), 68 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 59cd3f855df7..5856db42e1c4 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -800,11 +800,9 @@ EXPORT_SYMBOL(vmalloc_to_pfn);
 #define DEBUG_AUGMENT_PROPAGATE_CHECK 0
 #define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
 
-static DEFINE_SPINLOCK(vmap_area_lock);
 static DEFINE_SPINLOCK(free_vmap_area_lock);
 /* Export for kexec only */
 LIST_HEAD(vmap_area_list);
-static struct rb_root vmap_area_root = RB_ROOT;
 static bool vmap_initialized __read_mostly;
 
 static struct rb_root purge_vmap_area_root = RB_ROOT;
@@ -844,6 +842,38 @@ static struct rb_root free_vmap_area_root = RB_ROOT;
  */
 static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
 
+/*
+ * An effective vmap-node logic. Users make use of nodes instead
+ * of a global heap. It allows to balance an access and mitigate
+ * contention.
+ */
+struct rb_list {
+	struct rb_root root;
+	struct list_head head;
+	spinlock_t lock;
+};
+
+static struct vmap_node {
+	/* Bookkeeping data of this node. */
+	struct rb_list busy;
+} single;
+
+static struct vmap_node *vmap_nodes = &single;
+static __read_mostly unsigned int nr_vmap_nodes = 1;
+static __read_mostly unsigned int vmap_zone_size = 1;
+
+static inline unsigned int
+addr_to_node_id(unsigned long addr)
+{
+	return (addr / vmap_zone_size) % nr_vmap_nodes;
+}
+
+static inline struct vmap_node *
+addr_to_node(unsigned long addr)
+{
+	return &vmap_nodes[addr_to_node_id(addr)];
+}
+
 static __always_inline unsigned long
 va_size(struct vmap_area *va)
 {
@@ -875,10 +905,11 @@ unsigned long vmalloc_nr_pages(void)
 }
 
 /* Look up the first VA which satisfies addr < va_end, NULL if none. */
-static struct vmap_area *find_vmap_area_exceed_addr(unsigned long addr)
+static struct vmap_area *
+find_vmap_area_exceed_addr(unsigned long addr, struct rb_root *root)
 {
 	struct vmap_area *va = NULL;
-	struct rb_node *n = vmap_area_root.rb_node;
+	struct rb_node *n = root->rb_node;
 
 	addr = (unsigned long)kasan_reset_tag((void *)addr);
 
@@ -1624,12 +1655,14 @@ __alloc_vmap_area(struct rb_root *root, struct list_head *head,
  */
 static void free_vmap_area(struct vmap_area *va)
 {
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
 	/*
 	 * Remove from the busy tree/list.
 	 */
-	spin_lock(&vmap_area_lock);
-	unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
+	unlink_va(va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	/*
 	 * Insert/Merge it back to the free tree/list.
@@ -1672,6 +1705,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 				int node, gfp_t gfp_mask,
 				unsigned long va_flags)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	unsigned long freed;
 	unsigned long addr;
@@ -1717,9 +1751,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
 	va->vm = NULL;
 	va->flags = va_flags;
 
-	spin_lock(&vmap_area_lock);
-	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
+	insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
+	spin_unlock(&vn->busy.lock);
 
 	BUG_ON(!IS_ALIGNED(va->va_start, align));
 	BUG_ON(va->va_start < vstart);
@@ -1943,26 +1979,61 @@ static void free_unmap_vmap_area(struct vmap_area *va)
 
 struct vmap_area *find_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	/*
+	 * An addr_to_node_id(addr) converts an address to a node index
+	 * where a VA is located. If VA spans several zones and passed
+	 * addr is not the same as va->va_start, what is not common, we
+	 * may need to scan an extra nodes. See an example:
+	 *
+	 *      <--va-->
+	 * -|-----|-----|-----|-----|-
+	 *     1     2     0     1
+	 *
+	 * VA resides in node 1 whereas it spans 1 and 2. If passed
+	 * addr is within a second node we should do extra work. We
+	 * should mention that it is rare and is a corner case from
+	 * the other hand it has to be covered.
+	 */
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &vmap_nodes[i];
 
-	return va;
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
+
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+	return NULL;
 }
 
 static struct vmap_area *find_unlink_vmap_area(unsigned long addr)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
+	int i, j;
 
-	spin_lock(&vmap_area_lock);
-	va = __find_vmap_area(addr, &vmap_area_root);
-	if (va)
-		unlink_va(va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	i = j = addr_to_node_id(addr);
+	do {
+		vn = &vmap_nodes[i];
 
-	return va;
+		spin_lock(&vn->busy.lock);
+		va = __find_vmap_area(addr, &vn->busy.root);
+		if (va)
+			unlink_va(va, &vn->busy.root);
+		spin_unlock(&vn->busy.lock);
+
+		if (va)
+			return va;
+	} while ((i = (i + 1) % nr_vmap_nodes) != j);
+
+	return NULL;
 }
 
 /*** Per cpu kva allocator ***/
@@ -2164,6 +2235,7 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 
 static void free_vmap_block(struct vmap_block *vb)
 {
+	struct vmap_node *vn;
 	struct vmap_block *tmp;
 	struct xarray *xa;
 
@@ -2171,9 +2243,10 @@ static void free_vmap_block(struct vmap_block *vb)
 	tmp = xa_erase(xa, addr_to_vb_idx(vb->va->va_start));
 	BUG_ON(tmp != vb);
 
-	spin_lock(&vmap_area_lock);
-	unlink_va(vb->va, &vmap_area_root);
-	spin_unlock(&vmap_area_lock);
+	vn = addr_to_node(vb->va->va_start);
+	spin_lock(&vn->busy.lock);
+	unlink_va(vb->va, &vn->busy.root);
+	spin_unlock(&vn->busy.lock);
 
 	free_vmap_area_noflush(vb->va);
 	kfree_rcu(vb, rcu_head);
@@ -2597,9 +2670,11 @@ static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
 static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
 			      unsigned long flags, const void *caller)
 {
-	spin_lock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(va->va_start);
+
+	spin_lock(&vn->busy.lock);
 	setup_vmalloc_vm_locked(vm, va, flags, caller);
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 }
 
 static void clear_vm_uninitialized_flag(struct vm_struct *vm)
@@ -3792,6 +3867,7 @@ static size_t vmap_ram_vread_iter(struct iov_iter *iter, const char *addr,
  */
 long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *vm;
 	char *vaddr;
@@ -3805,8 +3881,11 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 
 	remains = count;
 
-	spin_lock(&vmap_area_lock);
-	va = find_vmap_area_exceed_addr((unsigned long)addr);
+	/* Hooked to node_0 so far. */
+	vn = addr_to_node(0);
+	spin_lock(&vn->busy.lock);
+
+	va = find_vmap_area_exceed_addr((unsigned long)addr, &vn->busy.root);
 	if (!va)
 		goto finished_zero;
 
@@ -3814,7 +3893,7 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	if ((unsigned long)addr + remains <= va->va_start)
 		goto finished_zero;
 
-	list_for_each_entry_from(va, &vmap_area_list, list) {
+	list_for_each_entry_from(va, &vn->busy.head, list) {
 		size_t copied;
 
 		if (remains == 0)
@@ -3873,12 +3952,12 @@ long vread_iter(struct iov_iter *iter, const char *addr, size_t count)
 	}
 
 finished_zero:
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 	/* zero-fill memory holes */
 	return count - remains + zero_iter(iter, remains);
 finished:
 	/* Nothing remains, or We couldn't copy/zero everything. */
-	spin_unlock(&vmap_area_lock);
+	spin_unlock(&vn->busy.lock);
 
 	return count - remains;
 }
@@ -4212,14 +4291,15 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
 	}
 
 	/* insert all vm's */
-	spin_lock(&vmap_area_lock);
 	for (area = 0; area < nr_vms; area++) {
-		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);
+		struct vmap_node *vn = addr_to_node(vas[area]->va_start);
 
+		spin_lock(&vn->busy.lock);
+		insert_vmap_area(vas[area], &vn->busy.root, &vn->busy.head);
 		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
 				 pcpu_get_vm_areas);
+		spin_unlock(&vn->busy.lock);
 	}
-	spin_unlock(&vmap_area_lock);
 
 	/*
 	 * Mark allocated areas as accessible. Do it now as a best-effort
@@ -4330,31 +4410,32 @@ bool vmalloc_dump_obj(void *object)
 {
 	void *objp = (void *)PAGE_ALIGN((unsigned long)object);
 	const void *caller;
-	struct vm_struct *vm;
 	struct vmap_area *va;
+	struct vmap_node *vn;
 	unsigned long addr;
 	unsigned int nr_pages;
+	bool success = false;
 
-	if (!spin_trylock(&vmap_area_lock))
-		return false;
-	va = __find_vmap_area((unsigned long)objp, &vmap_area_root);
-	if (!va) {
-		spin_unlock(&vmap_area_lock);
-		return false;
-	}
+	vn = addr_to_node((unsigned long)objp);
 
-	vm = va->vm;
-	if (!vm) {
-		spin_unlock(&vmap_area_lock);
-		return false;
+	if (spin_trylock(&vn->busy.lock)) {
+		va = __find_vmap_area((unsigned long)objp, &vn->busy.root);
+
+		if (va && va->vm) {
+			addr = (unsigned long)va->vm->addr;
+			caller = va->vm->caller;
+			nr_pages = va->vm->nr_pages;
+			success = true;
+		}
+
+		spin_unlock(&vn->busy.lock);
 	}
-	addr = (unsigned long)vm->addr;
-	caller = vm->caller;
-	nr_pages = vm->nr_pages;
-	spin_unlock(&vmap_area_lock);
-	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
-		nr_pages, addr, caller);
-	return true;
+
+	if (success)
+		pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
+			nr_pages, addr, caller);
+
+	return success;
 }
 #endif
 
@@ -4558,25 +4639,26 @@ EXPORT_SYMBOL(remap_vmalloc_hugepage_range);
 
 #ifdef CONFIG_PROC_FS
 static void *s_start(struct seq_file *m, loff_t *pos)
-	__acquires(&vmap_purge_lock)
-	__acquires(&vmap_area_lock)
 {
+	struct vmap_node *vn = addr_to_node(0);
+
 	mutex_lock(&vmap_purge_lock);
-	spin_lock(&vmap_area_lock);
+	spin_lock(&vn->busy.lock);
 
-	return seq_list_start(&vmap_area_list, *pos);
+	return seq_list_start(&vn->busy.head, *pos);
 }
 
 static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
-	return seq_list_next(p, &vmap_area_list, pos);
+	struct vmap_node *vn = addr_to_node(0);
+	return seq_list_next(p, &vn->busy.head, pos);
 }
 
 static void s_stop(struct seq_file *m, void *p)
-	__releases(&vmap_area_lock)
-	__releases(&vmap_purge_lock)
 {
-	spin_unlock(&vmap_area_lock);
+	struct vmap_node *vn = addr_to_node(0);
+
+	spin_unlock(&vn->busy.lock);
	mutex_unlock(&vmap_purge_lock);
 }
 
@@ -4619,9 +4701,11 @@ static void show_purge_info(struct seq_file *m)
 
 static int s_show(struct seq_file *m, void *p)
 {
+	struct vmap_node *vn;
 	struct vmap_area *va;
 	struct vm_struct *v;
 
+	vn = addr_to_node(0);
 	va = list_entry(p, struct vmap_area, list);
 
 	if (!va->vm) {
@@ -4675,7 +4759,7 @@ static int s_show(struct seq_file *m, void *p)
 	 * As a final step, dump "unpurged" areas.
 	 */
 final:
-	if (list_is_last(&va->list, &vmap_area_list))
+	if (list_is_last(&va->list, &vn->busy.head))
 		show_purge_info(m);
 
 	return 0;
@@ -4702,11 +4786,12 @@ module_init(proc_vmalloc_init);
 
 #endif
 
-static void vmap_init_free_space(void)
+static void __init vmap_init_free_space(void)
 {
 	unsigned long vmap_start = 1;
 	const unsigned long vmap_end = ULONG_MAX;
-	struct vmap_area *busy, *free;
+	struct vmap_area *free;
+	struct vm_struct *busy;
 
 	/*
 	 *     B     F     B     B     B     F
@@ -4714,12 +4799,12 @@ static void vmap_init_free_space(void)
 	 *  |           The KVA space           |
 	 *  |<--------------------------------->|
 	 */
-	list_for_each_entry(busy, &vmap_area_list, list) {
-		if (busy->va_start - vmap_start > 0) {
+	for (busy = vmlist; busy; busy = busy->next) {
+		if ((unsigned long) busy->addr - vmap_start > 0) {
 			free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
 			if (!WARN_ON_ONCE(!free)) {
 				free->va_start = vmap_start;
-				free->va_end = busy->va_start;
+				free->va_end = (unsigned long) busy->addr;
 
 				insert_vmap_area_augment(free, NULL,
 					&free_vmap_area_root,
@@ -4727,7 +4812,7 @@
 			}
 		}
 
-		vmap_start = busy->va_end;
+		vmap_start = (unsigned long) busy->addr + busy->size;
 	}
 
 	if (vmap_end - vmap_start > 0) {
@@ -4743,9 +4828,23 @@
 	}
 }
 
+static void vmap_init_nodes(void)
+{
+	struct vmap_node *vn;
+	int i;
+
+	for (i = 0; i < nr_vmap_nodes; i++) {
+		vn = &vmap_nodes[i];
+		vn->busy.root = RB_ROOT;
+		INIT_LIST_HEAD(&vn->busy.head);
+		spin_lock_init(&vn->busy.lock);
+	}
+}
+
 void __init vmalloc_init(void)
 {
 	struct vmap_area *va;
+	struct vmap_node *vn;
 	struct vm_struct *tmp;
 	int i;
 
@@ -4767,6 +4866,11 @@ void __init vmalloc_init(void)
 		xa_init(&vbq->vmap_blocks);
 	}
 
+	/*
+	 * Setup nodes before importing vmlist.
+	 */
+	vmap_init_nodes();
+
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
 		va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
@@ -4776,7 +4880,9 @@ void __init vmalloc_init(void)
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
 		va->vm = tmp;
-		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+		vn = addr_to_node(va->va_start);
+		insert_vmap_area(va, &vn->busy.root, &vn->busy.head);
 	}
 
 	/*