From: Yu Kuai <yukuai3@huawei.com>
hulk inclusion
category: feature
bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR
CVE: NA
--------------------------------------
Implement the NVM space allocator for EulerFS: memory is handed out at
2M large-page, 4K page, 256B (four cache lines) and 64B cache-line
granularity from per-CPU pools backed by a global pool, with optional
wear control through a rest pool, plus wrapper and batch pre-allocation
interfaces (alloc_interface.h, nvmalloc_pre()).
Signed-off-by: Mingkai Dong <dongmingkai1@huawei.com>
Signed-off-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: sunqiuyang <sunqiuyang@huawei.com>
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Reviewed-by: Hou Tao <houtao1@huawei.com>
Signed-off-by: Zheng Zengkai <zhengzengkai@huawei.com>
---
 fs/eulerfs/alloc_interface.h |  113 +++
 fs/eulerfs/nvalloc.c         | 1451 ++++++++++++++++++++++++++++++++++
 fs/eulerfs/nvalloc.h         |  214 +++++
 3 files changed, 1778 insertions(+)
 create mode 100644 fs/eulerfs/alloc_interface.h
 create mode 100644 fs/eulerfs/nvalloc.c
 create mode 100644 fs/eulerfs/nvalloc.h
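
Note (illustration only, not part of the applied diff): later patches are
expected to consume the allocator through the wrappers in
alloc_interface.h, roughly as sketched below. eufs_example_alloc() is a
hypothetical helper invented for this note, and error handling is reduced
to NULL checks.

static int eufs_example_alloc(struct super_block *sb)
{
	/* one 64-byte cache line, tagged EUFS_LINE_DENTRY */
	struct nv_dict_entry *de = eufs_malloc_dentry(sb);
	void *page;

	if (!de)
		return -ENOSPC;

	/* one zeroed 4K page, tagged EUFS_PAGE_FILE_DATA */
	page = eufs_zalloc_file_data(sb);
	if (!page) {
		nv_free(sb, de);
		return -ENOSPC;
	}

	/* ... write and persist data, then publish the two pointers ... */

	/*
	 * Freed memory goes back to the per-CPU pool, or to the global/rest
	 * pools when the per-CPU pool is full or wear control kicks in.
	 */
	nv_free(sb, page);
	nv_free(sb, de);
	return 0;
}
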
diff --git a/fs/eulerfs/alloc_interface.h b/fs/eulerfs/alloc_interface.h new file mode 100644 index 000000000000..22d30c7672e0 --- /dev/null +++ b/fs/eulerfs/alloc_interface.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_ALLOC_INTERFACE_H +#define EUFS_ALLOC_INTERFACE_H + +#include "nvalloc.h" +#include "pbatch.h" + +static __always_inline void *nvzalloc(struct super_block *sb, size_t size, + u8 tag, bool nonblocking) +{ + void *r = nvmalloc(sb, size, tag, nonblocking); + + if (r) + memset(r, 0, size); + + return r; +} + +static __always_inline void * +nv_zalloc_file_data_nonblocking(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, true); +} + +struct eufs_inode; +struct nv_name_ext; +struct nv_dict_entry; + +static __always_inline struct eufs_inode * +eufs_malloc_pinode(struct super_block *sb) +{ + /* mirrored inodes: the head inode and the tail inode */ + return nvmalloc(sb, EUFS_INODE_SIZE * 2, EUFS_LINE4_INODE, false); +} +static __always_inline struct nv_dict_entry * +eufs_malloc_dentry(struct super_block *sb) +{ + return nvmalloc(sb, CACHELINE_SIZE, EUFS_LINE_DENTRY, false); +} +static __always_inline struct nv_name_ext * +eufs_malloc_name_ext(struct super_block *sb) +{ + return nvmalloc(sb, CACHELINE_SIZE, EUFS_LINE_NAME_EXT, false); +} + +static __always_inline void *eufs_malloc_file_data(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, false); +} +static __always_inline void *eufs_zalloc_file_data(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_DATA, false); +} +static __always_inline void *eufs_zmlloc_file_index(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_FILE_INDEX, false); +} +static __always_inline void *eufs_zalloc_symlink(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_SYMLINK, false); +} +static __always_inline void *eufs_zalloc_htable(struct super_block *sb) +{ + return nvzalloc(sb, PAGE_SIZE, EUFS_PAGE_HTABLE, false); +} +static __always_inline void *eufs_malloc_inode_ext(struct super_block *sb) +{ + return nvmalloc(sb, PAGE_SIZE, EUFS_PAGE_INODE_EXT, false); +} + +static __always_inline void nv_zfree(struct super_block *sb, void *p) +{ + if (p == NULL_ADDR_PTR) + return; + + nvfree(sb, p, false); +} + +static __always_inline void nv_free(struct super_block *sb, void *p) +{ + if (p != NULL_ADDR_PTR) + nv_zfree(sb, p); +} + +static __always_inline void nv_free_rest(struct super_block *sb, void *p) +{ + if (p != NULL_ADDR_PTR) + nvfree(sb, p, true); +} + +static __always_inline void *zalloc(ssize_t size) +{ + return kzalloc(size, GFP_KERNEL); +} + +static __always_inline void zfree(void *p) +{ + kfree(p); +} + +#endif /* EUFS_ALLOC_INTERFACE_H */ diff --git a/fs/eulerfs/nvalloc.c b/fs/eulerfs/nvalloc.c new file mode 100644 index 000000000000..8b60a2494636 --- /dev/null +++ b/fs/eulerfs/nvalloc.c @@ -0,0 +1,1451 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 
2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/percpu.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/random.h> +#include <linux/fs.h> +#include <linux/vmalloc.h> +#include <linux/ratelimit.h> +#include "nvalloc.h" +#include "euler.h" + +static __always_inline void print_ptr_list_node(struct ptr_list_node *node) +{ + eufs_info("========> &ptr_list_node = %px <==========\n", node); + eufs_info("= node => .prev=%px .next=%px\n", node->node.prev, + node->node.next); + eufs_info("= ptr =%px\n", node->ptr); + eufs_info("======== reported @cpu=%d =============\n", + smp_processor_id()); +} + +static __always_inline void memclr(void *ptr, size_t len) +{ + memset(ptr, 0, len); +} + +static __always_inline void *eufs_get_page(struct super_block *sb, int page_no) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return sbi->data_start + page_no * PAGE_SIZE; +} + +void eufs_get_layout(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + unsigned long start_addr = (u64)sbi->virt_addr; + ssize_t len = sbi->initsize; + + unsigned long ptr; + ssize_t page_map_size; + + /* only support 4K page now */ + BUG_ON(PAGE_SIZE != 4096); + BUILD_BUG_ON(sizeof(unsigned long) != sizeof(void *)); + + /* align the start to 4K */ + ptr = round_up(start_addr, PAGE_SIZE); + len -= (ptr - start_addr); + + sbi->npages = len / PAGE_SIZE; /* round down */ + sbi->data_start = (void *)((uintptr_t) ptr); + + /* skip the first 4K, reserved for super blocks */ + ptr += PAGE_SIZE; + len -= PAGE_SIZE; + + /* get page-map */ + if (init) + sbi->page_map = (page_info_t *)ptr; + page_map_size = round_up(sbi->npages * sizeof(page_info_t), PAGE_SIZE); + + ptr += page_map_size; + len -= page_map_size; + + /* skip for renamej */ + sbi->renamej = (void *)ptr; + ptr += EUFS_RENAMEJ_SIZE; + len -= EUFS_RENAMEJ_SIZE; + if (init) { + /* clear the pagemap */ + memclr(sbi->page_map, page_map_size); + memclr(sbi->renamej, EUFS_RENAMEJ_SIZE); + eufs_flush_buffer(sbi->renamej, EUFS_RENAMEJ_SIZE, true); + } +} + +static void partition_page(struct eufs_sb_info *sbi, int page_no, + line_info_t *gens, int *line4_cpu, + int *line4_countdown) +{ + struct ptr_list_node *node; + int i = page_no; + int j; + + /* no cache line is in global pool */ + sbi->line_indicators[i] = 0; + for (j = 1; j < 64; ++j) { + node = &sbi->line_node_ptrs[i][j]; + node->ptr = ((void *)gens) + CACHELINE_SIZE * j; + if (gens[j] == EUFS_LINE_DENTRY || + gens[j] == EUFS_LINE_NAME_EXT) { + /* line used */ + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = gens[j]; + continue; + } + if (gens[j] == EUFS_LINE4_INODE) { + int k; + /* linex4 used */ + node->busy = true; + node->solid = true; + node->multiple = true; + node->tag = gens[j]; + for (k = 1; k < 4; ++k) { + sbi->line_node_ptrs[i][j + k].ptr = + ((void *)gens) + + CACHELINE_SIZE * (j + k); + sbi->line_node_ptrs[i][j + 
k].busy = false; + sbi->line_node_ptrs[i][j + k].solid = false; + sbi->line_node_ptrs[i][j + k].multiple = false; + } + j += 3; + continue; + } + /* EUFS_LINE_FREE */ + if ((j & 3) == 0 && + /* probe */ + (gens[j + 1] == EUFS_LINE_FREE && + gens[j + 2] == EUFS_LINE_FREE && + gens[j + 3] == EUFS_LINE_FREE)) { + struct mem_pool *line4_ppool; + int k; + + node->busy = false; + node->solid = false; + node->multiple = true; + for (k = 1; k < 4; ++k) { + sbi->line_node_ptrs[i][j + k].ptr = + ((void *)gens) + + CACHELINE_SIZE * (j + k); + sbi->line_node_ptrs[i][j + k].busy = false; + sbi->line_node_ptrs[i][j + k].solid = false; + sbi->line_node_ptrs[i][j + k].multiple = false; + } + if (*line4_countdown == 0) { + /* switch to next cpu */ + *line4_cpu = cpumask_next(*line4_cpu, + cpu_possible_mask); + if (*line4_cpu >= nr_cpu_ids) + *line4_cpu = cpumask_next( + -1, cpu_possible_mask); + *line4_countdown = EUFS_PRE_PAGES_PERCPU; + } + line4_ppool = per_cpu_ptr(sbi->ppool, *line4_cpu); + list_add(&node->node, &line4_ppool->line4_list); + line4_ppool->nline4s++; + (*line4_countdown)--; + j += 3; + continue; + } + node->busy = false; + node->solid = false; + node->multiple = false; + ++sbi->line_indicators[i]; + list_add(&node->node, &sbi->gpool->line_list); + sbi->gpool->nlines++; + } +} + +static bool probe_large_page(struct eufs_sb_info *sbi, long page_no) +{ + long i = page_no; + int k; + + for (k = 1; k < 512; ++k) { + if (sbi->page_map[i + k] != EUFS_PAGE_FREE) + return false; + } + return true; +} + +/* Partition the area into multiple zones */ +static void partition(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 start_addr = (u64)sbi->virt_addr; + u64 len = sbi->initsize; + u64 npages_percpu; + u64 cpu_page_left; + u64 start_page; + int cpu; + int i; + int k; + struct mem_pool *pool; + struct ptr_list_node *node; + ssize_t page_map_size; + int line4_cpu; + int line4_countdown; + + /* + * The status of 64 cache-lines in a pmem page are tracked by + * 64 ptr_list_node in volatile page, so check whether or not + * the size of ptr_list_node is too large. 
+ */ + BUILD_BUG_ON(64 * sizeof(struct ptr_list_node) > PAGE_SIZE); + + eufs_get_layout(sb, init); + page_map_size = round_up(sbi->npages * sizeof(page_info_t), PAGE_SIZE); + + /* allocate space for volatile allocator */ + sbi->cached_nodes = vmalloc(sizeof(struct ptr_list_node) * sbi->npages); + memclr(sbi->cached_nodes, sizeof(struct ptr_list_node) * sbi->npages); + + /* pointers reserved for cache line nodes for a page (64 lines) */ + sbi->line_node_ptrs = + vmalloc(sizeof(struct ptr_list_node *) * sbi->npages); + memclr(sbi->line_node_ptrs, + sizeof(struct ptr_list_node *) * sbi->npages); + + sbi->line_indicators = + vmalloc(sizeof(*sbi->line_indicators) * sbi->npages); + memclr(sbi->line_indicators, + sizeof(*sbi->line_indicators) * sbi->npages); + + i = 0; + if (init) { + unsigned int reserved_pages; + + eufs_info("start: %llx, len=%llu\n", start_addr, len); + + /* +1 for super block */ + reserved_pages = + 1 + page_map_size / PAGE_SIZE + + round_up(EUFS_RENAMEJ_SIZE, PAGE_SIZE) / PAGE_SIZE; + while (reserved_pages-- > 0) + sbi->page_map[i++] = EUFS_PAGE_RESERVED; + + eufs_flush_buffer(sbi->page_map, page_map_size, true); + } + + npages_percpu = EUFS_PRE_PAGES_PERCPU; + + cpu = -1; + cpu_page_left = 0; + start_page = 0; + + /* init spinlock for gpool */ + spin_lock_init(&sbi->large_lock); + spin_lock_init(&sbi->page_lock); + spin_lock_init(&sbi->line_lock); + spin_lock_init(&sbi->rest_lock); + + sbi->gpool->nlarges = 0; + sbi->gpool->npages = 0; + sbi->gpool->nlines = 0; + + line4_cpu = cpumask_next(-1, cpu_possible_mask); + line4_countdown = npages_percpu; + + for (; i < sbi->npages; ++i) { + if (cpu_page_left == 0) { + eufs_info( + "%s for cpu=%d, page=[%llu~%llu) [%px~%px)\n", + __func__, cpu, (u64)start_page, (u64)i, + eufs_get_page(sb, start_page), + eufs_get_page(sb, i)); + if (cpu < (int)nr_cpu_ids) + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) { + pool = sbi->gpool; + cpu_page_left = + sbi->npages; /* never exhausted */ + } else { + pool = per_cpu_ptr(sbi->ppool, cpu); + cpu_page_left = npages_percpu; + } + start_page = i; + } + node = sbi->cached_nodes + (i); + node->ptr = eufs_get_page(sb, i); + switch (sbi->page_map[i]) { + case EUFS_PAGE_LARGE_USED: + node->busy = true; + node->solid = true; + node->multiple = true; + node->tag = sbi->page_map[i]; + WARN(((u64)node->ptr) & ((2 << 20) - 1), + "EulerFS unalinged large page!"); + for (k = 1; k < 512; ++k) { + sbi->cached_nodes[i + k].ptr = + eufs_get_page(sb, i + k); + sbi->cached_nodes[i + k].busy = false; + sbi->cached_nodes[i + k].solid = false; + sbi->cached_nodes[i + k].multiple = false; + } + i += 511; + continue; + /* case EUFS_PAGE_USED: */ + case EUFS_PAGE_RESERVED: + case EUFS_PAGE_FILE_DATA: + case EUFS_PAGE_FILE_INDEX: + case EUFS_PAGE_HTABLE: + case EUFS_PAGE_SYMLINK: + case EUFS_PAGE_INODE_EXT: + BUG_ON(init); + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = sbi->page_map[i]; + /* page used */ + continue; + case EUFS_PAGE_LINE_USED: + BUG_ON(init); + /* page used as cache lines */ + node->busy = true; + node->solid = true; + node->multiple = false; + node->tag = sbi->page_map[i]; + + /* TODO: add cache lines */ + BUG_ON(sbi->line_node_ptrs[i]); + sbi->line_node_ptrs[i] = eufs_zalloc_page(); + + partition_page(sbi, i, node->ptr, &line4_cpu, + &line4_countdown); + + break; + case EUFS_PAGE_FREE: + /* allocate and fill the node */ + node->busy = false; + node->solid = false; + + if ((((u64)node->ptr) & ((2 << 20) - 1)) == 0 && + probe_large_page(sbi, i)) { 
+ /* insert as large page */ + node->multiple = true; + + list_add(&node->node, &pool->large_list); + pool->nlarges++; + + cpu_page_left--; + + for (k = 1; k < 512; ++k) { + sbi->cached_nodes[i + k].ptr = + eufs_get_page(sb, i + k); + sbi->cached_nodes[i + k].busy = false; + sbi->cached_nodes[i + k].solid = false; + sbi->cached_nodes[i + k].multiple = + false; + } + i += 511; + } else { + /* insert to ppool */ + node->multiple = false; + list_add(&node->node, &pool->page_list); + pool->npages++; + + cpu_page_left--; + } + break; + default: + eufs_warn( + "Invalid value 0x%x in pagemap[%d] is detected!\n", + sbi->page_map[i], i); + continue; + } + } + if (cpu < nr_cpu_ids) + eufs_info("%s for cpu=%d, page=[%llu~%llu) [%px~%px)\n", + __func__, cpu, (u64)start_page, (u64)i, + eufs_get_page(sb, start_page), eufs_get_page(sb, i)); + else + eufs_info("%s for global pool, page=[%llu~%llu)\n", + __func__, start_page, (u64)i); +} + +static void return_page(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + unsigned long flags; + u64 page_num = (node->ptr - sbi->data_start) / PAGE_SIZE; + + sbi->page_map[page_num] = EUFS_PAGE_FREE; + eufs_flush_cacheline(&sbi->page_map[page_num]); + eufs_pbarrier(); + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + spin_lock_irqsave(&sbi->rest_lock, flags); + list_add(&node->node, &sbi->rest_pool->page_list); + sbi->rest_pool->npages++; + spin_unlock_irqrestore(&sbi->rest_lock, flags); + } else if (ppool->npages >= LOCAL_PAGE_MAX) { + spin_lock_irqsave(&sbi->page_lock, flags); + list_add(&node->node, &sbi->gpool->page_list); + sbi->gpool->npages++; + spin_unlock_irqrestore(&sbi->page_lock, flags); + } else { + local_irq_save(flags); + + list_add(&node->node, &ppool->page_list); + ppool->npages++; + + local_irq_restore(flags); + } +} + +static void _unset_bitmap(struct eufs_sb_info *sbi, u64 addr, bool flush); +static void return_cl(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + unsigned long flags, flags2; + u64 page_no; + u64 page_off; + int i; + struct ptr_list_node *tmp; + + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + spin_lock_irqsave(&sbi->rest_lock, flags); + list_add(&node->node, &sbi->rest_pool->line_list); + sbi->rest_pool->nlines++; + spin_unlock_irqrestore(&sbi->rest_lock, flags); + } else if (ppool->nlines >= LOCAL_LINE_MAX) { + page_off = (node->ptr - sbi->data_start); + page_no = page_off / PAGE_SIZE; + page_off = page_off % PAGE_SIZE; + + spin_lock_irqsave(&sbi->line_lock, flags2); + /* line_indicators are protected by sbi->line_lock */ + if (++sbi->line_indicators[page_no] == 63) { + /* Remove all cache lines */ + for (i = 1; i < 64; ++i) { + tmp = &sbi->line_node_ptrs[page_no][i]; + if (tmp == node) + continue; + list_del(&tmp->node); + /* It must be !solid since we ensure it during nvfree */ + BUG_ON(tmp->solid); + --sbi->gpool->nlines; + } + spin_unlock_irqrestore(&sbi->line_lock, flags2); + eufs_dbg("! 
cacheline coalescence !\n"); + + /* Add back a whole page */ + tmp = &sbi->cached_nodes[page_no]; + BUG_ON(!tmp->solid); + _unset_bitmap(sbi, (u64)tmp->ptr, true); + _SET_NON_BUSY(tmp, "fault addr %px", tmp->ptr); + + spin_lock_irqsave(&sbi->page_lock, flags); + list_add(&tmp->node, &sbi->gpool->page_list); + sbi->gpool->npages++; + sbi->page_map[page_no] = EUFS_PAGE_FREE; + + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return; + } + + list_add(&node->node, &sbi->gpool->line_list); + sbi->gpool->nlines++; + spin_unlock_irqrestore(&sbi->line_lock, flags2); + + } else { + list_add(&node->node, &ppool->line_list); + ppool->nlines++; + } +} + +static void return_line4(struct eufs_sb_info *sbi, struct mem_pool *ppool, + struct ptr_list_node *node, bool rest) +{ + if (wear_control && + (node->counter++ % wear_alloc_threshold == 0 || rest)) { + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + + } else if (ppool->nlines >= LOCAL_LINE_MAX) { + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + node->multiple = false; + return_cl(sbi, ppool, node++, rest); + + } else { + list_add(&node->node, &ppool->line4_list); + ppool->nline4s++; + } +} + +void nv_fini(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + int i; + + vfree(sbi->cached_nodes); + for (i = 0; i < sbi->npages; ++i) + if (sbi->line_node_ptrs[i]) + eufs_free_page(sbi->line_node_ptrs[i]); + vfree(sbi->line_node_ptrs); + vfree(sbi->line_indicators); + + free_percpu(sbi->ppool); + kfree(sbi->rest_pool); + kfree(sbi->gpool); +} + +void nv_init(struct super_block *sb, bool init) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + int cpu; + + /* allocate pools */ + sbi->gpool = kmalloc(sizeof(struct mem_pool), GFP_KERNEL); + INIT_LIST_HEAD(&sbi->gpool->large_list); + INIT_LIST_HEAD(&sbi->gpool->page_list); + INIT_LIST_HEAD(&sbi->gpool->line4_list); + INIT_LIST_HEAD(&sbi->gpool->line_list); + sbi->gpool->nlarges = 0; + sbi->gpool->npages = 0; + sbi->gpool->nline4s = 0; + sbi->gpool->nlines = 0; + + sbi->rest_pool = kmalloc(sizeof(struct mem_pool), GFP_KERNEL); + INIT_LIST_HEAD(&sbi->rest_pool->large_list); + INIT_LIST_HEAD(&sbi->rest_pool->page_list); + INIT_LIST_HEAD(&sbi->rest_pool->line4_list); + INIT_LIST_HEAD(&sbi->rest_pool->line_list); + + sbi->rest_pool->nlarges = 0; + sbi->rest_pool->npages = 0; + sbi->rest_pool->nline4s = 0; + sbi->rest_pool->nlines = 0; + + sbi->ppool = alloc_percpu(struct mem_pool); + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + INIT_LIST_HEAD(&ppool->large_list); + INIT_LIST_HEAD(&ppool->page_list); + INIT_LIST_HEAD(&ppool->line4_list); + INIT_LIST_HEAD(&ppool->line_list); + ppool->nlarges = 0; + ppool->npages = 0; + ppool->nline4s = 0; + ppool->nlines = 0; + ppool->fetch_count = FETCH_COUNT; + } + + partition(sb, init); +} + +static int cut_from_list_remaining(struct list_head *head, int remaining, + struct list_head *tmp) +{ + int i = 0; + struct list_head *end; + struct list_head *sentry; + + if (list_empty(head)) + return 0; + end = head; + sentry = head; + for (i = 0; i < remaining; ++i) { + if (sentry->next == head) + /* too few */ + return 0; + sentry = sentry->next; + } + + for (i = 0; 
sentry->next != head; ++i) { + end = end->next; + sentry = sentry->next; + } + + INIT_LIST_HEAD(tmp); + list_cut_position(tmp, head, end); + return i; +} + +static void give_up_pages(void *info) +{ + struct eufs_sb_info *sbi = info; + unsigned long flags, flags2; + LIST_HEAD(tmp); + struct mem_pool *ppool; + int i = 0; + int cpu; + + cpu = get_cpu(); + local_irq_save(flags2); + /* Need a way to get it back */ + ppool = per_cpu_ptr(sbi->ppool, cpu); + ppool->fetch_count = 10; + + i = cut_from_list_remaining(&ppool->page_list, ppool->fetch_count, + &tmp); + + if (i) { + spin_lock_irqsave(&sbi->page_lock, flags); + list_splice_tail(&tmp, &sbi->gpool->page_list); + sbi->gpool->npages += i; + spin_unlock_irqrestore(&sbi->page_lock, flags); + + ppool->npages -= i; + } + + i = cut_from_list_remaining(&ppool->large_list, 1, &tmp); + if (i) { + spin_lock_irqsave(&sbi->large_lock, flags); + list_splice_tail(&tmp, &sbi->gpool->large_list); + sbi->gpool->nlarges += i; + spin_unlock_irqrestore(&sbi->large_lock, flags); + + ppool->nlarges -= i; + } + + local_irq_restore(flags2); + put_cpu(); +} + +void revive_rest_pool(struct eufs_sb_info *sbi); + +static void gather_pages(struct eufs_sb_info *sbi) +{ + smp_call_func_t func = give_up_pages; + unsigned long flags; + + /* Gather from other CPUs */ + mutex_lock(&sbi->gather_mutex); + + smp_call_function(func, sbi, true); + + mutex_unlock(&sbi->gather_mutex); + + /* Gather from rest pool, if necessary */ + spin_lock_irqsave(&sbi->page_lock, flags); + if (!list_empty(&sbi->gpool->page_list)) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return; + } + spin_unlock_irqrestore(&sbi->page_lock, flags); + + revive_rest_pool(sbi); + /* I've tried the best */ +} + +static bool reload_lines_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool) +{ + struct ptr_list_node *node; + struct list_head *head; + struct list_head *end; + unsigned long flags; + LIST_HEAD(tmp); + int i; + + spin_lock_irqsave(&sbi->line_lock, flags); + head = &sbi->gpool->line_list; + if (list_empty(head)) { + spin_unlock_irqrestore(&sbi->line_lock, flags); + return false; + } + end = head; + + /* head is not a legal node */ + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) { + end = end->next; + node = list_entry(end, struct ptr_list_node, node); + /* move out of global pool */ + --sbi->line_indicators[(node->ptr - sbi->data_start) / + PAGE_SIZE]; + } + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->line_list); + + sbi->gpool->nlines -= i; + ppool->nlines += i; + spin_unlock_irqrestore(&sbi->line_lock, flags); + + return true; +} + +static bool reload_large_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool, bool nonblocking) +{ + struct list_head *head; + struct list_head *end; + LIST_HEAD(tmp); + int i; + unsigned long flags; + + spin_lock_irqsave(&sbi->large_lock, flags); + + if (nonblocking) { + if (sbi->gpool->nlarges == 0) { + spin_unlock_irqrestore(&sbi->large_lock, flags); + return false; + } + } else { + /* blocking is okay */ + if (sbi->gpool->nlarges <= NR_RESERVED_PAGES) { + spin_unlock_irqrestore(&sbi->large_lock, flags); + return false; + } + } + head = &sbi->gpool->large_list; + end = head; + + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) + end = end->next; + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->large_list); + + sbi->gpool->nlarges -= i; + ppool->nlarges += i; + + spin_unlock_irqrestore(&sbi->large_lock, flags); + + return true; +} + +static bool 
reload_page_from_gpool(struct eufs_sb_info *sbi, + struct mem_pool *ppool, bool nonblocking) +{ + struct list_head *head; + struct list_head *end; + LIST_HEAD(tmp); + int i; + unsigned long flags; + + spin_lock_irqsave(&sbi->page_lock, flags); + + if (nonblocking) { + if (sbi->gpool->npages == 0) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return false; + } + } else { + /* blocking is okay */ + if (sbi->gpool->npages <= NR_RESERVED_PAGES) { + spin_unlock_irqrestore(&sbi->page_lock, flags); + return false; + } + } + head = &sbi->gpool->page_list; + end = head; + + for (i = 0; i < ppool->fetch_count && end->next != head; ++i) + end = end->next; + list_cut_position(&tmp, head, end); + list_splice_tail(&tmp, &ppool->page_list); + + sbi->gpool->npages -= i; + ppool->npages += i; + + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return true; +} + +void revive_rest_pool(struct eufs_sb_info *sbi) +{ + unsigned long flags; + + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + spin_lock(&sbi->line_lock); + spin_lock(&sbi->rest_lock); + + list_splice_init(&sbi->rest_pool->large_list, &sbi->gpool->large_list); + list_splice_init(&sbi->rest_pool->page_list, &sbi->gpool->page_list); + list_splice_init(&sbi->rest_pool->line_list, &sbi->gpool->line_list); + sbi->gpool->nlarges += sbi->rest_pool->nlarges; + sbi->gpool->npages += sbi->rest_pool->npages; + sbi->gpool->nlines += sbi->rest_pool->nlines; + sbi->rest_pool->nlarges = 0; + sbi->rest_pool->npages = 0; + sbi->rest_pool->nlines = 0; + + spin_unlock(&sbi->rest_lock); + spin_unlock(&sbi->line_lock); + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); +} + +static __always_inline int cut_from_list(struct list_head *head, + struct list_head *list, int count) +{ + struct list_head *end = head; + int i; + + for (i = 0; i < count && end->next != head; ++i) + end = end->next; + list_cut_position(list, head, end); + return i; +} + +static void preallocate_pages_from_larges_and_pages(struct eufs_sb_info *sbi, + struct alloc_batch *ab, + size_t count, + struct mem_pool *pool) +{ + struct ptr_list_node *list_node; + long nlarges_needed; + size_t r = 0; + int i; + + WARN(!irqs_disabled(), "Interrupt is not disabled!"); + + WARN(count > pool->nlarges * 512 + pool->npages, + "Invarients violated!"); + + if (count <= pool->npages) { + r = cut_from_list(&pool->page_list, &ab->list, count); + pool->npages -= r; + WARN_ON(r != count); + return; + } + + nlarges_needed = DIV_ROUND_UP(count - pool->npages, 512); + if ((nlarges_needed * 512) < count) { + r = cut_from_list(&pool->page_list, &ab->list, + count - (nlarges_needed * 512)); + WARN_ON(r != count - (nlarges_needed * 512)); + pool->npages -= r; + } + while (nlarges_needed--) { + list_node = list_first_entry(&pool->large_list, + struct ptr_list_node, node); + list_del(&list_node->node); + pool->nlarges--; + list_node->multiple = false; + /* split the large page */ + for (i = 0; i < 512; ++i) { + if (r < count) { + list_add(&list_node->node, &ab->list); + } else { + /* + * When all requested pages come from splitting of + * large pages, the remaining pages needs to add + * the list of normal page + */ + list_add(&list_node->node, &pool->page_list); + pool->npages++; + } + + r++; + list_node++; + } + } +} + +static int preallocate_page_from_pool(struct eufs_sb_info *sbi, + struct alloc_batch *ab, size_t count, + struct mem_pool *ppool) +{ + BUG_ON(!list_empty(&ab->list)); + BUG_ON(count > ppool->nlarges * 512 + ppool->npages); + + /* get locally 
with large pages and pages */ + preallocate_pages_from_larges_and_pages(sbi, ab, count, ppool); + + return 0; +} + +static int preallocate_page_from_gpool(struct eufs_sb_info *sbi, + struct alloc_batch *ab, size_t count) +{ + unsigned long flags; + u64 nlarges_avail = 0; + u64 npages_avail = 0; + + BUG_ON(!list_empty(&ab->list)); + + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + /* enough pages are available? */ + /* + * We have NR_RESERVED_PAGES pages reserved for allocation in page fault + * handlers, so do not use reserved pages if we can gather from other + * CPUs. + * NOTICE: We'd better not to use minus here since sbi->gpool->npages is + * unsigned. + */ + if (sbi->gpool->nlarges > NR_RESERVED_PAGES) + nlarges_avail = sbi->gpool->nlarges - NR_RESERVED_PAGES; + if (sbi->gpool->npages > NR_RESERVED_PAGES) + npages_avail = sbi->gpool->npages - NR_RESERVED_PAGES; + + if (count > nlarges_avail * 512 + npages_avail) { + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + /* unlock and gather page */ + gather_pages(sbi); + /* relock after the gathering */ + spin_lock_irqsave(&sbi->page_lock, flags); + spin_lock(&sbi->large_lock); + /* enough pages this time? */ + if (count > sbi->gpool->nlarges * 512 + sbi->gpool->npages) { + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + return -ENOSPC; + } + } + + /* get locally with large pages and pages */ + preallocate_pages_from_larges_and_pages(sbi, ab, count, sbi->gpool); + + spin_unlock(&sbi->large_lock); + spin_unlock_irqrestore(&sbi->page_lock, flags); + + return 0; +} + +void *nvmalloc_pre_get_from_list(struct super_block *sb, struct list_head *list, + u8 tag) +{ + struct ptr_list_node *list_node = + list_first_entry(list, struct ptr_list_node, node); + void __pmem *page = list_node->ptr; + + list_del(&list_node->node); + list_node->tag = tag; + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", page); + + eufs_dbg("nvallocate pre-from-list: %px bitmap=%d busy=%d\n", page, + EUFS_SB(sb)->page_map[(page - EUFS_SB(sb)->data_start) / + PAGE_SIZE], + EUFS_SB(sb) + ->cached_nodes[(page - EUFS_SB(sb)->data_start) / + PAGE_SIZE] + .busy); + return page; +} + +int nvmalloc_pre(struct super_block *sb, struct alloc_batch *ab, size_t count, + size_t size) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + unsigned long flags; + int cpu; + int r; + /* size other than PAGE_SIZE not supported currently */ + if (size != PAGE_SIZE) + return -EOPNOTSUPP; + + cpu = get_cpu(); + local_irq_save(flags); + + ppool = per_cpu_ptr(sbi->ppool, cpu); + if (count <= ppool->nlarges * 512 + ppool->npages) { + /* get locally */ + r = preallocate_page_from_pool(sbi, ab, count, ppool); + local_irq_restore(flags); + put_cpu(); + return r; + } + + /* get from global pool */ + local_irq_restore(flags); + put_cpu(); + r = preallocate_page_from_gpool(sbi, ab, count); + return r; +} + +/* + * Large: 2M + * Page: 4K + * Line4: 256B + * Line: 64B + */ + +#define LARGE_PAGE_SIZE (2 << 20) + +/* + * get from ppool list, then from the global list if present, + * if failed, break larger units. 
+ */ +static void *try_get_large_page(struct eufs_sb_info *sbi, + struct mem_pool *ppool, u8 tag, + bool nonblocking) +{ + struct ptr_list_node *list_node; + void *ret; + unsigned long flags; + +retry: + if (list_empty(&ppool->large_list) && + !reload_large_from_gpool(sbi, ppool, nonblocking)) + return NULL; + + local_irq_save(flags); + if (list_empty(&ppool->large_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = list_first_entry(&ppool->large_list, struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + ppool->nlarges--; + list_node->tag = tag; + + local_irq_restore(flags); + + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", ret); + + BUG_ON(((u64)ret % LARGE_PAGE_SIZE)); + + return ret; +} + +static void *eufs_try_get_page(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + struct ptr_list_node *node; + void *ret; + unsigned long flags; + void *large; + int i; + u64 page_no; + +retry: + if (list_empty(&ppool->page_list)) { + /* slow path */ + if (!reload_page_from_gpool(sbi, ppool, use_reserved)) { + /* TODO: merge pages back to large pages? */ + large = try_get_large_page(sbi, ppool, 0, use_reserved); + if (!large) + return NULL; + page_no = (large - sbi->data_start) / PAGE_SIZE; + for (i = 1; i < 512; ++i) { + node = &sbi->cached_nodes[page_no + i]; + node->multiple = false; + return_page(sbi, ppool, node, false); + } + sbi->cached_nodes[page_no].multiple = false; + sbi->cached_nodes[page_no].tag = tag; + return large; + } + } + local_irq_save(flags); + if (list_empty(&ppool->page_list)) { + local_irq_restore(flags); + goto retry; + } + list_node = + list_first_entry(&ppool->page_list, struct ptr_list_node, node); + + ret = list_node->ptr; + list_del(&list_node->node); + ppool->npages--; + list_node->tag = tag; + + local_irq_restore(flags); + + /* list_node->solid is unchanged. */ + _SET_BUSY(list_node, "set_busy addr=%px", ret); + + BUG_ON(((u64)ret % PAGE_SIZE)); + + return ret; +} + +/* NOTICE: cpu changes in this function */ +static struct ptr_list_node *split_page_to_lines(struct eufs_sb_info *sbi, + struct mem_pool *ppool, + void *page, bool use_line4) +{ + struct ptr_list_node *node, *ret = NULL; + u64 page_no; + int cpu; + int i; + /* Release the cpu since may need to allocate a page. */ + put_cpu(); + + /* Split the page */ + page_no = (page - sbi->data_start) / PAGE_SIZE; + sbi->line_indicators[page_no] = 0; + + if (sbi->line_node_ptrs[page_no]) { + memclr(sbi->line_node_ptrs[page_no], PAGE_SIZE); + } else { + sbi->line_node_ptrs[page_no] = eufs_zalloc_page(); + BUG_ON(!sbi->line_node_ptrs[page_no]); + } + memclr(page, CACHELINE_SIZE); + + /* cache line 0: bitmap */ + /* cache line 1~3: insert to line_list */ + /* cache line >4: insert to line4_list */ + + /* + * Reget the cpu. The cpu might be different from the + * one we previously got, but it doesn't matter. 
+ */ + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + for (i = 1; i < 64; ++i) { + node = &sbi->line_node_ptrs[page_no][i]; + node->ptr = page + i * CACHELINE_SIZE; + node->busy = false; + node->solid = false; + node->multiple = false; + } + for (i = 1; i < 4; ++i) { + node = &sbi->line_node_ptrs[page_no][i]; + if (!use_line4 && i == 1) { + ret = node; + continue; + } + return_cl(sbi, ppool, node, false); + } + for (i = 4; i < 64; i += 4) { + node = &sbi->line_node_ptrs[page_no][i]; + node->multiple = true; + if (use_line4 && i == 4) { + ret = node; + continue; + } + return_line4(sbi, ppool, node, false); + } + return ret; +} + +static void *try_get_line4(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + unsigned long flags; + void *ret; + +retry: + /* cache line x 4 */ + if (list_empty(&ppool->line4_list)) { + /* Cannot fetch cache lines from gpool, get from page */ + ret = eufs_try_get_page(sbi, ppool, 0, use_reserved); + if (ret == NULL) + return NULL; + + list_node = split_page_to_lines(sbi, ppool, ret, true); + ret = list_node->ptr; + list_node->tag = tag; + goto out; + } + + local_irq_save(flags); + if (list_empty(&ppool->line4_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = list_first_entry(&ppool->line4_list, struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + + ppool->nline4s--; + list_node->tag = tag; + + local_irq_restore(flags); +out: + + _SET_BUSY(list_node, "error cacheline addr=%px", ret); + + return ret; +} + +static void *try_get_line(struct eufs_sb_info *sbi, struct mem_pool *ppool, + u8 tag, bool use_reserved) +{ + struct ptr_list_node *list_node; + struct ptr_list_node *node; + unsigned long flags; + void *ret; + int k; + +retry: + /* cache line x 1 */ + if (list_empty(&ppool->line_list)) { + /* Fetch cache lines from gpool */ + if (!reload_lines_from_gpool(sbi, ppool) /* slow path */) { + if (list_empty(&ppool->line4_list)) { + ret = eufs_try_get_page(sbi, ppool, 0, + use_reserved); + if (ret == NULL) + return NULL; + + list_node = split_page_to_lines(sbi, ppool, ret, + false); + ret = list_node->ptr; + list_node->tag = tag; + goto out; + } else { + local_irq_save(flags); + if (list_empty(&ppool->line4_list)) { + local_irq_restore(flags); + goto retry; + } + list_node = + list_first_entry(&ppool->line4_list, + struct ptr_list_node, + node); + ret = list_node->ptr; + list_del(&list_node->node); + ppool->nline4s--; + list_node->tag = tag; + + list_node->multiple = false; + + for (k = 1; k < 4; ++k) { + node = list_node + k; + node->multiple = false; + list_add(&node->node, + &ppool->line_list); + ppool->nlines++; + } + local_irq_restore(flags); + goto out; + } + } + } + + local_irq_save(flags); + if (list_empty(&ppool->line_list)) { + local_irq_restore(flags); + goto retry; + } + + list_node = + list_first_entry(&ppool->line_list, struct ptr_list_node, node); + ret = list_node->ptr; + list_del(&list_node->node); + + ppool->nlines--; + list_node->tag = tag; + + local_irq_restore(flags); +out: + + _SET_BUSY(list_node, "error cacheline addr=%px", ret); + + return ret; +} + +/* + * If nonblocking is set, we will skip the gather phase and allocate from the + * reserved pages (in gpool) + */ +void *nvmalloc(struct super_block *sb, size_t size, u8 tag, bool nonblocking) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + void __pmem *ret; + int cpu; + u64 npages; + u64 nlines; + bool once_gathered = false; + void 
*(*try_get_)(struct eufs_sb_info *sbi, struct mem_pool *mp, u8 tag, + bool use_reserved); + + if (size == PAGE_SIZE << 9) { + try_get_ = try_get_large_page; + } else if (size == PAGE_SIZE) { + try_get_ = eufs_try_get_page; + } else if (size == CACHELINE_SIZE << 2) { + try_get_ = try_get_line4; + } else if (size == CACHELINE_SIZE) { + try_get_ = try_get_line; + } else { + WARN(1, "EulerFS: INVALID allocation size!"); + return NULL; + } + +gathered_retry: + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + /* + * If we have gathered, we must try our best to allocate, so + * even the reserved pages can be used + */ + ret = try_get_(sbi, ppool, tag, nonblocking || once_gathered); + + if (ret == NULL) { + if (once_gathered || nonblocking) + /* Really full */ + goto full_out; + /* Maybe full. Try gather from other CPUs. */ + put_cpu(); + gather_pages(sbi); + once_gathered = true; + goto gathered_retry; + } + put_cpu(); + + eufs_dbg("nvallocate: %px bitmap=%d busy=%d @cpu=%d\n", ret, + sbi->page_map[(ret - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ret - sbi->data_start) / PAGE_SIZE].busy, + cpu); + + WARN_ON(ret == NULL); + return ret; +full_out: + put_cpu(); + nv_stat(sbi, &npages, &nlines); + pr_warn_ratelimited("EulerFS is FULL! @%d (%lld pages, %lld lines)\n", + smp_processor_id(), npages, nlines); + return NULL; +} + +static void _unset_bitmap(struct eufs_sb_info *sbi, u64 addr, bool flush) +{ + u64 page_no = (addr - (u64)sbi->data_start) / PAGE_SIZE; + u64 rem = addr % PAGE_SIZE; + line_info_t __pmem *line_map; + struct ptr_list_node *node; + int line_no; + + node = sbi->cached_nodes + (page_no); + if (rem == 0) { + /* + * the nvmalloc->nvfree case should be handled when nolde->solid + * is false if the allocation is implemented. Same as below. 
+ */ + if (node->solid) { + BUG_ON(sbi->page_map[page_no] == EUFS_PAGE_FREE); + sbi->page_map[page_no] = EUFS_PAGE_FREE; + if (flush) + eufs_flush_cacheline(&sbi->page_map[page_no]); + } + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE); + node->solid = false; + } else { + /* line */ + BUG_ON(rem % CACHELINE_SIZE != 0); + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE && + sbi->page_map[page_no] != EUFS_PAGE_LINE_USED); + + if (!node->solid) { + /* the allocation is not written yet */ + /* HACK: idempotent */ + if (sbi->page_map[page_no] != EUFS_PAGE_LINE_USED) { + sbi->page_map[page_no] = EUFS_PAGE_LINE_USED; + eufs_flush_cacheline(&sbi->page_map[page_no]); + } + node->solid = true; + } + + node = &sbi->line_node_ptrs[page_no][rem / CACHELINE_SIZE]; + line_map = (void *)(addr - rem); + line_no = rem / CACHELINE_SIZE; + + if (node->solid) { + BUG_ON(line_map[line_no] == EUFS_LINE_FREE); + line_map[line_no] = EUFS_LINE_FREE; + eufs_dbg("unset %px[%d] = 0\n", line_map, line_no); + + if (flush) + eufs_flush_cacheline(&line_map[line_no]); + } + + node->solid = false; + BUG_ON(line_map[line_no] != EUFS_LINE_FREE); + } +} + +void nvfree(struct super_block *sb, void *ptr, bool rest) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + struct mem_pool *ppool; + struct ptr_list_node *node; + s64 offset; + int cpu; + u64 end = sbi->npages * PAGE_SIZE; + + if (ptr == NULL_ADDR_PTR) + return; + + offset = ptr - sbi->data_start; + BUG_ON(offset < 0); + BUG_ON(offset >= end); + + eufs_dbg("%s: %px bitmap=%d busy=%d\n", __func__, ptr, + sbi->page_map[(ptr - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ptr - sbi->data_start) / PAGE_SIZE].busy); + + _unset_bitmap(sbi, (u64)ptr, true); + + cpu = get_cpu(); + ppool = per_cpu_ptr(sbi->ppool, cpu); + if ((u64)ptr % PAGE_SIZE == 0) { + /* page */ + + /* get node */ + node = sbi->cached_nodes + offset / PAGE_SIZE; + node->ptr = ptr; + _SET_NON_BUSY(node, "fault addr %px", ptr); + /* add to page-to-free list */ + if (node->multiple) + WARN_ON_ONCE(1); + else + return_page(sbi, ppool, node, rest); + } else if ((u64)ptr % CACHELINE_SIZE == 0) { + /* cache line */ + + /* get node */ + node = &sbi->line_node_ptrs[offset / PAGE_SIZE] + [offset % PAGE_SIZE / CACHELINE_SIZE]; + _SET_NON_BUSY(node, "fault addr %px", ptr); + /* add to local cl pool */ + if (node->multiple) + return_line4(sbi, ppool, node, rest); + else + return_cl(sbi, ppool, node, rest); + } else { + /* error */ + eufs_warn("!err allocation type!\n"); + } + put_cpu(); + eufs_dbg("%s done: %px bitmap=%d busy=%d\n", __func__, ptr, + sbi->page_map[(ptr - sbi->data_start) / PAGE_SIZE], + sbi->cached_nodes[(ptr - sbi->data_start) / PAGE_SIZE].busy); +} diff --git a/fs/eulerfs/nvalloc.h b/fs/eulerfs/nvalloc.h new file mode 100644 index 000000000000..a39b81862bfb --- /dev/null +++ b/fs/eulerfs/nvalloc.h @@ -0,0 +1,214 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef EUFS_NVALLOC_H +#define EUFS_NVALLOC_H + +#include "euler.h" +#include "euler_common.h" +#include <linux/sort.h> +#include <linux/vmalloc.h> + +/* + * The design of the allocator is hybrid, and the in-dram allocators are + * per-cpu to accelerate the zalloc/zfree process. + * + * This allocator supports only 4K and 64B (one cache line) so no chunks + * are involved in the design. + */ +/* + * In NVM +---> 0/1: 4K/64B + * +-----------------+ | + * | page-map *+-->+----+---1B-in-size+- + * | (page-info) | | type | birth_gen | + * +-----------------+ +------------------+ + * | pages | | type2| check_gen | + * | (actual data)* *| +------------------+ + * +--------------+-++ + * | | + * | +---> +-------------------------+ + * v | gen x 63 + lock/cpu x 1 | + * +-----------+ +-------------------------+ + * | | | cache line data 1 | + * | 4K page | +-------------------------+ + * | data | | cache line data 2 | + * | | +-------------------------+ + * +-----------+ | ... | + * +-------------------------+ + * | cache line data 63 | + * +-------------------------+ + */ + +/* + * In DRAM, percpu + * +----------------+ + * | free-page-list +----->... + * +----------------+ + * | free-line-list +----->... + * +----------------+ + * + * global + * +----------------+ + * | free-page-lists+----> free-page-list --> free-page-list + * +----------------+ + * | free-line-lists +----> free-line-list --> free-line-list + * +----------------+ + */ + +extern int wear_control; +extern int wear_alloc_threshold; + +struct ptr_list_node { + struct list_head node; /* points to next list node */ + void __pmem *ptr; + bool busy; /* whether it is allocated in the volatile allocator */ + bool solid; /* whether it is allocated in the bitmap */ + bool multiple; /* whther it is a linex4/large-page */ + u8 tag; + int counter; /* How many times has it been allocated? */ +}; + +struct mem_pool { + struct list_head page_list; /* points to ptr_lists_node */ + struct list_head line_list; /* points to ptr_lists_node */ + struct list_head line4_list; + struct list_head large_list; + u64 npages; + u64 nlines; + u64 nline4s; + u64 nlarges; + int fetch_count; +}; + +#define _SET_NON_BUSY(node, fmt, args...) \ + do { \ + if (node->busy == false) { \ + eufs_info(fmt, ##args); \ + BUG(); \ + } \ + node->busy = false; \ + } while (0) + +#define _SET_BUSY(node, fmt, args...) 
\ + do { \ + if (node->busy == true) { \ + eufs_info(fmt, ##args); \ + BUG(); \ + } \ + node->busy = true; \ + } while (0) + +#define EUFS_PAGE_FREE (0) +#define EUFS_PAGE_USED (1) +#define EUFS_PAGE_LINE_USED (2) +#define EUFS_PAGE_LARGE_USED (3) +#define EUFS_PAGE_RESERVED (5) + +#define EUFS_LINE_FREE (0) +#define EUFS_LINE_USED (1) +#define EUFS_LINE4_USED (2) + +#define EUFS_PAGE_FILE_DATA (8) +#define EUFS_PAGE_FILE_INDEX (9) +#define EUFS_PAGE_HTABLE (10) +#define EUFS_PAGE_SYMLINK (11) +#define EUFS_PAGE_INODE_EXT (12) + +#define EUFS_LINE4_INODE (4) +#define EUFS_LINE_DENTRY (5) +#define EUFS_LINE_NAME_EXT (6) + +void *nvmalloc_pre_get_from_list(struct super_block *sb, struct list_head *list, + u8 tag); +int nvmalloc_pre(struct super_block *sb, struct alloc_batch *ab, size_t count, + size_t size); +void *nvmalloc(struct super_block *sb, size_t size, u8 tag, bool nonblocking); +void nvfree(struct super_block *sb, void *ptr, bool rest); +void nv_init(struct super_block *sb, bool init); +void nv_fini(struct super_block *sb); +void eufs_get_layout(struct super_block *sb, bool init); + +#define FETCH_COUNT 64 +#define EUFS_PRE_PAGES_PERCPU (4096) + +#define LOCAL_PAGE_MAX (4096 * 8) +#define LOCAL_LINE_MAX (4096) + +#define NR_RESERVED_PAGES (64) + +static __always_inline void print_line_map(line_info_t *line_map, u8 line_num) +{ + int i; + + eufs_info("line_map[line_num]: %px[%u]=%u\n", line_map, line_num, + line_map[line_num]); + eufs_info("line_map=%px ===>\n", line_map); + for (i = 0; i < 8; ++i) { + int i8 = i * 8; + + eufs_info("%d: %u %u %u %u %u %u %u %u\n", i, line_map[i8 + 0], + line_map[i8 + 1], line_map[i8 + 2], line_map[i8 + 3], + line_map[i8 + 4], line_map[i8 + 5], line_map[i8 + 6], + line_map[i8 + 7]); + } +} + +static __always_inline void nv_stat(struct eufs_sb_info *sbi, u64 *page, + u64 *line) +{ + struct mem_pool *ppool; + u64 nlarges = sbi->gpool->nlarges; + u64 npages = sbi->gpool->npages; + u64 nline4s = 0; + u64 nlines = sbi->gpool->nlines; + int cpu; + + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + + nlarges += ppool->nlarges; + npages += ppool->npages; + nline4s += ppool->nline4s; + nlines += ppool->nlines; + } + *page = npages + (nlarges << 9); + *line = nlines + (nline4s << 2); +} + +static __always_inline void print_stats(struct eufs_sb_info *sbi) +{ + struct mem_pool *ppool; + int cpu; + u64 nlarges = sbi->gpool->nlarges; + u64 npages = sbi->gpool->npages; + u64 nline4s = 0; + u64 nlines = sbi->gpool->nlines; + + eufs_info("Stat: (g,%lld,%lld), ", sbi->gpool->npages, + sbi->gpool->nlines); + for_each_online_cpu(cpu) { + ppool = per_cpu_ptr(sbi->ppool, cpu); + + nlarges += ppool->nlarges; + npages += ppool->npages; + nline4s += ppool->nline4s; + nlines += ppool->nlines; + + eufs_info("(@%d,%lld,%lld,%lld,%lld) ", cpu, ppool->nlarges, + ppool->npages, ppool->nline4s, ppool->nlines); + } + eufs_info("= (summary: larges=%lld pages=%lld line4s=%lld lines=%lld)\n", + nlarges, npages, nline4s, nlines); +} + +#endif /* EUFS_NVALLOC_H */
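
Note (illustration only, not part of the applied diff): the batch
pre-allocation pair nvmalloc_pre()/nvmalloc_pre_get_from_list() has no
caller inside this patch. A minimal sketch of the intended usage follows;
eufs_example_reserve() is a hypothetical helper, and it assumes that
struct alloc_batch (declared elsewhere in EulerFS) only needs its list
head initialized before the call.

static int eufs_example_reserve(struct super_block *sb, void **pages,
				size_t count)
{
	struct alloc_batch ab;
	size_t i;
	int err;

	INIT_LIST_HEAD(&ab.list);

	/* only size == PAGE_SIZE is supported; -ENOSPC when space runs out */
	err = nvmalloc_pre(sb, &ab, count, PAGE_SIZE);
	if (err)
		return err;

	/*
	 * Pop the reserved pages one by one, tagging them as file data.
	 * Persisting the allocations in the on-media page_map is done
	 * separately and is out of scope here.
	 */
	for (i = 0; i < count; i++)
		pages[i] = nvmalloc_pre_get_from_list(sb, &ab.list,
						      EUFS_PAGE_FILE_DATA);
	return 0;
}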