From: Yu Kuai yukuai3@huawei.com
hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/I40JRR CVE: NA
--------------------------------------
These interfaces will be implemented and used in later patches.
Signed-off-by: Mingkai Dong dongmingkai1@huawei.com Signed-off-by: Hou Tao houtao1@huawei.com Signed-off-by: Yu Kuai yukuai3@huawei.com Reviewed-by: Hou Tao houtao1@huawei.com Signed-off-by: Zheng Zengkai zhengzengkai@huawei.com --- fs/eulerfs/const.h | 80 ++++++++++ fs/eulerfs/euler.h | 84 ++++++++++ fs/eulerfs/euler_common.h | 225 +++++++++++++++++++++++++++ fs/eulerfs/euler_dbg.h | 36 +++++ fs/eulerfs/euler_def.h | 201 ++++++++++++++++++++++++ fs/eulerfs/nvm_struct.h | 297 +++++++++++++++++++++++++++++++++++ fs/eulerfs/pbatch.h | 314 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 1237 insertions(+) create mode 100644 fs/eulerfs/const.h create mode 100644 fs/eulerfs/euler.h create mode 100644 fs/eulerfs/euler_common.h create mode 100644 fs/eulerfs/euler_dbg.h create mode 100644 fs/eulerfs/euler_def.h create mode 100644 fs/eulerfs/nvm_struct.h create mode 100644 fs/eulerfs/pbatch.h
diff --git a/fs/eulerfs/const.h b/fs/eulerfs/const.h new file mode 100644 index 000000000000..1e3485ecc8a0 --- /dev/null +++ b/fs/eulerfs/const.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_CONST_H +#define EUFS_CONST_H + +/* EULER */ +#define EUFS_SUPER_MAGIC 0x50C9 +/* Mount flags */ +#define EUFS_MOUNT_ERRORS_RO 0x000001 /* Remount fs ro on errors */ +#define EUFS_MOUNT_ERRORS_PANIC 0x000002 /* Panic on errors */ +#define EUFS_MOUNT_FORMAT 0x000004 /* was FS formatted on mount? 
*/ + +#define NULL_ADDR ((u64)-1ll) +#define NULL_VAL (0) +#define NULL_ADDR_PTR ((void *)(NULL_ADDR)) + +/* FS Limits */ +#define EUFS_MAX_NAME_LEN (255) +#define EUFS_LINK_MAX (32000) /* max links to a file */ + +/* layout: hash_len (u64) + sym_link + trailing zero */ +#define EUFS_MAX_SYMLINK_LEN (PAGE_SIZE - sizeof(u64) - 1) +#define EUFS_SYMLINK_HASHLEN_LEN(hashlen) (((hashlen) >> 48) & 0xfff) +#define EUFS_SYMLINK_SIZE(len) ((len) + sizeof(u64) + 1) + +#define EUFS_BLOCK_SIZE (4096) +#define EUFS_BLOCK_SIZE_BITS (12) + +/* The initial height is 0 when the file tree contains no or one block */ +#define EUFS_MAX_FILE_TREE_HEIGHT 3 +#define EUFS_FILE_TREE_DEGREE_SHIFT 9 +#define EUFS_FILE_TREE_DEGREE (1U << EUFS_FILE_TREE_DEGREE_SHIFT) +#define EUFS_MAX_FILE_BLK_CNT \ + (1ll << (EUFS_MAX_FILE_TREE_HEIGHT * EUFS_FILE_TREE_DEGREE_SHIFT)) +#define EUFS_MAX_FILE_SIZE (4096ll * EUFS_MAX_FILE_BLK_CNT) + +#define EUFS_POISON_POINTER ((void *)0x1010101010101010UL) +#define EUFS_POISON_VALUE ((u64)0x1010101010101010UL) + +#define CACHELINE_SIZE (64) + +#define EUFS_ALLOC_BLOCKS_ZERO_NONE (0x0) /* Zero none NULL_ADDR pages */ +#define EUFS_ALLOC_BLOCKS_ZERO_ALL (0x1) /* Zero all NULL_ADDR pages */ +#define EUFS_ALLOC_BLOCKS_ZERO_EDGE (0x2) /* Zero edge NULL_ADDR pages */ + +#define EUFS_INODE_SIZE (CACHELINE_SIZE * 2) + +#define NV_DICT_CAPACITY (512ULL) +/* + * EOC stands for "End Of Chain". + * + * When volatile bucket (namely table[idx]) is EUFS_DIR_EOC_PTR, + * it means that both volatile bucket and persist bucket are empty. + * When volatile bucket is NULL, it just means that volatile + * bucket is empty. + * + * When volatile_next is EUFS_DIR_EOC, it means current entry is + * the last one in the chain although its next may still points + * to an entry (because the setting and persistence of next are + * deferred). When volatile_next is NULL, it means next should be + * checked to ensure whether or not the current entry is the last + * one in the chain. 
+ */ +#define EUFS_DIR_EOC ((u64)-1) +#define EUFS_DIR_EOC_PTR ((void *)EUFS_DIR_EOC) +/* DIR DELeted NEW dentry */ +#define EUFS_DIR_DELNEW ((u64)0x3030303030303030UL) + +#endif /* EUFS_CONST_H */ diff --git a/fs/eulerfs/euler.h b/fs/eulerfs/euler.h new file mode 100644 index 000000000000..0abb7602bb63 --- /dev/null +++ b/fs/eulerfs/euler.h @@ -0,0 +1,84 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_H +#define EUFS_H + +#include <linux/crc16.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/version.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/uio.h> +#include <linux/mutex.h> +#include <linux/slab.h> + +/* annotation for pointer to persistent memory */ +#define __pmem + +#define NV_CHECK (0) + +#if NV_CHECK +#pragma message "NV CHECK IS TURNED ON! NO PERF. EVAL.!" 
+#endif + +#if NV_CHECK +#define NV_ASSERT(x) \ + do { \ + if (!(x)) { \ + eufs_warn("assertion failed %s:%d: %s\n", __FILE__, \ + __LINE__, #x); \ + } \ + WARN(!(x), "detail:"); \ + } while (0) +#else +#define NV_ASSERT(x) +#endif + +#include "const.h" +#include "euler_dbg.h" +#include "nvm_struct.h" +#include "euler_def.h" +#include "kmem_cache.h" +#include "flush.h" +#include "euler_common.h" +#include "inode.h" +#include "nvalloc.h" + +extern int num_sockets; + +/* Function Prototypes */ +extern __printf(2, 3) void eufs_error_mng(struct super_block *sb, + const char *fmt, ...); + +/* dir.c */ +extern const struct file_operations eufs_dir_operations; + +/* file.c */ +extern const struct inode_operations eufs_file_inode_operations; +extern const struct file_operations eufs_file_operations; +int eufs_fsync(struct file *file, loff_t start, loff_t end, int datasync); + +/* inode.c */ +extern const struct address_space_operations eufs_aops; + +/* namei.c */ +extern const struct inode_operations eufs_dir_inode_operations; +extern const struct inode_operations eufs_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations eufs_symlink_inode_operations; + +#endif /* EUFS_H */ diff --git a/fs/eulerfs/euler_common.h b/fs/eulerfs/euler_common.h new file mode 100644 index 000000000000..b7684de19c7d --- /dev/null +++ b/fs/eulerfs/euler_common.h @@ -0,0 +1,225 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef EUFS_COMMON_H +#define EUFS_COMMON_H + +#include <linux/crc16.h> +#include <linux/crc32.h> +#include <linux/crc32c.h> +#include <linux/pagemap.h> +#include <linux/types.h> +#include <linux/rcupdate.h> +#include <linux/uio.h> +#include <linux/mutex.h> +#include <linux/version.h> +#include <linux/slab.h> +#include <linux/fs.h> + +#ifndef EUFS_H +#error "Please include euler_common.h by including euler.h" +#endif + +#define EUFS_INODE_CNT_IN_RENAME 4 + +#define PAGE_DIV_ROUND_UP(x) (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT) +#define PAGE_DIV_ROUND_DOWN(x) (((x)) >> PAGE_SHIFT) + +#define clear_opt(o, opt) (o &= ~EUFS_MOUNT_##opt) +#define set_opt(o, opt) (o |= EUFS_MOUNT_##opt) +#define test_opt(sb, opt) (EUFS_SB(sb)->s_mount_opt & EUFS_MOUNT_##opt) + +static __always_inline void *o2p(struct super_block *sb, u64 offset); +static __always_inline u64 p2o(struct super_block *sb, void *ptr); + +static __always_inline struct eufs_sb_info *EUFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static __always_inline struct eufs_inode_info *EUFS_I(struct inode *inode) +{ + return container_of(inode, struct eufs_inode_info, vfs_inode); +} + +static __always_inline struct eufs_inode *EUFS_PI(struct inode *inode) +{ + return (struct eufs_inode *)o2p(inode->i_sb, inode->i_ino); +} + +static __always_inline unsigned long eufs_pi2ino(struct super_block *sb, + struct eufs_inode *pi) +{ + return p2o(sb, EUFS_HEAD_PI(pi)); +} + +static __always_inline struct eufs_super_block * +eufs_get_super(struct super_block *sb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return (struct eufs_super_block *)sbi->virt_addr; +} + +static __always_inline void *eufs_get_renamej(struct super_block *sb, int cpu) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + + return (void *)((u64)sbi->renamej + EUFS_RENAMEJ_ENTRY_SIZE * cpu); +} + +/* + * o: offset: u64 + * p: pointer: void * + * s: storage: __le64 + */ +static __always_inline void *o2p(struct super_block *sb, u64 offset) +{ + if 
(offset == 0) + return NULL; + if (offset == -1) + return (void *)-1; + return (void *)(EUFS_SB(sb)->virt_addr + offset); +} + +static __always_inline u64 p2o(struct super_block *sb, void *ptr) +{ + if (ptr == NULL) + return 0; + if (ptr == (void *)-1) + return -1; + return (u64)(ptr - EUFS_SB(sb)->virt_addr); +} + +/* pointer to storage */ +static __always_inline __le64 p2s(struct super_block *sb, void *ptr) +{ + return cpu_to_le64(p2o(sb, ptr)); +} +/* storage to pointer */ +static __always_inline void *s2p(struct super_block *sb, __le64 s) +{ + return o2p(sb, le64_to_cpu(s)); +} + +static __always_inline bool +eufs_access_ok(struct super_block *sb, const void *pointer, unsigned long sz) +{ + return true; +} + +#define eufs_ptr_fast_check_b(ptr) 0 + +#define eufs_ptr_fast_check(ptr) BUG_ON(eufs_ptr_fast_check_b(ptr)) + +#define HASHLEN_LEN(hashlen) (((hashlen) >> 48) & 0xff) + +static __always_inline hashlen_t hash(const char *name, size_t len) +{ + static const int seed = 131; + u64 r = 0; + int i; + + for (i = 0; i < len; ++i) + r = r * seed + (int)name[i]; + + return (u64)len << 48 | (r & 0xffffffffffff); +} + +static __always_inline bool key_equals(struct super_block *sb, const char *key, + hashlen_t hashlen, + const struct nv_dict_entry *de) +{ + int len; + struct nv_name_ext *p; + + NV_ASSERT(key); + NV_ASSERT(hashlen); + + if (hashlen != de->hv) + return false; + len = HASHLEN_LEN(hashlen); + if (likely(len <= FIRST_LEN)) + return memcmp(de->name, key, len) == 0; + if (memcmp(de->name, key, FIRST_LEN)) + return false; + eufs_dbg("first len ok\n"); + len -= FIRST_LEN; + p = s2p(sb, de->nextname); + key += FIRST_LEN; + while (len > FOLLOW_LEN) { + eufs_dbg("check again p:%*s key:%*s\n", (int)FOLLOW_LEN, + p->name, (int)FOLLOW_LEN, key); + if (memcmp(p->name, key, FOLLOW_LEN)) + return false; + p = s2p(sb, p->nextname); + key += FOLLOW_LEN; + len -= FOLLOW_LEN; + } + eufs_dbg("final check name p:%*s key:%*s\n", len, p->name, len, key); + return 
!memcmp(p->name, key, len); +} + +static __always_inline void eufs_flush_pi(struct eufs_inode *pi) +{ + eufs_flush_cacheline(pi); + eufs_flush_cacheline(&pi->i_fresh); +} + +static __always_inline void inode_dep_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline void inode_dep_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline int inode_is_dep_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_dep_lock); +} +static __always_inline void inode_header_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_header_lock); +} +static __always_inline void inode_header_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_header_lock); +} +static __always_inline int inode_is_header_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_header_lock); +} +static __always_inline void inode_urgent_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_urgent_mutex); +} +static __always_inline void inode_urgent_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_urgent_mutex); +} +static __always_inline int inode_is_urgent_locked(struct inode *inode) +{ + return mutex_is_locked(&EUFS_I(inode)->i_urgent_mutex); +} + +static __always_inline void inode_leaf_lock(struct inode *inode) +{ + mutex_lock(&EUFS_I(inode)->i_leaf_lock); +} + +static __always_inline void inode_leaf_unlock(struct inode *inode) +{ + mutex_unlock(&EUFS_I(inode)->i_leaf_lock); +} + +#endif /* EUFS_COMMON_H */ diff --git a/fs/eulerfs/euler_dbg.h b/fs/eulerfs/euler_dbg.h new file mode 100644 index 000000000000..fbd3851cb5cd --- /dev/null +++ b/fs/eulerfs/euler_dbg.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DBG_H +#define EUFS_DBG_H + +/* + * Debug code + */ + +#ifdef pr_fmt +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#endif + +#define eufs_dbg(s, args...) +#define eufs_dbg_vlimit(s, args...) +#define eufs_dbg_dir(s, args...) + +#define eufs_crit(s, args...) pr_crit(s, ##args) +#define eufs_err(sb, s, args...) eufs_error_mng(sb, s, ##args) +#define eufs_warn(s, args...) pr_warn(s, ##args) +#define eufs_info(s, args...) \ + pr_info("(pid=%d,cmd=%s) " s, current->pid, current->comm, ##args) + +#endif /* EUFS_DBG_H */ diff --git a/fs/eulerfs/euler_def.h b/fs/eulerfs/euler_def.h new file mode 100644 index 000000000000..727f1c4cf181 --- /dev/null +++ b/fs/eulerfs/euler_def.h @@ -0,0 +1,201 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_DEF_H +#define EUFS_DEF_H + +#ifndef EUFS_H +#error "Do not include euler_def.h directly. Include euler.h instead." 
+#endif + +#include <linux/cpufeature.h> +#include <linux/processor.h> +#include <linux/types.h> +#include <linux/magic.h> +#include <linux/delay.h> + +struct alloc_batch { + /* both in slots */ + long size; + long n_used; + void **batch; + long n_pending; + struct list_head list; +}; + +struct v_dict; + +enum { I_TRANS_NONE = 0, I_TRANS_AVAIL, I_TRANS_LOCKED }; + +struct eufs_inode_info { + struct list_head i_dep_list; /* A list of struct op_node to persist */ + /* protect operations on i_dep_list */ + struct mutex i_dep_lock; + + struct llist_node i_persistee_node; + + u32 i_next_dep_seq; + u32 i_persisted_dep_seq; + spinlock_t i_owner_lock; + struct list_head i_owner_list; + + /* regular file: pmem pointer */ + void __pmem *i_volatile_root; + struct v_dict *i_volatile_dict; + /* + * serialize the insertion of dependency nodes into the same + * directory by different processes or CPUs + */ + struct mutex i_header_lock; + + struct mutex i_urgent_mutex; + + int i_volatile_height; + u64 i_volatile_tree_blocks; + + u64 i_dotdot; + /* + * a inode can only be added into a persistence list once, + * so use i_is_persisting & inode_lock to ensure that. 
+ */ + bool i_is_persisting; + /* whether or not the inode needs persistence */ + bool i_is_dirty; + + int i_lock_transferred; + + bool hole_at_sta; /* the 0th data block is a hole */ + + u64 i_ext; + u16 i_version; + + struct alloc_batch page_batch; + /* serialize mmap with truncate/fallocate/write/unlink */ + struct rw_semaphore mmap_rwsem; + /* Protect pointers to leaf nodes (data pages) */ + struct mutex i_leaf_lock; + + spinlock_t i_dentry_persist_lock; + struct inode vfs_inode; +}; + +typedef u8 page_info_t; +struct page_wear; + +/* + * EulerFS super-block data in memory + */ +struct eufs_sb_info { + struct block_device *s_bdev; + struct dax_device *s_dax_dev; + phys_addr_t phys_addr; + void __pmem *virt_addr; + struct vm_struct *vm; + unsigned long block_start; + unsigned long block_end; + + void __pmem *renamej; + + u64 s_crash_ver; + + /* protects the SB's buffer-head */ + struct mutex s_lock; + + unsigned long blocksize; + unsigned long initsize; + unsigned long s_mount_opt; + atomic_t next_generation; + + /* Begin of Allocator */ + /* DRAM pools: + * - a single global pool + * - protected by page_lock and line_lock + * - a local pool per cpu + * - allocate/free from global pool in batch + * - no locks needed + * - a single (global) rest pool + * - when a page is used too many times, it is put into rest pool + * - cache lines are never put in rest pool + */ + spinlock_t large_lock; + spinlock_t page_lock; + spinlock_t line_lock; + struct mem_pool *gpool; + struct mem_pool *ppool; /* percpu variable */ + + spinlock_t rest_lock; + struct mem_pool *rest_pool; + + page_info_t __pmem *page_map; + void __pmem *data_start; + u64 npages; + + /* Other DRAM structures for the allocator: + * + * - struct ptr_list_node: a unit for allocation (i.e., a page + * or a cacheline). + * + * - cached nodes: preallocated ptr_list_node for all pages, indexed by + * the page number. If the page is free, its ptr_list_node should + * be in some mem_pool. 
+ * + * - line_node_ptrs: preallocated pointers for all pages. For each + * page, the pointer may point to an array of + * (PAGE_SIZE/CACHELINE_SIZE) ptr_list_nodes, each of which presents + * the allocation status of the corresponding cache line in the page. + * The array is dynamically allocated for memory conservation. + * + * - line_indicators: preallocated u8s for all pages. Each of the u8s + * records the number of cache lines available in global pool. This + * is used for cacheline coalescence. + * + * - page_wears: preallocated ints for all pages. Each of the ints + * records the number of writes to the page. This is used to + * coarse-grainedly show the degree of wear. + * + */ + struct ptr_list_node *cached_nodes; + struct ptr_list_node **line_node_ptrs; + u8 *line_indicators; /* Number of lines used per page! */ + + struct page_wear *page_wears; + + /* End of Allocator */ + + /* Begin of Persister */ + /* kmem cache for dep_node is universal defined in super.c */ + struct llist_head *persistee_list; /* percpu variable */ + struct task_struct **persisters; + bool *need_sync; /* for fssync */ + wait_queue_head_t sync_wq; /* for fssync's thread */ + struct mutex sync_mutex; /* serialize fssync request */ + /* End of Persister */ + + /* The word `draining` is reserved for volatility quota limitation */ + bool s_draining; + wait_queue_head_t s_draining_wq; + + atomic_t s_nr_dirty_inodes; + atomic_t s_nr_dep_nodes; + + struct mutex gather_mutex; +}; + +struct dir_scan_data { + struct super_block *sb; + struct dir_context *ctx; +}; + +typedef u64 hashlen_t; + +#endif /* EUFS_DEF_H */ diff --git a/fs/eulerfs/nvm_struct.h b/fs/eulerfs/nvm_struct.h new file mode 100644 index 000000000000..4818ae6d49bf --- /dev/null +++ b/fs/eulerfs/nvm_struct.h @@ -0,0 +1,297 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. 
+ * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_NVM_STRUCT_H +#define EUFS_NVM_STRUCT_H + +#define EUFS_SB_SIZE 512 +#define EUFS_SB2_OFFSET 512 +#define EUFS_SB_PADDING (1024 * 2) + +/* Used by rename journal */ +#define EUFS_MAX_CPU_CNT 128 +#define EUFS_RENAMEJ_ENTRY_SIZE (1024) +#define EUFS_RENAMEJ_SIZE (EUFS_MAX_CPU_CNT * EUFS_RENAMEJ_ENTRY_SIZE) +#define EUFS_RENAMEJ_OFFSET (EUFS_SB_SIZE * 2 + EUFS_SB_PADDING) + +#define EUFS_CRC_SEED (~0) +#define EUFS_RENAME_IN_ACTION 1 + +/* + * Layout + * +------------------------+ + * | Super Block | 64B + * +------------------------+ + * | Padding | 512B-64B + * +------------------------+ + * | Secondary Super Block | 64B + * +------------------------+ + * | Padding | Aligned to 4K + * +------------------------+ + * + * +------------------------+ + * | | + * | pages | + * | | + * +------------------------+ + * | bitmap for pages | 4K-aligned + * +------------------------+ + * | Rename-Journals | 128K (128 cores * 1024B/core) + * +------------------------+ + * | | + * | pages | + * | | + * +------------------------+ + */ +/* + * Structure of the EulerFS super block. 
+ */ +struct eufs_super_block { + /* checksum of this sb */ + __le16 s_sum; + /* magic signature */ + __le16 s_magic; + char s_safe_umount; + char s_flag; + __le16 s_fs_version; + /* 8 Bytes */ + + /* total size of fs in bytes */ + __le64 s_size; + /* base virtual address used in fs */ + __le64 s_virt_addr; + /* 24 Bytes */ + + char s_volume_name[16]; + /* 40 Bytes */ + + /* points to the location of mini-journal and rename journal */ + __le64 s_page_map; + /* 48 Bytes */ + + /* + * s_mtime(mount time) and s_wtime(write time) should be together and + * their order should not be changed. we use an 8 byte write to update + * both of them atomically. + */ + __le32 s_mtime; + __le32 s_wtime; + /* 56 Bytes */ + + __le64 s_root_pi; + /* 64 Bytes */ + __le64 s_crash_ver; +}; + +/* ========== directory & hash ========== */ +#define FIRST_LEN (CACHELINE_SIZE - sizeof(__le64) * 5) +#define FOLLOW_LEN (CACHELINE_SIZE - sizeof(__le64)) + +typedef u64 hashlen_t; +struct nv_dict { + __le64 __pmem table[NV_DICT_CAPACITY]; /* <struct nv_dict_entry *> */ +} __aligned(PAGE_SIZE); + +struct nv_dict_entry { + /* half a cache line (8B * 4) size in total */ + __le64 inode; /* <struct eufs_inode *> */ + __le64 next; /* <struct nv_dict_entry *> */ + __le64 volatile_next; /* <struct nv_dict_entry *> */ + /* store some filename */ + __le64 hv; /* <hashlen_t> hashlen */ + __le64 nextname; /* <char *> */ + char name[FIRST_LEN]; +} __aligned(CACHELINE_SIZE); + +struct nv_name_ext { + char name[FOLLOW_LEN]; + __le64 nextname; +} __aligned(CACHELINE_SIZE); + +#define EUFS_IS_HEAD_PI(pi) (!((u64)(pi) & (0x100 - 1))) + +#define EUFS_TWIN_PI(pi) \ + (EUFS_IS_HEAD_PI(pi) ? (((struct eufs_inode *)(pi)) + 1) : \ + (((struct eufs_inode *)(pi)) - 1)) + +#define EUFS_FRESH_PI(pi) \ + (((pi)->i_fresh >= EUFS_TWIN_PI(pi)->i_fresh) ? (pi) : \ + EUFS_TWIN_PI(pi)) + +#define EUFS_HEAD_PI(pi) (EUFS_IS_HEAD_PI(pi) ? 
(pi) : EUFS_TWIN_PI(pi)) + +/* ========== inode ========== */ +struct eufs_inode { + /* Cacheline 1: readmost part */ + /* 0 ~ 8 */ + __le32 i_flags; /* Inode flags */ + __le16 i_mode; /* File mode */ + __le16 i_version; /* Inode version */ + /* 8 ~ 16 */ + /* Note: the ctime to report is max(i_ctime, i_mtime) */ + __le64 i_ctime; /* Inode modification time (only for metadata) */ + /* 16 ~ 24 */ + __le32 i_uid; /* Owner Uid */ + __le32 i_gid; /* Group Id */ + /* 24 ~ 32 */ + __le64 i_dotdot; /* <struct eufs_inode *> parent inode (dir only) */ + /* 32 ~ 40 */ + __le64 i_ext; /* reserved for extension */ + /* 40 ~ 48 */ + __le32 i_ctime_nsec; /* nano sec */ + /* 48 ~ 56 */ + __le64 padding1; + /* 56 ~ 64 */ + __le64 padding2; + + /* Cacheline 2: readmost part */ + /* readwrite part */ + /* 0 ~ 8 */ + __le32 i_generation; /* File version (for NFS) */ + __le16 i_nlink; /* Links count */ + /* + * Freshness: we have twin-inodes here. When we access an inode, + * we compare the freshness of the two inodes and use the one with + * higher freshness. The freshness is only 16-bit, but we can easily + * handle the overflow. 
+ */ + __le16 i_fresh; /* Freshness of the inode */ + /* 8 ~ 16 */ + __le64 i_mtime; /* Inode b-tree Modification time */ + /* 16 ~ 24 */ + __le64 i_atime; /* Access time */ + /* 24 ~ 32 */ + union { + __le64 i_root; /* btree root (regular only) */ + __le64 i_dict; /* dict root (dir only */ + __le32 i_rdev; /* major/minor (device only) */ + }; + /* 32 ~ 40 */ + /* + * Size: + * for directory: number of entries inside + * for regular: number of bytes stored + * others: not used + */ + __le64 i_size; /* Size of data in bytes */ + /* 40 ~ 48 */ + __le64 i_tree_blocks; /* #blocks allocated in btree (regular only) */ + + /* 48 ~ 56 */ + __le32 i_mtime_nsec; /* nano sec */ + __le32 i_atime_nsec; /* nano sec */ + /* 56 ~ 64 */ + __le64 padding3; +} __aligned(CACHELINE_SIZE); + +#define eufs_iread_flags(i) (le32_to_cpu((i)->i_flags)) +#define eufs_iread_mode(i) (le16_to_cpu((i)->i_mode)) +#define eufs_iread_ctime(i) (le64_to_cpu((i)->i_ctime)) +#define eufs_iread_uid(i) (le32_to_cpu((i)->i_uid)) +#define eufs_iread_gid(i) (le32_to_cpu((i)->i_gid)) +#define eufs_iread_dotdot(i) (le64_to_cpu((i)->i_dotdot)) + +#define eufs_iwrite_flags(i, v) ((i)->i_flags = cpu_to_le32(v)) +#define eufs_iwrite_mode(i, v) ((i)->i_mode = cpu_to_le16(v)) +#define eufs_iwrite_ctime(i, v) ((i)->i_ctime = cpu_to_le64(v)) +#define eufs_iwrite_uid(i, v) ((i)->i_uid = cpu_to_le32(v)) +#define eufs_iwrite_gid(i, v) ((i)->i_gid = cpu_to_le32(v)) +#define eufs_iwrite_dotdot(i, v) ((i)->i_dotdot = cpu_to_le64(v)) + +#define eufs_iread_version(i) (le16_to_cpu((i)->i_version)) +#define eufs_iread_ctime_nsec(i) (le32_to_cpu((i)->i_ctime_nsec)) +#define eufs_iread_ext(i) (le64_to_cpu((i)->i_ext)) +#define eufs_iwrite_version(i, v) ((i)->i_version = cpu_to_le16(v)) +#define eufs_iwrite_ctime_nsec(i, v) ((i)->i_ctime_nsec = cpu_to_le32(v)) +#define eufs_iwrite_ext(i, v) ((i)->i_ext = cpu_to_le64(v)) + +#define eufs_writemostly_inode(i) ((i)) + +#define eufs_iread_generation(i) \ + 
(le32_to_cpu(eufs_writemostly_inode(i)->i_generation)) +#define eufs_iread_nlink(i) (le16_to_cpu(eufs_writemostly_inode(i)->i_nlink)) +#define eufs_iread_mtime(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_mtime)) +#define eufs_iread_atime(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_atime)) +#define eufs_iread_root(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_root)) +#define eufs_iread_dict(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_dict)) +#define eufs_iread_rdev(i) (le32_to_cpu(eufs_writemostly_inode(i)->i_rdev)) +#define eufs_iread_size(i) (le64_to_cpu(eufs_writemostly_inode(i)->i_size)) +#define eufs_iread_tree_blocks(i) \ + (le64_to_cpu(eufs_writemostly_inode(i)->i_tree_blocks)) + +#define eufs_iwrite_generation(i, v) \ + (eufs_writemostly_inode(i)->i_generation = cpu_to_le32(v)) +#define eufs_iwrite_nlink(i, v) \ + (eufs_writemostly_inode(i)->i_nlink = cpu_to_le16(v)) +#define eufs_iwrite_mtime(i, v) \ + (eufs_writemostly_inode(i)->i_mtime = cpu_to_le64(v)) +#define eufs_iwrite_atime(i, v) \ + (eufs_writemostly_inode(i)->i_atime = cpu_to_le64(v)) +#define eufs_iwrite_root(i, v) \ + (eufs_writemostly_inode(i)->i_root = cpu_to_le64(v)) +#define eufs_iwrite_dict(i, v) \ + (eufs_writemostly_inode(i)->i_dict = cpu_to_le64(v)) +#define eufs_iwrite_rdev(i, v) \ + (eufs_writemostly_inode(i)->i_rdev = cpu_to_le32(v)) +#define eufs_iwrite_size(i, v) \ + (eufs_writemostly_inode(i)->i_size = cpu_to_le64(v)) +#define eufs_iwrite_tree_blocks(i, v) \ + (eufs_writemostly_inode(i)->i_tree_blocks = cpu_to_le64(v)) + +#define eufs_iread_mtime_nsec(i) \ + (le32_to_cpu(eufs_writemostly_inode(i)->i_mtime_nsec)) +#define eufs_iread_atime_nsec(i) \ + (le32_to_cpu(eufs_writemostly_inode(i)->i_atime_nsec)) +#define eufs_iwrite_mtime_nsec(i, v) \ + (eufs_writemostly_inode(i)->i_mtime_nsec = cpu_to_le32(v)) +#define eufs_iwrite_atime_nsec(i, v) \ + (eufs_writemostly_inode(i)->i_atime_nsec = cpu_to_le32(v)) + +static inline void eufs_iwrite_ctime_mtime(struct eufs_inode *pi, + struct 
inode *vi) +{ + eufs_iwrite_ctime(pi, vi->i_ctime.tv_sec); + eufs_iwrite_ctime_nsec(pi, vi->i_ctime.tv_nsec); + + eufs_iwrite_mtime(pi, vi->i_mtime.tv_sec); + eufs_iwrite_mtime_nsec(pi, vi->i_mtime.tv_nsec); +} + +struct eufs_renamej { + __le32 crc; + __le32 flags; + __le64 addr_of_oldnext; + __le64 oldnext; + __le64 addr_of_newde; + __le64 composed_newde; /* composed as list header */ + __le64 newde_inode; + __le64 old_dir_pi; + __le64 new_dir_pi; + + __le64 time; + __le32 time_nsec; + __le16 old_link; + __le16 new_link; + __le32 old_size; + __le32 new_size; + __u8 pad[40]; +} __aligned(CACHELINE_SIZE); + +typedef u8 page_info_t; +typedef u8 line_info_t; + +struct embedded_line_info { + line_info_t gens[64]; +}; + +#endif /* EUFS_NVM_STRUCT_H */ diff --git a/fs/eulerfs/pbatch.h b/fs/eulerfs/pbatch.h new file mode 100644 index 000000000000..1a7bcf089213 --- /dev/null +++ b/fs/eulerfs/pbatch.h @@ -0,0 +1,314 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef EUFS_PBATCH_H +#define EUFS_PBATCH_H + +/** + * To prevent data races, only two cases are allowed: + * 1) nvmalloc -> alloc_batch_persist -> nvfree + * 2) nvmalloc -> nvfree + */ + +/** + * eufs_alloc_batch_* API usage: + * + * struct alloc_batch batch; + * [ eufs_alloc_batch_init(&batch, estimated_size); ] + * eufs_alloc_batch_hint(&batch, estimated_size); + * eufs_alloc_batch_add(&batch, the_page_pointer); + * eufs_alloc_batch_add(&batch, the_page_pointer); + * ... 
+ * eufs_alloc_batch_add(&batch, the_page_pointer); + * eufs_alloc_batch_persist_reset(&batch); + * + * eufs_alloc_batch_fini(&batch); + * + */ +/* TODO: consider using list? */ + +#define EUFS_AB_MAX_SIZE (KMALLOC_MAX_SIZE / 8) + +/* log2(cache_line size / page_info_t size) */ +#define EUFS_PMAP_CNT_SHIFT_PER_CACHELINE 6 + +static __always_inline void eufs_alloc_batch_hint(struct alloc_batch *pb, + ssize_t size); +static __always_inline void +eufs_alloc_batch_persist_reset(struct super_block *sb, struct alloc_batch *pb); +static __always_inline void eufs_alloc_batch_init(struct alloc_batch *pb, + ssize_t size) +{ + pb->n_used = 0; + pb->batch = NULL; + pb->size = 0; + pb->n_pending = 0; + eufs_alloc_batch_hint(pb, size); + BUG_ON(!pb->batch); +} + +/* This gives only hints, no guarantees. */ +static __always_inline void eufs_alloc_batch_hint(struct alloc_batch *pb, + ssize_t size) +{ + ssize_t realsize; + void **batch; + + realsize = round_up(size * sizeof(void *), PAGE_SIZE); + if (realsize > KMALLOC_MAX_SIZE) + realsize = KMALLOC_MAX_SIZE; + size = realsize / sizeof(void *); + + if (pb->size >= size) + return; + batch = krealloc(pb->batch, realsize, GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(batch == NULL); + pb->batch = batch; + pb->size = size; + eufs_dbg("! 
eufs_alloc_batch_hint ; ab=%px size=%ld\n", pb, size); +} + +static __always_inline void eufs_alloc_batch_hint_off(struct alloc_batch *pb, + ssize_t off_size) +{ + eufs_alloc_batch_hint(pb, pb->size + pb->n_pending + off_size); +} + +static __always_inline void eufs_alloc_batch_fini(struct alloc_batch *pb) +{ + kfree(pb->batch); + pb->batch = NULL; + pb->size = pb->n_used = 0; +} +/* Add an already allocated address */ +static __always_inline void eufs_alloc_batch_add(struct super_block *sb, + struct alloc_batch *pb, + void *page) +{ + if (pb->n_used == pb->size) { + /* Enlarge */ + if (pb->size == EUFS_AB_MAX_SIZE) + eufs_alloc_batch_persist_reset(sb, pb); + else + eufs_alloc_batch_hint(pb, pb->size * 2); + BUG_ON(pb->n_used >= pb->size); + } + BUG_ON(pb->n_used >= pb->size); + pb->batch[pb->n_used] = page; + pb->n_used++; +} + +/* + * With the following four functions, alloc_batch can be used as a pool of + * preallocation. + */ +static __always_inline int +eufs_alloc_batch_pre_allocate_begin(struct super_block *sb, + struct alloc_batch *ab, size_t need_blocks) +{ + long r; + BUG_ON(ab->n_pending); + eufs_alloc_batch_hint_off(ab, need_blocks); + ab->n_pending = need_blocks; + r = nvmalloc_pre(sb, ab, need_blocks, PAGE_SIZE); + if (r) + ab->n_pending = 0; + return r; +} +static __always_inline void +eufs_alloc_batch_pre_allocate_end(struct super_block *sb, + struct alloc_batch *ab) +{ + WARN((ab->n_pending != 0), + "Some pre-allocated pages are not used in %px!\n", ab); + BUG_ON(!list_empty(&ab->list)); +} + +/* Allocate from the pre-allocated addresses */ +static __always_inline void *eufs_alloc_batch_allocate(struct super_block *sb, + struct alloc_batch *ab, + u8 tag) +{ + void *page = NULL; + /* used up */ + BUG_ON(ab->n_pending <= 0); + page = nvmalloc_pre_get_from_list(sb, &ab->list, tag); + BUG_ON(!page); + ab->n_pending--; + eufs_alloc_batch_add(sb, ab, page); + return page; +} +static __always_inline void * +eufs_alloc_batch_allocate_file_index(struct 
super_block *sb, + struct alloc_batch *ab) +{ + return eufs_alloc_batch_allocate(sb, ab, EUFS_PAGE_FILE_INDEX); +} +static __always_inline void * +eufs_alloc_batch_allocate_file_data(struct super_block *sb, + struct alloc_batch *ab) +{ + return eufs_alloc_batch_allocate(sb, ab, EUFS_PAGE_FILE_DATA); +} + +static int cmp_func(const void *a, const void *b) +{ + const void **_a = (const void **)a; + const void **_b = (const void **)b; + + if (*_a > *_b) + return 1; + if (*_a < *_b) + return -1; + return 0; +} +#define _PAGE_NO(ptr) (((u64)ptr - (u64)sbi->data_start) / PAGE_SIZE) +#define _LINE_MAP(addr) ((line_info_t *)((u64)(addr)&PAGE_MASK)) +#define _IS_LINE(addr) ((u64)addr % PAGE_SIZE) +static __always_inline void _set_bitmap(struct eufs_sb_info *sbi, u64 addr, + bool forced) +{ + u64 page_no = _PAGE_NO(addr); + u64 rem = addr % PAGE_SIZE; + line_info_t __pmem *line_map; + /* no one can free this address now, so no race will happen */ + struct ptr_list_node *node; + int line_no; + + if (rem == 0) { + /* page */ + node = sbi->cached_nodes + (page_no); + if (!forced) { + BUG_ON(node->solid); + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE); + } + WARN(node->tag == 0, + "unexpected page node tag %u (addr 0x%llx)\n", node->tag, + addr); + sbi->page_map[page_no] = node->tag; + node->solid = true; + } else { + /* line */ + BUG_ON(rem % CACHELINE_SIZE != 0); + + line_map = (void *)(addr - rem); + line_no = rem / CACHELINE_SIZE; + + BUG_ON(sbi->page_map[page_no] != EUFS_PAGE_FREE && + sbi->page_map[page_no] != EUFS_PAGE_LINE_USED); + /* \ _set _unset + * _set idempotent + * _unset + */ + if (sbi->page_map[page_no] == EUFS_PAGE_FREE) { + /* idempotent */ + sbi->page_map[page_no] = EUFS_PAGE_LINE_USED; + node = sbi->cached_nodes + (page_no); + BUG_ON(!node->busy); + node->solid = true; + } + + node = &sbi->line_node_ptrs[page_no][line_no]; + if (!forced) { + BUG_ON(node->solid); + if (line_map[line_no]) { + eufs_info( + "!line_map[line_no] = %px[%d] = %d\n", + line_map, 
line_no, line_map[line_no]); + BUG(); + } + BUG_ON(line_map[line_no]); + } + WARN(node->tag == 0, + "unexpected line node tag %u (addr 0x%llx)\n", node->tag, + addr); + line_map[line_no] = node->tag; + eufs_dbg("set %px[%d] = %d forced=%d\n", line_map, line_no, + line_map[line_no], forced); + node->solid = true; + BUG_ON(!node->busy); + } +} + +static __always_inline void +eufs_alloc_batch_persist_reset(struct super_block *sb, struct alloc_batch *pb) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 page_no, page_no0; + int i; + + if (pb->n_used == 0) + goto reset; + if (pb->size == 0) + goto reset; + + BUG_ON(!pb->batch); + + sort(pb->batch, pb->n_used, sizeof(void *), cmp_func, NULL); + + for (i = 0; i < pb->n_used; ++i) { + if (i > 0 && pb->batch[i] == pb->batch[i - 1]) { + pr_info("!pb->batch[i]=%px [i-1]=%px i=%d\n", + pb->batch[i], pb->batch[i - 1], i); + BUG(); + } + _set_bitmap(sbi, (u64)pb->batch[i], false); + } + + page_no0 = _PAGE_NO(pb->batch[0]); + if (_IS_LINE(pb->batch[0])) + eufs_flush_cacheline(_LINE_MAP(pb->batch[0])); + eufs_flush_cacheline(&sbi->page_map[page_no0]); + + for (i = 1; i < pb->n_used; ++i) { + page_no = _PAGE_NO(pb->batch[i]); + if (page_no == page_no0) + /* same page, must be allocation of two cache lines */ + continue; + + /* different page */ + if (_IS_LINE(pb->batch[i])) + eufs_flush_cacheline(_LINE_MAP(pb->batch[i])); + + /* not in a single cache line */ + if ((page_no >> EUFS_PMAP_CNT_SHIFT_PER_CACHELINE) != + (page_no0 >> EUFS_PMAP_CNT_SHIFT_PER_CACHELINE)) + eufs_flush_cacheline(&sbi->page_map[page_no]); + page_no0 = page_no; + } + + eufs_dbg("!persistallocation: pb=%px sorted %px~%px %ld\n", pb, + pb->batch[0], pb->batch[pb->n_used - 1], pb->n_used); +reset: + pb->n_used = 0; +} + +static __always_inline void eufs_alloc_persist(struct super_block *sb, + void *ptr, bool forced) +{ + struct eufs_sb_info *sbi = EUFS_SB(sb); + u64 page_no = _PAGE_NO(ptr); + + _set_bitmap(sbi, (u64)ptr, forced); + + if (_IS_LINE(ptr)) + 
eufs_flush_cacheline(_LINE_MAP(ptr)); + + eufs_flush_cacheline(&sbi->page_map[page_no]); +} + +#undef _PAGE_NO +#undef _LINE_MAP +#undef _IS_LINE + +#endif /* EUFS_PBATCH_H */